Fix optim docs
diff --git a/torch/optim/__init__.py b/torch/optim/__init__.py
index 8f29caf..9c35d0e 100644
--- a/torch/optim/__init__.py
+++ b/torch/optim/__init__.py
@@ -1,114 +1,119 @@
 """
 :mod:`torch.optim` is a package for optimizing neural networks.
-It provides  a wide variety of optimization methods such as SGD, Adam etc.
+It provides a wide variety of optimization methods such as SGD, Adam, etc.
 
 Currently, the following optimization methods are supported, typically with
 options such as weight decay and other bells and whistles.
 
-- SGD          `(params, lr=required, momentum=0, dampening=0)`
-- AdaDelta     `(params, rho=0.9, eps=1e-6, weight_decay=0)`
-- Adagrad      `(params, lr=1e-2, lr_decay=0, weight_decay=0)`
-- Adam         `(params, lr=1e-2, betas=(0.9, 0.999), epsilon=1e-8, weight_decay=0)`
-- AdaMax       `(params, lr=1e-2, betas=(0.9, 0.999), eps=1e-38, weight_decay=0)`
-- Averaged SGD `(params, lr=1e-2, lambd=1e-4, alpha=0.75, t0=1e6, weight_decay=0)`
-- RProp        `(params, lr=1e-2, etas=(0.5, 1.2), step_sizes=(1e-6, 50))`
-- RMSProp      `(params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0)`
+- SGD
+- AdaDelta
+- Adagrad
+- Adam
+- AdaMax
+- Averaged SGD
+- RProp
+- RMSProp
 
 
-The usage of the Optim package itself is as follows.
+The usage of the ``torch.optim`` package is as follows.
 
 1. Construct an optimizer
-2. Use `optimizer.step(...)` to optimize.
-   - Call `optimizer.zero_grad()` to zero out the gradient buffers when appropriate
+2. Use ``optimizer.step(...)`` to optimize.
+   - Call ``optimizer.zero_grad()`` to zero out the gradient buffers when appropriate
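+
+A minimal sketch of those two steps (here ``model``, ``loss_fn``, ``input`` and
+``target`` are placeholders, not names defined by this package)::
+
+    optimizer = optim.SGD(model.parameters(), lr=0.01)  # 1. construct
+    optimizer.zero_grad()                 # clear stale gradient buffers
+    loss = loss_fn(model(input), target)
+    loss.backward()                       # populate the gradients
+    optimizer.step()                      # 2. update the parameters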
 
-## 1. Constructing the optimizer
+Constructing the optimizer
+--------------------------
 
-One first constructs an `Optimizer` object by giving it a list of parameters
-to optimize, as well as the optimizer options,such as learning rate, weight decay, etc.
+One first constructs an ``Optimizer`` object by giving it a list of parameters
+to optimize, as well as optimizer options such as learning rate, weight decay, etc.
 
-Examples:
+Examples::
 
-`optimizer = optim.SGD(model.parameters(), lr = 0.01, momentum=0.9)`
+    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
+    optimizer = optim.Adam([var1, var2], lr=0.0001)
 
-`optimizer = optim.Adam([var1, var2], lr = 0.0001)`
-
-### Per-parameter options
+Per-parameter options
+---------------------
 
-In a more advanced usage, one can specify per-layer options by passing each parameter group along with it's custom options.
+In a more advanced usage, one can specify per-layer options by passing each parameter group along with its custom options.
 
-**__Any parameter group that does not have an attribute defined will use the default attributes.__**
+**Any parameter group that does not have an attribute defined will use the default attributes.**
 
-This is very useful when one wants to specify per-layer learning rates for example.
+This is very useful when one wants to specify per-layer learning rates, for example.
 
-Example:
+For example, this invocation::
 
-`optim.SGD([{'params': model1.parameters()}, {'params': model2.parameters(), 'lr': 1e-3}, lr=1e-2, momentum=0.9)`
+    optim.SGD([
+        {'params': model1.parameters()},
+        {'params': model2.parameters(), 'lr': 1e-3}],
+        lr=1e-2, momentum=0.9)
 
-`model1`'s parameters will use the default learning rate of `1e-2` and momentum of `0.9`
-`model2`'s parameters will use a learning rate of `1e-3`, and the default momentum of `0.9`
+means that
+
+* ``model1``'s parameters will use the default learning rate of ``1e-2`` and momentum of ``0.9``
+* ``model2``'s parameters will use a learning rate of ``1e-3``, and the default momentum of ``0.9``
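+
+One way to verify the resulting settings (a sketch; it assumes the optimizer
+exposes its groups via the ``param_groups`` attribute)::
+
+    for i, group in enumerate(optimizer.param_groups):
+        # expect lr 1e-2 for group 0 and 1e-3 for group 1, momentum 0.9 for both
+        print(i, group['lr'], group['momentum'])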
 
-Then, you can use the optimizer by calling `optimizer.zero_grad()` and `optimizer.step(...)`. Read the next sections.
+Then, you can use the optimizer by calling ``optimizer.zero_grad()`` and ``optimizer.step(...)``, as described in the next sections.
 
-## 2. Taking an optimization step using `Optimizer.step(...)`
+Taking an optimization step using ``step``
+------------------------------------------
 
-The step function has the following two signatures:
+The ``step`` function can be used in two ways, as described below.
+
+``optimizer.step()``
+^^^^^^^^^^^^^^^^^^^^
 
-### a. `Optimizer.step(closure)`
+This is a simplified version supported by most, but not all, optimizers.
 
-The `step` function takes a user-defined closure that computes f(x) and returns the loss.
+The function can be called after computing the gradients with ``backward()``.
 
-The closure needs to do the following:
-- Optimizer.zero_grad()
-- Compute the loss
-- Call loss.backward()
-- return the loss
+Example 1 - training a neural network::
 
-Example 1: training a neural network
+    net = MNISTNet()
+    criterion = ClassNLLLoss()
+    optimizer = optim.SGD(net.parameters(), lr=0.001)
 
-```python
-# Example 1: training a neural network with optimizer.step(closure)
-net = MNISTNet()
-criterion = ClassNLLLoss()
-optimizer = optim.SGD(net.parameters(), lr=0.001)
+    for data in data_batches:
+        input, target = data
+        optimizer.zero_grad()
+        output = net(input)
+        loss = criterion(output, target)
+        loss.backward()
+        optimizer.step()
 
-for data in data_batches:
-    input, target = data
-	def closure():
-	    optimizer.zero_grad()
-	    output = net(input)
-		loss = criterion(output, target)
-		loss.backward()
-		return loss
-	optimizer.step(closure)
-```
 
-Notes: Why is this required? Why cant we simply have the optimizer take the parameters and grads?
-       Some optimization algorithms such as Conjugate Gradient and LBFGS need to evaluate their function
-	   multiple times. For such optimization methods, the function (i.e. the closure) has to be defined.
+``optimizer.step(closure)``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+The ``step`` function takes a user-defined closure that computes the loss ``f(x)`` and returns it.
 
-### b. `Optimizer.step()`
+The closure should look roughly like this::
 
-This is a simplified usage that supports most, but not all optimization algorithms. For example, it does not support LBFGS or Conjugate Gradient.
+    def closure():
+        optimizer.zero_grad()
+        loss = f(x)  # f and x are captured from the enclosing scope
+        loss.backward()
+        return loss
 
-The usage for this is to simply call the function after the backward() is called on your model.
+Example 2 - training a neural network::
 
-Example 2: training a neural network
+    net = MNISTNet()
+    criterion = ClassNLLLoss()
+    optimizer = optim.SGD(net.parameters(), lr=0.001)
 
-```python
-# Example 2: training a neural network with optimizer.step()
-net = MNISTNet()
-criterion = ClassNLLLoss()
-optimizer = optim.SGD(net.parameters(), lr=0.001)
+    for data in data_batches:
+        input, target = data
+        def closure():
+            optimizer.zero_grad()
+            output = net(input)
+            loss = criterion(output, target)
+            loss.backward()
+            return loss
+        optimizer.step(closure)
 
-for data in data_batches:
-    input, target = data
-	optimizer.zero_grad()
-	output = net(input)
-	loss = criterion(output, target)
-	loss.backward()
-	optimizer.step()
-```
+Note:
+    **Why is this supported?**
+    Some optimization algorithms, such as Conjugate Gradient and LBFGS, need to reevaluate the
+    function multiple times per step. For such optimization methods, the loss computation has to
+    be passed in as a closure.
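+
+    A toy illustration of that requirement (hypothetical ``toy_step``; no such
+    optimizer exists in this package)::
+
+        def toy_step(closure, evals=3):
+            # an LBFGS-style algorithm may call the closure several times,
+            # recomputing the loss and gradients at each trial point
+            for _ in range(evals):
+                loss = closure()
+            return loss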
 """
 
 from .adadelta import Adadelta
diff --git a/torch/optim/sgd.py b/torch/optim/sgd.py
index 9d7e380..3d57333 100644
--- a/torch/optim/sgd.py
+++ b/torch/optim/sgd.py
@@ -2,13 +2,13 @@
 
 
 class SGD(Optimizer):
-    """Implements stochastic gradient descent with optional momentum.
+    """Implements stochastic gradient descent (optionally with momentum).
 
     Args:
-        params: (sequence) parameters to optimize
-        lr: (float) learning rate
-        momentum: (float) momentum factor (default: 0)
-        weight_decay: (float) weight decay (L2 penalty) (default: 0)
+        params (sequence): parameters to optimize
+        lr (float): learning rate
+        momentum (float): momentum factor (default: 0)
+        weight_decay (float): weight decay (L2 penalty) (default: 0)
     Example:
         >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
         >>> def closure():