Expand autograd profiler docs (#3621)

diff --git a/docs/source/autograd.rst b/docs/source/autograd.rst
index 14fae97..032dd42 100644
--- a/docs/source/autograd.rst
+++ b/docs/source/autograd.rst
@@ -56,6 +56,12 @@
 Profiler
 --------
 
+Autograd includes a profiler that lets you inspect the cost of different
+operators inside your model - both on the CPU and GPU. There are two modes
+implemented at the moment - CPU-only using :class:`~torch.autograd.profiler.profile`,
+and nvprof-based (records both CPU and GPU activity) using
+:class:`~torch.autograd.profiler.emit_nvtx`.
+
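+A minimal usage sketch of the CPU-only mode (the workload is a placeholder; on
+newer versions a plain tensor with ``requires_grad=True`` can replace the
+``Variable``)::
+
+    import torch
+    from torch.autograd import Variable
+
+    x = Variable(torch.randn(8, 8), requires_grad=True)
+    with torch.autograd.profiler.profile() as prof:
+        y = (x * x).sum()
+        y.backward()
+    print(prof.table(sort_by='cpu_time_total'))
+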
 .. autoclass:: torch.autograd.profiler.profile
     :members:
 
diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
index 5f1552a..5afb137 100644
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -29,6 +29,17 @@
         return self.table()
 
     def table(self, sort_by=None):
+        """Prints an EventList as a nicely formatted table.
+
+        Arguments:
+            sort_by (str, optional): Attribute used to sort entries. By default
+                they are printed in the same order as they were registered.
+                Valid keys include: ``cpu_time``, ``cuda_time``, ``cpu_time_total``,
+                ``cuda_time_total``, ``count``.
+
+        Returns:
+            A string containing the table.
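+
+        Example:
+            A minimal sketch, assuming ``prof`` is a finished
+            :class:`~torch.autograd.profiler.profile` instance::
+
+                print(prof.table(sort_by='cuda_time_total'))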
+        """
         return build_table(self, sort_by)
 
     def export_chrome_trace(self, path):
@@ -143,6 +154,12 @@
             return '<unfinished torch.autograd.profile>'
         return str(self.function_events)
 
+    def table(self, sort_by=None):
+        if self.function_events is None:
+            raise RuntimeError("can't build a table of a trace that didn't finish running")
+        return self.function_events.table(sort_by)
+    table.__doc__ = EventList.table.__doc__
+
     def export_chrome_trace(self, path):
         if self.function_events is None:
             raise RuntimeError("can't export a trace that didn't finish running")
@@ -165,13 +182,19 @@
 class emit_nvtx(object):
     """Context manager that makes every autograd operation emit an NVTX range.
 
-    It is useful when running the program under nvprof. Unfortunately, there's no
-    way to force nvprof to flush the data it collected to disk, so for CUDA profiling
-    one has to use this context manager to annotate nvprof traces, and then use
-    :func:`torch.autograd.profiler.open_nvtx` to analyze the checkpoint.
+    It is useful when running the program under nvprof::
+
+        nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
+
+    Unfortunately, there's no way to force nvprof to flush the data it collected
+    to disk, so for CUDA profiling one has to use this context manager to annotate
+    nvprof traces and wait for the process to exit before inspecting them.
+    Then, either NVIDIA Visual Profiler (nvvp) can be used to visualize the timeline, or
+    :func:`torch.autograd.profiler.load_nvprof` can load the results for inspection
+    e.g. in a Python REPL.
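+
+    A minimal sketch of the workflow (``model``, ``x`` and the trace path are
+    placeholders; when running with ``--profile-from-start off``, CUDA profiling
+    also has to be started explicitly, e.g. via :mod:`torch.cuda.profiler`)::
+
+        with torch.autograd.profiler.emit_nvtx():
+            model(x).sum().backward()
+
+        # once nvprof has written trace_name.prof, load it for inspection:
+        prof = torch.autograd.profiler.load_nvprof('trace_name.prof')
+        print(prof)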
 
     .. warning:
-        This context managers should not be called recursively, i.e. at most one
+        This context manager should not be called recursively, i.e. at most one
         instance should be enabled at any given time.
 
     Arguments: