Add throughput information to resnet50_trainer

Summary:
Time each call to workspace.RunNet and log the per-iteration throughput
(total_batch_size / elapsed seconds) as images/sec. Also log once at the
start of each epoch instead of logging at the start of every iteration.

Makes throughput debugging easier.
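For reference, the measurement is plain wall-clock timing around the net
execution; a minimal standalone sketch of the same pattern, where
run_iteration and iteration_throughput are hypothetical stand-ins for
workspace.RunNet and the inline computation in the trainer:

    import time

    def iteration_throughput(run_iteration, batch_size):
        # Time a single training iteration and convert to images/sec.
        t1 = time.time()
        run_iteration()
        dt = time.time() - t1
        return batch_size / dt

    # e.g. iteration_throughput(lambda: workspace.RunNet(name), total_batch_size)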

Differential Revision: D4879634

fbshipit-source-id: 8d479d51b0ec51ad3d86ad5500fc3095400cf095
diff --git a/caffe2/python/examples/resnet50_trainer.py b/caffe2/python/examples/resnet50_trainer.py
index fe2eb3f..cdbc118 100644
--- a/caffe2/python/examples/resnet50_trainer.py
+++ b/caffe2/python/examples/resnet50_trainer.py
@@ -8,6 +8,7 @@
 import argparse
 import logging
 import numpy as np
+import time
 
 from caffe2.python import core, workspace, experiment_util, data_parallel_model, dyndep
 from caffe2.python import timeout_guard, cnn
@@ -96,17 +97,21 @@
     TODO: add checkpointing here.
     '''
     # TODO: add loading from checkpoint
+    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
     epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
     for i in range(epoch_iters):
-        log.info("Start iteration {}/{} of epoch {}".format(
-            i, epoch_iters, epoch,
-        ))
-
         # This timeout is required (temporarily) since CUDA-NCCL
         # operators might deadlock when synchronizing between GPUs.
         timeout = 600.0 if i == 0 else 60.0
         with timeout_guard.CompleteInTimeOrDie(timeout):
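+            # Wall-clock the net execution to report per-iteration images/sec.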
+            t1 = time.time()
             workspace.RunNet(train_model.net.Proto().name)
+            t2 = time.time()
+            dt = t2 - t1
+
+        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
+        log.info(fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
 
     num_images = epoch * epoch_iters * total_batch_size
     prefix = "gpu_{}".format(train_model._devices[0])