tensorflow/python/keras/mixed_precision/experimental/loss_scale_benchmark.py - platform/external/tensorflow - Git at Google

 # Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """Benchmarks for LossScaleOptimizer and LossScaleGradientTape."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

 import time

 from tensorflow.python.client import session as session_module
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.distribute import mirrored_strategy
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.framework import ops
 from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer
 from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training.experimental import loss_scale as loss_scale_module
 from tensorflow.python.training.experimental import loss_scaling_gradient_tape as lsgt_module


 def _get_strategy(num_gpus):
   """Picks the distribution strategy a benchmark should run under.

   Args:
     num_gpus: The number of GPUs the benchmark is meant to use.

   Returns:
     A MirroredStrategy over the devices '/GPU:0' through
     '/GPU:<num_gpus - 1>' when more than one GPU is requested. Otherwise the
     result of distribution_strategy_context.get_strategy(), i.e. the default
     strategy.
   """
   if num_gpus > 1:
     gpu_devices = ['/GPU:%d' % gpu_index for gpu_index in range(num_gpus)]
     return mirrored_strategy.MirroredStrategy(gpu_devices)
   # Zero or one GPU: nothing to mirror across, so use the default strategy.
   return distribution_strategy_context.get_strategy()


 class LossScaleBenchmark(test.Benchmark):
   """Benchmark for loss scaling.

   Every benchmark trains a trivial model (the sum of a list of scalar
   variables) so that the reported step time reflects the cost of loss scaling
   rather than the cost of the model itself.
   """

   def _benchmark(self, gradient_type, num_gpus, mode, loss_scaling):
     """Benchmarks loss scaling.

     We run a simple model with several scalar variables. The loss is the sum of
     all variables. The model is simple because we want to measure only the
     performance of loss scaling, not the performance of the model itself.

     The result is reported under the name
     '<gradient_type>_<num_gpus>_GPU_<mode>_<loss_scaling>', where a
     `loss_scaling` of None is spelled 'no_loss_scaling'.

     Args:
       gradient_type: "optimizer" or "gradient_tape". How gradients are computed.
         "optimizer" uses Optimizer.minimize. "gradient_tape" uses
         GradientTape.gradient.
       num_gpus: The number of GPUs to use. Must be at least 1.
       mode: "eager", "tf_function", or "graph". "eager" means to use eager mode.
         "tf_function" means to use eager mode where all computations are wrapped
         in a tf.function. "graph" means to use TensorFlow 1's graph mode with a
         tf.compat.v1.Session. "graph" is unsupported with a
         LossScaleGradientTape.
       loss_scaling: "fixed", "dynamic", or None. The type of loss scaling to
         use. None means use no loss scaling, which is useful as a baseline to
         see how much slower loss scaling is in comparison.

     Raises:
       ValueError: If `gradient_type`, `mode` or `loss_scaling` is not one of
         the values listed above.
     """
     # Reject bad arguments before any graph, strategy or variable is created.
     # Real exceptions are used instead of `assert` because asserts are stripped
     # under `python -O`; a misspelled value would then silently run a different
     # configuration than the one the benchmark name claims.
     if gradient_type not in ('optimizer', 'gradient_tape'):
       raise ValueError(
           'gradient_type must be "optimizer" or "gradient_tape", got: %r' %
           (gradient_type,))
     if mode not in ('eager', 'tf_function', 'graph'):
       raise ValueError(
           'mode must be "eager", "tf_function" or "graph", got: %r' % (mode,))
     if loss_scaling not in (None, 'fixed', 'dynamic'):
       raise ValueError(
           'loss_scaling must be "fixed", "dynamic" or None, got: %r' %
           (loss_scaling,))

     if mode == 'graph':
       ctx_mgr = ops.Graph().as_default()
     else:
       # 'eager' and 'tf_function' both execute eagerly; 'tf_function'
       # additionally wraps the training step in a tf.function further down.
       ctx_mgr = context.eager_mode()
     name = '%s_%d_GPU_%s_%s' % (gradient_type, num_gpus, mode,
                                 loss_scaling or 'no_loss_scaling')

     with ctx_mgr, _get_strategy(num_gpus).scope() as strategy:
       opt = adam.Adam()
       if loss_scaling == 'fixed':
         loss_scale = loss_scale_module.FixedLossScale(2.)
       elif loss_scaling == 'dynamic':
         # Make increment_period so high that it's effectively infinite. This
         # means the loss scale will never change. Any performance overhead
         # from increasing/decreasing the loss scale is typically negligible
         # since it happens infrequently, so we only benchmark the common case
         # of the loss scale not changing.
         loss_scale = loss_scale_module.DynamicLossScale(
             initial_loss_scale=2., increment_period=1000000)
       else:
         loss_scale = None  # Baseline run without any loss scaling.

       num_vars = 200
       num_warmup_iters = 1
       num_iters = 20
       # By using scalar variables, we reduce overhead of the actual GPU work of
       # multiplying variables, dividing gradients, and checking gradients for
       # NaNs. Measuring these overheads isn't very useful as there is little we
       # can do to reduce them (one such way would be to fuse dividing gradients
       # and checking them for NaNs). We still have all other overheads, such as
       # all-reducing the `is_finite` values and having a tf.cond or
       # tf.while_loop based on whether gradients are NaNs. Currently, these
       # other overheads are much more significant than the GPU work.
       var_list = [
           variables.Variable(i, dtype='float32') for i in range(num_vars)]

       def get_loss():
         return math_ops.add_n(var_list)

       if gradient_type == 'gradient_tape':
         # Use a plain GradientTape for the baseline and a LossScaleGradientTape
         # bound to `loss_scale` otherwise. Either way `tape_cls()` yields a
         # fresh tape for each training step.
         tape_cls = ((lambda: lsgt_module.LossScaleGradientTape(loss_scale))
                     if loss_scale is not None else backprop.GradientTape)

         def minimize_fn():
           with tape_cls() as tape:
             loss = get_loss()
           grads = tape.gradient(loss, var_list)
           return opt.apply_gradients(zip(grads, var_list))
       else:
         # gradient_type == 'optimizer': loss scaling, if any, is applied by
         # wrapping the optimizer.
         if loss_scale is not None:
           opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)

         def minimize_fn():
           return opt.minimize(get_loss, var_list)

       if mode == 'graph':
         # Build the training op once, then let run_op_benchmark time repeated
         # executions of it in a session and report the result.
         run_op = strategy.experimental_run_v2(minimize_fn)
         init_op = variables.global_variables_initializer()
         with session_module.Session() as sess:
           sess.run(init_op)
           self.run_op_benchmark(sess, run_op, min_iters=num_iters,
                                 burn_iters=num_warmup_iters, name=name)
         return

       def run_fn():
         strategy.experimental_run_v2(minimize_fn)
       if mode == 'tf_function':
         run_fn = def_function.function(run_fn)

       # Warm-up iterations are run but deliberately left out of the timing.
       for _ in range(num_warmup_iters):
         run_fn()

       start = time.time()
       for _ in range(num_iters):
         run_fn()
       end = time.time()
       # Report the mean wall time of a single training step.
       self.report_benchmark(iters=num_iters,
                             wall_time=(end - start) / num_iters, name=name)

   def _gpus_to_test_with(self):
     """Returns the GPU counts to benchmark with on this machine.

     Returns:
       The values among 1, 2 and 8 that do not exceed context.num_gpus(), in
       increasing order. The list is empty when no GPU is reported.
     """
     num_gpus = context.num_gpus()
     return [count for count in (1, 2, 8) if num_gpus >= count]

   def benchmark_optimizer(self):
     """Benchmarks Optimizer.minimize for every mode and loss scaling type."""
     for num_gpus in self._gpus_to_test_with():
       for mode in ('eager', 'tf_function', 'graph'):
         for loss_scaling in (None, 'fixed', 'dynamic'):
           self._benchmark('optimizer', num_gpus, mode, loss_scaling)

   def benchmark_gradient_tape(self):
     """Benchmarks gradient tapes for every eager mode and loss scaling type."""
     for num_gpus in self._gpus_to_test_with():
       # LossScaleGradientTape doesn't support graph mode
       for mode in ('eager', 'tf_function'):
         for loss_scaling in (None, 'fixed', 'dynamic'):
           self._benchmark('gradient_tape', num_gpus, mode, loss_scaling)


 # When executed as a script, hand control to TensorFlow's test entry point.
 # NOTE(review): benchmarks are presumably selected with a command-line flag
 # handled by test.main() -- confirm against the tf.test.Benchmark docs.
 if __name__ == '__main__':
   test.main()
	# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ==============================================================================
	"""Benchmarks for LossScaleOptimizer and LossScaleGradientTape."""
	from __future__ import absolute_import
	from __future__ import division
	from __future__ import print_function

	import time

	from tensorflow.python.client import session as session_module
	from tensorflow.python.distribute import distribution_strategy_context
	from tensorflow.python.distribute import mirrored_strategy
	from tensorflow.python.eager import backprop
	from tensorflow.python.eager import context
	from tensorflow.python.eager import def_function
	from tensorflow.python.framework import ops
	from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer
	from tensorflow.python.keras.optimizer_v2 import adam
	from tensorflow.python.ops import math_ops
	from tensorflow.python.ops import variables
	from tensorflow.python.platform import test
	from tensorflow.python.training.experimental import loss_scale as loss_scale_module
	from tensorflow.python.training.experimental import loss_scaling_gradient_tape as lsgt_module


	def _get_strategy(num_gpus):
	  """Picks the distribution strategy a benchmark should run under.

	  Args:
	    num_gpus: The number of GPUs the benchmark is meant to use.

	  Returns:
	    A MirroredStrategy over the devices '/GPU:0' through
	    '/GPU:<num_gpus - 1>' when more than one GPU is requested. Otherwise the
	    result of distribution_strategy_context.get_strategy(), i.e. the default
	    strategy.
	  """
	  if num_gpus > 1:
	    gpu_devices = ['/GPU:%d' % gpu_index for gpu_index in range(num_gpus)]
	    return mirrored_strategy.MirroredStrategy(gpu_devices)
	  # Zero or one GPU: nothing to mirror across, so use the default strategy.
	  return distribution_strategy_context.get_strategy()


	class LossScaleBenchmark(test.Benchmark):
	  """Benchmark for loss scaling.

	  Every benchmark trains a trivial model (the sum of a list of scalar
	  variables) so that the reported step time reflects the cost of loss scaling
	  rather than the cost of the model itself.
	  """

	  def _benchmark(self, gradient_type, num_gpus, mode, loss_scaling):
	    """Benchmarks loss scaling.

	    We run a simple model with several scalar variables. The loss is the sum of
	    all variables. The model is simple because we want to measure only the
	    performance of loss scaling, not the performance of the model itself.

	    The result is reported under the name
	    '<gradient_type>_<num_gpus>_GPU_<mode>_<loss_scaling>', where a
	    `loss_scaling` of None is spelled 'no_loss_scaling'.

	    Args:
	      gradient_type: "optimizer" or "gradient_tape". How gradients are computed.
	        "optimizer" uses Optimizer.minimize. "gradient_tape" uses
	        GradientTape.gradient.
	      num_gpus: The number of GPUs to use. Must be at least 1.
	      mode: "eager", "tf_function", or "graph". "eager" means to use eager mode.
	        "tf_function" means to use eager mode where all computations are wrapped
	        in a tf.function. "graph" means to use TensorFlow 1's graph mode with a
	        tf.compat.v1.Session. "graph" is unsupported with a
	        LossScaleGradientTape.
	      loss_scaling: "fixed", "dynamic", or None. The type of loss scaling to
	        use. None means use no loss scaling, which is useful as a baseline to
	        see how much slower loss scaling is in comparison.

	    Raises:
	      ValueError: If `gradient_type`, `mode` or `loss_scaling` is not one of
	        the values listed above.
	    """
	    # Reject bad arguments before any graph, strategy or variable is created.
	    # Real exceptions are used instead of `assert` because asserts are stripped
	    # under `python -O`; a misspelled value would then silently run a different
	    # configuration than the one the benchmark name claims.
	    if gradient_type not in ('optimizer', 'gradient_tape'):
	      raise ValueError(
	          'gradient_type must be "optimizer" or "gradient_tape", got: %r' %
	          (gradient_type,))
	    if mode not in ('eager', 'tf_function', 'graph'):
	      raise ValueError(
	          'mode must be "eager", "tf_function" or "graph", got: %r' % (mode,))
	    if loss_scaling not in (None, 'fixed', 'dynamic'):
	      raise ValueError(
	          'loss_scaling must be "fixed", "dynamic" or None, got: %r' %
	          (loss_scaling,))

	    if mode == 'graph':
	      ctx_mgr = ops.Graph().as_default()
	    else:
	      # 'eager' and 'tf_function' both execute eagerly; 'tf_function'
	      # additionally wraps the training step in a tf.function further down.
	      ctx_mgr = context.eager_mode()
	    name = '%s_%d_GPU_%s_%s' % (gradient_type, num_gpus, mode,
	                                loss_scaling or 'no_loss_scaling')

	    with ctx_mgr, _get_strategy(num_gpus).scope() as strategy:
	      opt = adam.Adam()
	      if loss_scaling == 'fixed':
	        loss_scale = loss_scale_module.FixedLossScale(2.)
	      elif loss_scaling == 'dynamic':
	        # Make increment_period so high that it's effectively infinite. This
	        # means the loss scale will never change. Any performance overhead
	        # from increasing/decreasing the loss scale is typically negligible
	        # since it happens infrequently, so we only benchmark the common case
	        # of the loss scale not changing.
	        loss_scale = loss_scale_module.DynamicLossScale(
	            initial_loss_scale=2., increment_period=1000000)
	      else:
	        loss_scale = None  # Baseline run without any loss scaling.

	      num_vars = 200
	      num_warmup_iters = 1
	      num_iters = 20
	      # By using scalar variables, we reduce overhead of the actual GPU work of
	      # multiplying variables, dividing gradients, and checking gradients for
	      # NaNs. Measuring these overheads isn't very useful as there is little we
	      # can do to reduce them (one such way would be to fuse dividing gradients
	      # and checking them for NaNs). We still have all other overheads, such as
	      # all-reducing the `is_finite` values and having a tf.cond or
	      # tf.while_loop based on whether gradients are NaNs. Currently, these
	      # other overheads are much more significant than the GPU work.
	      var_list = [
	          variables.Variable(i, dtype='float32') for i in range(num_vars)]

	      def get_loss():
	        return math_ops.add_n(var_list)

	      if gradient_type == 'gradient_tape':
	        # Use a plain GradientTape for the baseline and a LossScaleGradientTape
	        # bound to `loss_scale` otherwise. Either way `tape_cls()` yields a
	        # fresh tape for each training step.
	        tape_cls = ((lambda: lsgt_module.LossScaleGradientTape(loss_scale))
	                    if loss_scale is not None else backprop.GradientTape)

	        def minimize_fn():
	          with tape_cls() as tape:
	            loss = get_loss()
	          grads = tape.gradient(loss, var_list)
	          return opt.apply_gradients(zip(grads, var_list))
	      else:
	        # gradient_type == 'optimizer': loss scaling, if any, is applied by
	        # wrapping the optimizer.
	        if loss_scale is not None:
	          opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)

	        def minimize_fn():
	          return opt.minimize(get_loss, var_list)

	      if mode == 'graph':
	        # Build the training op once, then let run_op_benchmark time repeated
	        # executions of it in a session and report the result.
	        run_op = strategy.experimental_run_v2(minimize_fn)
	        init_op = variables.global_variables_initializer()
	        with session_module.Session() as sess:
	          sess.run(init_op)
	          self.run_op_benchmark(sess, run_op, min_iters=num_iters,
	                                burn_iters=num_warmup_iters, name=name)
	        return

	      def run_fn():
	        strategy.experimental_run_v2(minimize_fn)
	      if mode == 'tf_function':
	        run_fn = def_function.function(run_fn)

	      # Warm-up iterations are run but deliberately left out of the timing.
	      for _ in range(num_warmup_iters):
	        run_fn()

	      start = time.time()
	      for _ in range(num_iters):
	        run_fn()
	      end = time.time()
	      # Report the mean wall time of a single training step.
	      self.report_benchmark(iters=num_iters,
	                            wall_time=(end - start) / num_iters, name=name)

	  def _gpus_to_test_with(self):
	    """Returns the GPU counts to benchmark with on this machine.

	    Returns:
	      The values among 1, 2 and 8 that do not exceed context.num_gpus(), in
	      increasing order. The list is empty when no GPU is reported.
	    """
	    num_gpus = context.num_gpus()
	    return [count for count in (1, 2, 8) if num_gpus >= count]

	  def benchmark_optimizer(self):
	    """Benchmarks Optimizer.minimize for every mode and loss scaling type."""
	    for num_gpus in self._gpus_to_test_with():
	      for mode in ('eager', 'tf_function', 'graph'):
	        for loss_scaling in (None, 'fixed', 'dynamic'):
	          self._benchmark('optimizer', num_gpus, mode, loss_scaling)

	  def benchmark_gradient_tape(self):
	    """Benchmarks gradient tapes for every eager mode and loss scaling type."""
	    for num_gpus in self._gpus_to_test_with():
	      # LossScaleGradientTape doesn't support graph mode
	      for mode in ('eager', 'tf_function'):
	        for loss_scaling in (None, 'fixed', 'dynamic'):
	          self._benchmark('gradient_tape', num_gpus, mode, loss_scaling)


	# When executed as a script, hand control to TensorFlow's test entry point.
	if __name__ == '__main__':
	  test.main()