| # Copyright (c) Facebook, Inc. and its affiliates. |
| # All rights reserved. |
| # |
| # This source code is licensed under the BSD-style license found in the |
| # LICENSE file in the root directory of this source tree. |
| |
| """ |
| Global flags for aot autograd |
| """ |
| import os |
| import sys |
| from typing import TYPE_CHECKING |
| |
| |
| # Converts torch RNG ops to their functional Philox RNG equivalents. Note that |
| # we only functionalize CUDA RNG ops today. |
| functionalize_rng_ops = False |
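| # Example (a minimal sketch; assumes this module is importable as |
| # torch._functorch.config and uses the `patch` helper installed by |
| # install_config_module at the bottom of this file): |
| # |
| #     import torch |
| #     import torch._functorch.config as functorch_config |
| # |
| #     def fn(x): |
| #         return torch.nn.functional.dropout(x, p=0.5) |
| # |
| #     x = torch.randn(8, 8, device="cuda")  # only CUDA RNG ops are functionalized |
| #     with functorch_config.patch(functionalize_rng_ops=True): |
| #         out = torch.compile(fn)(x) |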
| |
| # can be useful for debugging if we are incorrectly creating meta fake tensors |
| fake_tensor_allow_meta = os.environ.get("FAKE_ALLOW_META", "1") != "0" |
| |
| # Enables optional asserts in hotpath code to check for errors. If |
| # you are seeing weird accuracy problems, try turning this on. |
| # This is currently off by default as it will harm tracing time, |
| # but it is on by default for aot_eager. |
| debug_assert = False |
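| # To enable the asserts for a single compilation (an illustrative sketch; `fn` |
| # and `x` are placeholders, and torch._functorch.config refers to this module): |
| # |
| #     import torch |
| #     import torch._functorch.config as functorch_config |
| # |
| #     with functorch_config.patch(debug_assert=True): |
| #         out = torch.compile(fn)(x) |
| # |
| #     # or use the aot_eager backend, which turns them on by default: |
| #     out = torch.compile(fn, backend="aot_eager")(x) |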
| |
| # Enables extra debug output from the partitioner. |
| debug_partitioner = os.environ.get("AOT_PARTITIONER_DEBUG", "0") != "0" |
| |
| # Today, if you are in a situation where there is "false aliasing" |
| # (e.g. you have a bunch of model parameters that all alias the same underlying buffer), |
| # our checks for this situation are very slow if these inputs have dynamic shapes. |
| # This config is set to ensure that there aren't too many aliased inputs in this situation, |
| # so that we error loudly instead of compiling forever. |
| # Eventually, we should make these checks faster. |
| # For now, however, you can simply turn off dynamic shapes by marking your inputs static |
| # when you run into this situation. |
| _max_aliased_inputs_with_dynamic_shapes_enabled = 5 |
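| # One way to "turn off dynamic shapes" as suggested above (a sketch; `fn` and |
| # `x` are placeholders): |
| # |
| #     import torch |
| #     import torch._dynamo |
| # |
| #     compiled = torch.compile(fn, dynamic=False) |
| #     # or mark a particular input tensor's shape as static: |
| #     torch._dynamo.mark_static(x) |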
| |
| # Assume that model parameters and buffers have static shapes when they are |
| # fakeified (i.e. do not allocate symbolic shapes for weights). |
| static_weight_shapes = True |
| |
| # Applies CSE to the graph before partitioning |
| cse = True |
| |
| |
| # Enables AOTAutogradCache, which caches the results of AOTAutograd tracing so |
| # they can be reused across runs. |
| enable_autograd_cache = os.environ.get("ENABLE_AOT_AUTOGRAD_CACHE", "0") == "1" |
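| # A sketch of enabling the cache via the environment (the variable is read when |
| # this module is imported, so it must be set before `import torch`; `fn` is a |
| # placeholder): |
| # |
| #     import os |
| #     os.environ["ENABLE_AOT_AUTOGRAD_CACHE"] = "1" |
| # |
| #     import torch |
| #     compiled = torch.compile(fn) |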
| |
| # When AOTAutograd regenerates aliased graph outputs, |
| # attempt to use functionalization's view-replay logic |
| # before falling back to the autograd engine's view replay or as_strided. |
| # This can have some perf implications |
| # (although for many models this will not matter). |
| # (1) If you have many view ops chained together, replaying all of them |
| # at runtime can have more overhead compared to a single as_strided call |
| # (2) If you are doing training, AsStridedBackward is quite slow, |
| # and the individual view op backward formulas will likely be faster. |
| # (3) Some backends like XLA do not support as_strided |
| |
| # Temporary hack: disable this flag for internal builds |
| # (needed to fix an internal issue while avoiding a bump of the XLA pin). |
| # Eventually we should either default this config to False entirely |
| # once the XLA pin update lands, |
| # or default it to True and fix the relevant bugs. |
| from torch._inductor.config import is_fbcode |
| |
| |
| # View replay is currently not compatible with AOTAutogradCache, since |
| # FunctionalTensors are not serializable. We'll need to make them |
| # serializable before enabling warm cache with this config turned on. |
| view_replay_for_aliased_outputs = (not is_fbcode()) and (not enable_autograd_cache) |
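| # To force view replay on for a particular compile (a sketch; `model` and `inp` |
| # are placeholders, and note the AOTAutogradCache interaction described above): |
| # |
| #     import torch |
| #     import torch._functorch.config as functorch_config |
| # |
| #     with functorch_config.patch(view_replay_for_aliased_outputs=True): |
| #         out = torch.compile(model)(inp) |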
| |
| # Restricts the amount of computation AOTAutograd can do. |
| # NB: We have essentially disabled this heuristic now. However, this is kept |
| # here for now in case it's useful. Setting it low can artificially reduce the |
| # amount of recomputation AOTAutograd performs, although not in any kind of |
| # principled way. |
| max_dist_from_bw = 1000 |
| |
| |
| # Bans recomputation of nodes that read from nodes that are far before |
| # the current node. |
| ban_recompute_used_far_apart = True |
| # Breaks up long chains of fusible ops, as otherwise we can have an arbitrarily |
| # long chain of recomputation in the backwards pass. |
| ban_recompute_long_fusible_chains = True |
| # Bans recomputation of nodes that must be materialized in the backwards pass |
| # (used by a non-fusible node) |
| ban_recompute_materialized_backward = True |
| # Chooses to ban recomputation of nodes based off an allowlist. Setting it to |
| # False changes it to use a denylist. The main difference is for operators like |
| # sort/pool that aren't cheap enough to be fusible for free but also aren't |
| # that expensive. |
| ban_recompute_not_in_allowlist = True |
| # Chooses to ban recomputation of reductions. This is generally a good idea, as |
| # the result of reductions is generally very small but recomputing reductions in |
| # a fusion can be expensive. |
| ban_recompute_reductions = True |
| # Prevents the partitioner from ever saving views (i.e. always recompute them). |
| # Generally a good idea since views are free to recompute. |
| recompute_views = False |
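| # The heuristics above can be tweaked per compile via this module's `patch` |
| # helper (a sketch; `fn`/`x` are placeholders and the perf/memory effect is |
| # model dependent): |
| # |
| #     import torch |
| #     import torch._functorch.config as functorch_config |
| # |
| #     with functorch_config.patch( |
| #         recompute_views=True, |
| #         ban_recompute_used_far_apart=False, |
| #     ): |
| #         out = torch.compile(fn)(x) |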
| |
| # By default, the partitioner is purely trying to optimize for runtime (although |
| # it should always use less memory than eager) |
| # This knob lets you control that tradeoff: the partitioner chooses the fastest |
| # strategy whose saved activations fit within the memory budget. |
| # Specifically, 0.0 corresponds to the activation memory from applying |
| # activation checkpointing to the full compiled region, and 1.0 corresponds to |
| # the activation memory from the default runtime-optimized strategy. So, 0.4 |
| # would result in a strategy that saves 40% of the activations compared to the |
| # default strategy. |
| # It solves a 0-1 knapsack to find the minimum recompute necessary to stay below |
| # the activation memory budget. |
| activation_memory_budget = 1.0 |
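| # For example, to cap saved activations at roughly half of what the default |
| # strategy would save (a sketch; `model` and `inp` are placeholders): |
| # |
| #     import torch |
| #     import torch._functorch.config as functorch_config |
| # |
| #     with functorch_config.patch(activation_memory_budget=0.5): |
| #         out = torch.compile(model)(inp) |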
| |
| # This controls how we estimate the runtime when deciding what the cheapest |
| # operators to recompute are. The 3 options are |
| # "flops": Bases it off of the flop count provided by torch.utils.flop_counter |
| # "profile": Benchmarks each operator to come up with a runtime |
| # "testing": Returns 1 for everything |
| activation_memory_budget_runtime_estimator = "flops" |
| |
| # This controls the solver used for the 0-1 knapsack. By default we use a |
| # quantized DP solution ("dp"). The other options are "greedy" and "ilp" |
| # (the latter has a scipy dependency). |
| activation_memory_budget_solver = "dp" |
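| # A sketch combining a memory budget with a different runtime estimator and |
| # solver (just the string values documented above; `fn`/`x` are placeholders): |
| # |
| #     import torch |
| #     import torch._functorch.config as functorch_config |
| # |
| #     with functorch_config.patch( |
| #         activation_memory_budget=0.7, |
| #         activation_memory_budget_runtime_estimator="profile", |
| #         activation_memory_budget_solver="greedy", |
| #     ): |
| #         out = torch.compile(fn)(x) |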
| |
| # This dumps out a png visualization of the expected runtime vs. activation |
| # memory tradeoffs for all memory budget values from 0 to 1 in increments of |
| # 0.05. See an example here: |
| # https://github.com/pytorch/pytorch/pull/126320#discussion_r1625104015 |
| visualize_memory_budget_pareto = ( |
| os.environ.get("PARTITIONER_MEMORY_BUDGET_PARETO", "0") == "1" |
| ) |
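| # To generate the visualization (a sketch; the environment variable is read |
| # when this module is imported, so set it before `import torch`; `fn`/`x` are |
| # placeholders): |
| # |
| #     import os |
| #     os.environ["PARTITIONER_MEMORY_BUDGET_PARETO"] = "1" |
| # |
| #     import torch |
| #     out = torch.compile(fn)(x)  # the pareto plot is dumped during compilation |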
| |
| # Sets all of the ban_recompute heuristics to False except ban_recompute_reductions |
| # Generally, this will probably result in some memory improvement, but at the |
| # cost of some performance |
| aggressive_recomputation = False |
| |
| # If FakeTensor.data_ptr() should error. |
| # This option is independent of AOTAutograd and torch.compile, but our policy |
| # is to turn it off during torch.compile. |
| fake_tensor_allow_unsafe_data_ptr_access = True |
| |
| # Unlifts effect tokens from the inputs/outputs in the traced graph and instead |
| # inserts make_token/sink_token calls in the graph to create tokens and then |
| # sink them at the end. Note that this means the graph is no longer functional |
| # which may lead to silent errors unless the backend knows how to handle the |
| # tokens. |
| unlift_effect_tokens = False |
| |
| # This mode specifies that we should also keep track of the real |
| # tensor along with the fake tensor, and do real compute. While |
| # seemingly this eliminates the whole point of fake tensors, there are |
| # two obvious use cases for it: |
| # |
| # 1. When users call item()/other data dependent operations, |
| # if we propagate_real_tensors we are able to determine what |
| # the true value is and keep going. |
| # |
| # 2. It can be useful for testing, when you want to see if the fake |
| # and real tensors agree with each other. (Note that there are |
| # currently known inaccuracies in how we clone real tensors, that |
| # would have to be tightened up for this to be useful in this |
| # case.) |
| # |
| # Note that fake tensors are typically understood to be cheap to store |
| # indefinitely, so we tend to hold on to them longer than we would |
| # hold onto the real tensors. So we also support you explicitly |
| # deallocating the real tensor associated with a fake tensor, at which |
| # point we will stop propagating real tensors. |
| # |
| # One more thing: when you provide a real tensor to fakeify, we will |
| # clone it, so that we can safely perform mutations on it if necessary. |
| # This will increase live memory usage. This could potentially be |
| # optimized by using COW. We also currently do not faithfully |
| # maintain autograd metadata on the real tensor; this is fine because |
| # AOTAutograd will only use the fake tensor to determine leafness/etc |
| # of tensors in question. |
| fake_tensor_propagate_real_tensors = False |
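| # A sketch of use case (1) above (illustrative only; whether tracing fully |
| # succeeds still depends on the rest of the stack): |
| # |
| #     import torch |
| #     import torch._functorch.config as functorch_config |
| # |
| #     def fn(x): |
| #         n = x.sum().item()  # data-dependent value |
| #         return torch.zeros(int(n)) |
| # |
| #     with functorch_config.patch(fake_tensor_propagate_real_tensors=True): |
| #         out = torch.compile(fn, fullgraph=True)(torch.ones(4, dtype=torch.int64)) |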
| |
| # This controls whether we collect donated buffers. This flag must be set to |
| # False if a user wants to call backward with retain_graph=True. |
| donated_buffer = False |
| |
| # Controls the default graph output format used by draw_graph |
| # Supported formats are defined here https://graphviz.org/docs/outputs/ |
| torch_compile_graph_format = os.environ.get("TORCH_COMPILE_GRAPH_FORMAT", "svg") |
| |
| |
| # Raise an error on BypassAOTAutogradCache instead of just warning. |
| # Used for tests. |
| strict_autograd_cache = False |
| |
| if TYPE_CHECKING: |
| from torch.utils._config_typing import * # noqa: F401, F403 |
| |
| from torch.utils._config_module import install_config_module |
| |
| |
| # adds patch, save_config, invalid config checks, etc |
| install_config_module(sys.modules[__name__]) |
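| # A sketch of typical usage of the helpers that install_config_module adds |
| # (context-manager and decorator forms of `patch`; `my_test` is a placeholder): |
| # |
| #     import torch._functorch.config as functorch_config |
| # |
| #     # context-manager form |
| #     with functorch_config.patch(debug_partitioner=True): |
| #         ... |
| # |
| #     # decorator form, common in tests |
| #     @functorch_config.patch(strict_autograd_cache=True) |
| #     def my_test(): |
| #         ... |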