| # Copyright (c) Facebook, Inc. and its affiliates. |
| # All rights reserved. |
| # |
| # This source code is licensed under the BSD-style license found in the |
| # LICENSE file in the root directory of this source tree. |
| |
| """ |
| Global flags for aot autograd |
| """ |
| import os |
| import sys |
| from typing import TYPE_CHECKING |
| |
| |
| # Converts torch RNG ops to their functional Philox RNG equivalents. Note that |
| # we only functionalize CUDA RNG ops today. |
| functionalize_rng_ops = False |
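| # Example (a minimal sketch; assumes this module is importable as |
| # torch._functorch.config and uses the `patch` helper installed by |
| # install_config_module at the bottom of this file): |
| # |
| #     import torch |
| #     import torch._functorch.config as functorch_config |
| # |
| #     def fn(x): |
| #         return torch.nn.functional.dropout(x, p=0.5) |
| # |
| #     x = torch.randn(8, 8, device="cuda")  # only CUDA RNG ops are functionalized |
| #     with functorch_config.patch(functionalize_rng_ops=True): |
| #         out = torch.compile(fn)(x) |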
| |
| # can be useful for debugging if we are incorrectly creating meta fake tensors |
| fake_tensor_allow_meta = os.environ.get("FAKE_ALLOW_META", "1") != "0" |
| |
| # Enables optional asserts in hotpath code to check for errors. If |
| # you are seeing weird accuracy problems, try turning this on. |
| # This is currently off by default as it will harm tracing time, |
| # but it is on by default for aot_eager. |
| debug_assert = False |
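| # To enable the asserts for a single compilation (an illustrative sketch; `fn` |
| # and `x` are placeholders, and torch._functorch.config refers to this module): |
| # |
| #     import torch |
| #     import torch._functorch.config as functorch_config |
| # |
| #     with functorch_config.patch(debug_assert=True): |
| #         out = torch.compile(fn)(x) |
| # |
| #     # or use the aot_eager backend, which turns them on by default: |
| #     out = torch.compile(fn, backend="aot_eager")(x) |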
| |
| # Enables extra debug output from the partitioner. |
| debug_partitioner = os.environ.get("AOT_PARTITIONER_DEBUG", "0") != "0" |
| |
| # Today, if you are in a situation where there is "false aliasing" |
| # (e.g. you have a bunch of model parameters that all alias the same underlying buffer), |
| # our checks for this situation are very slow if these inputs have dynamic shapes. |
| # This config is set to ensure that there aren't too many aliased inputs in this situation, |
| # so that we error loudly instead of compiling forever. |
| # Eventually, we should make these checks faster. |
| # For now, however, you can simply turn off dynamic shapes by marking your inputs static |
| # when you run into this situation. |
| _max_aliased_inputs_with_dynamic_shapes_enabled = 5 |
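| # One way to "turn off dynamic shapes" as suggested above (a sketch; `fn` and |
| # `x` are placeholders): |
| # |
| #     import torch |
| #     import torch._dynamo |
| # |
| #     compiled = torch.compile(fn, dynamic=False) |
| #     # or mark a particular input tensor's shape as static: |
| #     torch._dynamo.mark_static(x) |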
| |
| # Assume that model parameters and buffers have static shapes when they are |
| # fakeified (i.e. do not allocate symbolic shapes for weights). |
| static_weight_shapes = True |
| |
| # Applies CSE to the graph before partitioning |
| cse = True |
| |
| |
| # Enables AOTAutogradCache, which caches the results of AOTAutograd tracing so |
| # they can be reused across runs. |
| enable_autograd_cache = os.environ.get("ENABLE_AOT_AUTOGRAD_CACHE", "0") == "1" |
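| # A sketch of enabling the cache via the environment (the variable is read when |
| # this module is imported, so it must be set before `import torch`; `fn` is a |
| # placeholder): |
| # |
| #     import os |
| #     os.environ["ENABLE_AOT_AUTOGRAD_CACHE"] = "1" |
| # |
| #     import torch |
| #     compiled = torch.compile(fn) |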
| |
| # When AOTAutograd regenerates aliased graph outputs, |
| # attempt to use functionalization's view-replay logic |
| # before falling back to the autograd engine's view replay or as_strided. |
| # This can have some perf implications |
| # (although for many models this will not matter). |
| # (1) If you have many view ops chained together, replaying all of them |
| # at runtime can have more overhead compared to a single as_strided call |
| # (2) If you are doing training, AsStridedBackward is quite slow, |
| # and the individual view op backward formulas will likely be faster. |
| # (3) Some backends like XLA do not support as_strided |
| |
| # Temporary hack: disable this flag for internal builds |
| # (needed to fix an internal issue while avoiding a bump of the XLA pin). |
| # Eventually we should either default this config to False entirely |
| # once the XLA pin update lands, |
| # or default it to True and fix the relevant bugs. |
| from torch._inductor.config import is_fbcode |
| |
| |
| # View replay is currently not compatible with AOTAutogradCache, since |
| # FunctionalTensors are not serializable. We'll need to make them |
| # serializable before enabling warm cache with this config turned on. |
| view_replay_for_aliased_outputs = (not is_fbcode()) and (not enable_autograd_cache) |
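| # To force view replay on for a particular compile (a sketch; `model` and `inp` |
| # are placeholders, and note the AOTAutogradCache interaction described above): |
| # |
| #     import torch |
| #     import torch._functorch.config as functorch_config |
| # |
| #     with functorch_config.patch(view_replay_for_aliased_outputs=True): |
| #         out = torch.compile(model)(inp) |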
| |
| # Restricts the amount of computation AOTAutograd can do. |
| # NB: We have essentially disabled this heuristic now. However, this is kept |
| # here for now in case it's useful. Setting it low can artificially reduce the |
| # amount of recomputation AOTAutograd performs, although not in any kind of |
| # principled way. |
| max_dist_from_bw = 1000 |
| |
| |
| # Bans recomputation of nodes that read from nodes that are far before |
| # the current node. |
| ban_recompute_used_far_apart = True |
| # Breaks up long chains of fusible ops, as otherwise we can have an arbitrarily |
| # long chain of recomputation in the backwards pass. |
| ban_recompute_long_fusible_chains = True |
| # Bans recomputation of nodes that must be materialized in the backwards pass |
| # (used by a non-fusible node) |
| ban_recompute_materialized_backward = True |
| # Chooses to ban recomputation of nodes based off an allowlist. Setting it to |
| # False changes it to use a denylist. The main difference is for operators like |
| # sort/pool that aren't cheap enough to be fusible for free but also aren't |
| # that expensive. |
| ban_recompute_not_in_allowlist = True |
| # Chooses to ban recomputation of reductions. This is generally a good idea, as |
| # the result of reductions is generally very small but recomputing reductions in |
| # a fusion can be expensive. |
| ban_recompute_reductions = True |
| # Prevents the partitioner from ever saving views (i.e. always recompute them). |
| # Generally a good idea since views are free to recompute. |
| recompute_views = False |
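| # The heuristics above can be tweaked per compile via this module's `patch` |
| # helper (a sketch; `fn`/`x` are placeholders and the perf/memory effect is |
| # model dependent): |
| # |
| #     import torch |
| #     import torch._functorch.config as functorch_config |
| # |
| #     with functorch_config.patch( |
| #         recompute_views=True, |
| #         ban_recompute_used_far_apart=False, |
| #     ): |
| #         out = torch.compile(fn)(x) |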
| |
| # By default, the partitioner is purely trying to optimize for runtime (although |
| # it should always use less memory than eager) |
| # This knob lets you control that tradeoff: the partitioner chooses the fastest |
| # strategy whose saved activations fit within the memory budget. |
| # Specifically, 0.0 corresponds to the activation memory from applying |
| # activation checkpointing to the full compiled region, and 1.0 corresponds to |
| # the activation memory from the default runtime-optimized strategy. So, 0.4 |
| # would result in a strategy that saves 40% of the activations compared to the |
| # default strategy. |
| # It solves a 0-1 knapsack to find the minimum recompute necessary to stay below |
| # the activation memory budget. |
| activation_memory_budget = 1.0 |
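| # For example, to cap saved activations at roughly half of what the default |
| # strategy would save (a sketch; `model` and `inp` are placeholders): |
| # |
| #     import torch |
| #     import torch._functorch.config as functorch_config |
| # |
| #     with functorch_config.patch(activation_memory_budget=0.5): |
| #         out = torch.compile(model)(inp) |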
| |
| # This controls how we estimate the runtime when deciding what the cheapest |
| # operators to recompute are. The 3 options are |
| # "flops": Bases it off of the flop count provided by torch.utils.flop_counter |
| # "profile": Benchmarks each operator to come up with a runtime |
| # "testing": Returns 1 for everything |
| activation_memory_budget_runtime_estimator = "flops" |
| |
| # This controls the solver used for the 0-1 knapsack. By default we use a |
| # quantized DP solution ("dp"). The other options are "greedy" and "ilp" |
| # (the latter has a scipy dependency). |
| activation_memory_budget_solver = "dp" |
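| # A sketch combining a memory budget with a different runtime estimator and |
| # solver (just the string values documented above; `fn`/`x` are placeholders): |
| # |
| #     import torch |
| #     import torch._functorch.config as functorch_config |
| # |
| #     with functorch_config.patch( |
| #         activation_memory_budget=0.7, |
| #         activation_memory_budget_runtime_estimator="profile", |
| #         activation_memory_budget_solver="greedy", |
| #     ): |
| #         out = torch.compile(fn)(x) |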
| |
| # This dumps out a png visualization of the expected runtime vs. activation |
| # memory tradeoffs for all memory budget values from 0 to 1 in increments of |
| # 0.05. See an example here: |
| # https://github.com/pytorch/pytorch/pull/126320#discussion_r1625104015 |
| visualize_memory_budget_pareto = ( |
| os.environ.get("PARTITIONER_MEMORY_BUDGET_PARETO", "0") == "1" |
| ) |
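| # To generate the visualization (a sketch; the environment variable is read |
| # when this module is imported, so set it before `import torch`; `fn`/`x` are |
| # placeholders): |
| # |
| #     import os |
| #     os.environ["PARTITIONER_MEMORY_BUDGET_PARETO"] = "1" |
| # |
| #     import torch |
| #     out = torch.compile(fn)(x)  # the pareto plot is dumped during compilation |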
| |
| # Sets all of the ban_recompute heuristics to False except ban_recompute_reductions |
| # Generally, this will probably result in some memory improvement, but at the |
| # cost of some performance |
| aggressive_recomputation = False |
| |
| # If FakeTensor.data_ptr() should error. |
| # This option is independent of AOTAutograd and torch.compile, but our policy |
| # is to turn it off during torch.compile. |
| fake_tensor_allow_unsafe_data_ptr_access = True |
| |
| # Unlifts effect tokens from the inputs/outputs in the traced graph and instead |
| # inserts make_token/sink_token calls in the graph to create tokens and then |
| # sink them at the end. Note that this means the graph is no longer functional |
| # which may lead to silent errors unless the backend knows how to handle the |
| # tokens. |
| unlift_effect_tokens = False |
| |
| # This mode specifies that we should also keep track of the real |
| # tensor along with the fake tensor, and do real compute. While |
| # seemingly this eliminates the whole point of fake tensors, there are |
| # two obvious use cases for it: |
| # |
| # 1. When users call item()/other data dependent operations, |
| # if we propagate_real_tensors we are able to determine what |
| # the true value is and keep going. |
| # |
| # 2. It can be useful for testing, when you want to see if the fake |
| # and real tensors agree with each other. (Note that there are |
| # currently known inaccuracies in how we clone real tensors, that |
| # would have to be tightened up for this to be useful in this |
| # case.) |
| # |
| # Note that fake tensors are typically understood to be cheap to store |
| # indefinitely, so we tend to hold on to them longer than we would |
| # hold onto the real tensors. So we also support you explicitly |
| # deallocating the real tensor associated with a fake tensor, at which |
| # point we will stop propagating real tensors. |
| # |
| # One more thing: when you provide a real tensor to fakeify, we will |
| # clone it, so that we can safely perform mutations on it if necessary. |
| # This will increase live memory usage. This could potentially be |
| # optimized by using COW. We also currently do not faithfully |
| # maintain autograd metadata on the real tensor; this is fine because |
| # AOTAutograd will only use the fake tensor to determine leafness/etc |
| # of tensors in question. |
| fake_tensor_propagate_real_tensors = False |
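| # A sketch of use case (1) above (illustrative only; whether tracing fully |
| # succeeds still depends on the rest of the stack): |
| # |
| #     import torch |
| #     import torch._functorch.config as functorch_config |
| # |
| #     def fn(x): |
| #         n = x.sum().item()  # data-dependent value |
| #         return torch.zeros(int(n)) |
| # |
| #     with functorch_config.patch(fake_tensor_propagate_real_tensors=True): |
| #         out = torch.compile(fn, fullgraph=True)(torch.ones(4, dtype=torch.int64)) |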
| |
| # This controls whether we collect donated buffers. This flag must be set to |
| # False if a user wants to call backward with retain_graph=True. |
| donated_buffer = False |
| |
| # Controls the default graph output format used by draw_graph |
| # Supported formats are defined here https://graphviz.org/docs/outputs/ |
| torch_compile_graph_format = os.environ.get("TORCH_COMPILE_GRAPH_FORMAT", "svg") |
| |
| |
| # Raise an error on BypassAOTAutogradCache instead of just warning. |
| # Used for tests. |
| strict_autograd_cache = False |
| |
| if TYPE_CHECKING: |
| from torch.utils._config_typing import * # noqa: F401, F403 |
| |
| from torch.utils._config_module import install_config_module |
| |
| |
| # adds patch, save_config, invalid config checks, etc |
| install_config_module(sys.modules[__name__]) |
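| # A sketch of typical usage of the helpers that install_config_module adds |
| # (context-manager and decorator forms of `patch`; `my_test` is a placeholder): |
| # |
| #     import torch._functorch.config as functorch_config |
| # |
| #     # context-manager form |
| #     with functorch_config.patch(debug_partitioner=True): |
| #         ... |
| # |
| #     # decorator form, common in tests |
| #     @functorch_config.patch(strict_autograd_cache=True) |
| #     def my_test(): |
| #         ... |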