Fix typo under torch/_inductor directory (#110530)

This PR fixes typos in comments and messages in files under the `torch/_inductor` directory.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/110530
Approved by: https://github.com/kit1980
diff --git a/torch/_inductor/autotune_process.py b/torch/_inductor/autotune_process.py
index 7a840ec..632290d 100644
--- a/torch/_inductor/autotune_process.py
+++ b/torch/_inductor/autotune_process.py
@@ -313,7 +313,7 @@
 
         results = {}
 
-        # Use a ThreadExecutorPool to spread the work across the subproccesses and
+        # Use a ThreadExecutorPool to spread the work across the subprocesses and
         # to grab subprocesses as soon as they're free.
         for choice, result in zip(choices, self.executor.map(self.target, choices)):
             results[choice] = result
diff --git a/torch/_inductor/codegen/cuda/gemm_template.py b/torch/_inductor/codegen/cuda/gemm_template.py
index b0b2aef..efbc0de 100644
--- a/torch/_inductor/codegen/cuda/gemm_template.py
+++ b/torch/_inductor/codegen/cuda/gemm_template.py
@@ -21,7 +21,7 @@
 {{template.globals().getvalue()}}
 {{instance_definition}}
 // When workspace_size is not a nullptr, populates requested workspace_size and returns.
-// Otherwise, compuates the Gemm kernel using the given workspace ptr.
+// Otherwise, computes the Gemm kernel using the given workspace ptr.
 extern "C" {
 {{kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=input_reorder)}} {
   try {
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 948c35f..fe66241 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -1271,7 +1271,7 @@
                     return None
                 elif assert_min and assert_max:
                     # The conditions need to be in parens because of Python's operator precedence.
-                    # It'd be less error-prone to use and/or/not, which is suported by triton
+                    # It'd be less error-prone to use and/or/not, which is supported by triton
                     cond = f"(0 <= {self.var}) & ({self.var} < {size_str})"
                     cond_print = f"0 <= {self.var} < {size_str}"
                 elif assert_min:
@@ -1918,7 +1918,7 @@
         for numel in self.numels:
             numel_hint = V.graph.sizevars.symbolic_hint(numel)
             if not isinstance(numel_hint, (int, sympy.Integer)):
-                # This default heuristic hint was picked carefuly: it is
+                # This default heuristic hint was picked carefully: it is
                 # large, to ensure that we don't shrink the block size (since
                 # if you don't have many elements, it'd be wasteful to pick a
                 # large block size).  Since we don't know how many elements we
@@ -2184,7 +2184,7 @@
         for arg_name in call_args:
             buf = V.graph.get_buffer(arg_name)
             if buf and len(buf.layout.size) == 4:
-                # ignore the tensor if only 1 dimention is non-zero
+                # ignore the tensor if only 1 dimension is non-zero
                 if len([x for x in buf.layout.size if x == 1]) == 3:
                     continue
                 stride_order = ir.get_stride_order(buf.layout.stride)
@@ -2519,7 +2519,7 @@
             if not any(
                 isinstance(n, ForeachKernelSchedulerNode) for n in node_schedule
             ):
-                # We probablly should look what are the nodes inside a foreach
+                # We probably should look what are the nodes inside a foreach
                 # schedule node
                 node_names = [
                     n.get_name()
diff --git a/torch/_inductor/codegen/wrapper.py b/torch/_inductor/codegen/wrapper.py
index cf6277d..e469a15 100644
--- a/torch/_inductor/codegen/wrapper.py
+++ b/torch/_inductor/codegen/wrapper.py
@@ -1116,7 +1116,7 @@
                 f"""std::vector<at::Tensor> {self.call_func_name}(const std::vector<at::Tensor>& inputs) {{"""
             )
         with self.prefix.indent():
-            # assign inputs and outpus in both cases so the later codegen can be simplified
+            # assign inputs and outputs in both cases so the later codegen can be simplified
             if V.graph.aot_mode:
                 if config.aot_inductor.abi_compatible:
                     self.prefix.splice(
diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py
index 85dd1d4..c77e16f 100644
--- a/torch/_inductor/compile_fx.py
+++ b/torch/_inductor/compile_fx.py
@@ -297,7 +297,7 @@
     """
     Inductor API that compiles a single graph.
 
-    If you change the argument list for this funtion, make sure you
+    If you change the argument list for this function, make sure you
     also update the call to save_args_for_compile_fx_inner below accordingly.
     """
     if dynamo_utils.count_calls(gm.graph) == 0:
@@ -361,7 +361,7 @@
 
         # doesnt work for non-trees because the warmup run would apply mutation twice
         if config.triton.cudagraph_trees:
-            # checking if mutation is only on paramameters/static inputs
+            # checking if mutation is only on parameters/static inputs
             has_mutation = not all(
                 idx < num_fixed for idx in compiled_graph.mutated_input_idxs
             )
@@ -1076,9 +1076,9 @@
             # For training
             #   len(orig_model_outputs) <= len(model_outputs)
             # During training, most of the time the model_outputs starts with
-            # orignal module's outputs followed by saved activations.
+            # original module's outputs followed by saved activations.
             # But this can be not true if the model have inplace updated tensors.
-            # AOTAutograd will make those tensors being returned before the orignal
+            # AOTAutograd will make those tensors being returned before the original
             # module's output.
             # To make things safe, we'll use original_output_start_index field
             # set by AOTAutograd to decide where the original module outputs start.
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index 2f9884e..13bfa43 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -48,7 +48,7 @@
 # enable pattern match+replace optimizations
 pattern_matcher = True
 
-# register custom graph optimizatin pass hook. so far, pre/post passes are
+# register custom graph optimization pass hook. so far, pre/post passes are
 # only applied before/after pattern_matcher in post_grad_passes.
 #
 # def my_custom_pre_pass(graph: torch.fx.graph.Graph):
@@ -423,7 +423,7 @@
     # the max number of spills we allow for the configs we benchmark.
     # Setting this to 0 means we skip a config if it spills even a single
     # register.
-    # Settting it to a larger value allows a config spilling a small amount
+    # Setting it to a larger value allows a config spilling a small amount
     # of registers being benchmarked.
     #
     # NOTE: triton will always report >0 register spills for kernels using sin/cos.
diff --git a/torch/_inductor/coordinate_descent_tuner.py b/torch/_inductor/coordinate_descent_tuner.py
index 016add2..d0e1efd 100644
--- a/torch/_inductor/coordinate_descent_tuner.py
+++ b/torch/_inductor/coordinate_descent_tuner.py
@@ -38,10 +38,10 @@
     """
     The coordinate descent tuner. Tune one field/coordinate at a time.
 
-    TODO will it be necessary to tune multiple fields simultanuously.
+    TODO will it be necessary to tune multiple fields simultaneously.
 
 
-    TODO: what if both increasing and descreasing a field can improve perf.
+    TODO: what if both increasing and decreasing a field can improve perf.
           i.e., there are multiple local optima..
     """
 
@@ -224,7 +224,7 @@
         Check if candidate_config is better than best_config.
 
         Return a touple of (compare_result, candidate_timing).
-        compare_result is true iff condidate_config is better.
+        compare_result is true iff candidate_config is better.
         """
         log.debug("Try config %s", candidate_config)
         try:
diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index 98b3330..3fe547e 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -247,7 +247,7 @@
     elif 1 < len(filtered_tensors) < len(tensors):
         # on the first call, when we remove empty tensors, we redispatch recursively
         return aten.cat.default(filtered_tensors, dim)
-    # when no 'filtering' has occured, we raise to prevent infinite recursion (no more decomposition needed)
+    # when no 'filtering' has occurred, we raise to prevent infinite recursion (no more decomposition needed)
     return NotImplemented
 
 
diff --git a/torch/_inductor/fx_passes/quantization.py b/torch/_inductor/fx_passes/quantization.py
index 4d1b06f..cc120c0 100644
--- a/torch/_inductor/fx_passes/quantization.py
+++ b/torch/_inductor/fx_passes/quantization.py
@@ -642,7 +642,7 @@
     )
     def dequant_promotion(match: Match, *args, **kwargs):
         # If dequant pattern used by multiply nodes,
-        # we will do dequant promotion. So each user node has a seperate dequant pattern connected.
+        # we will do dequant promotion. So each user node has a separate dequant pattern connected.
         def clone_to_new_node(graph, source_node, user_node):
             assert (
                 source_node.op == "call_function"
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 790be43..860b214 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -346,7 +346,7 @@
         #
         # We disable layout optimization if a model contains aten._scaled_dot_product_flash_attention.
         #
-        # An alternative is to do necessary layout convertion to make sure aten._scaled_dot_product_flash_attention's
+        # An alternative is to do necessary layout conversion to make sure aten._scaled_dot_product_flash_attention's
         # inputs have the layout needed. But that seems to have worse perf than disabing the layout opt.
         # TODO(shunting) revisit if we can still apply layout optimization to models containing sdpa while
         # bringing perf gains.
@@ -909,7 +909,7 @@
 
         device_types = self.device_types.copy()
         # In terms of some operations that don't have input tensors, we need to
-        # check the deivce of the buffers.
+        # check the device of the buffers.
         for buffer in self.buffers:
             device_types.add(buffer.get_device().type)
         device_types.discard("cpu")
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 79af596..07ba097 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -5918,7 +5918,7 @@
     Wrap the input of your inplace op to enable this behavior.
 
     The design is based on two key decisions:
-    - this node is resposible for allocating the in/out buffer used by the collective.
+    - this node is responsible for allocating the in/out buffer used by the collective.
         This is controlled by the ``should_allocate`` method that returns True here and
         False for the collective node
     - The scheduler special-case this node and enable it to reuse its input.
@@ -5958,7 +5958,7 @@
 class MultiOutputNoSizeAssert(MultiOutput):
     """
     Extract partial output from a multi-output OP.
-    Works like MultiOutput but doesn't assert size. This must be a property guaranteed by the op emiting this.
+    Works like MultiOutput but doesn't assert size. This must be a property guaranteed by the op emitting this.
     """
 
     def __init__(self, layout, input, index):
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index d1823a6..eafc9dd 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -1553,7 +1553,7 @@
         if torch._dynamo.config.suppress_errors:
             torch._dynamo.config.suppress_errors = False
             log.warning(
-                "A make_fallback error occured in suppress_errors config,"
+                "A make_fallback error occurred in suppress_errors config,"
                 " and suppress_errors is being disabled to surface it."
             )
         raise AssertionError(
@@ -1584,7 +1584,7 @@
 @register_lowering(torch.ops.rngprims.philox_rand, type_promotion_kind=None)
 def philox_rand(size, seed, offset, stride, device, dtype):
     # stride arg is optional and will be used in future for distributed random
-    # ops. Currently, its ununsed.
+    # ops. Currently, it's unused.
     random_pos = ir.FixedLayout(
         device,
         dtype,
diff --git a/torch/_inductor/pattern_matcher.py b/torch/_inductor/pattern_matcher.py
index c3d3188..bb450a4 100644
--- a/torch/_inductor/pattern_matcher.py
+++ b/torch/_inductor/pattern_matcher.py
@@ -531,7 +531,7 @@
         if not isinstance(node, (list, tuple)) or len(node) == 0:
             return FailedMatch("non_list")
         m = Match(self)
-        # Propogating patterns with multiple users will ensure we don't revisit
+        # Propagating patterns with multiple users will ensure we don't revisit
         # the same nodes
         pattern_to_node = ctx.filter_multi_user_patterns()
         matched = False
@@ -851,7 +851,7 @@
     """
     Create a replacement rule based on example functions that get traced
     to create patterns.  This supports both training and inference when
-    run on a joint foward+backward graph.
+    run on a joint forward+backward graph.
 
     Args:
         search_fn: traced to give original pattern
diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
index d9458f3..0194328 100644
--- a/torch/_inductor/scheduler.py
+++ b/torch/_inductor/scheduler.py
@@ -1509,7 +1509,7 @@
         The current attempt is a quick, possibly hacky, heuristic to prevent the
         fusion of nodes that are far away in the original order.
 
-        A better but difficult to implement heursitic would be to use live
+        A better but difficult to implement heuristic would be to use live
         intervals of the buffers, find region of peak pressure in the original
         program and prevent fusion that crosses that peak region. We might need
         special care or good approximation in this implementation, as fusion of
diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py
index 4ceed63..8f78daf 100644
--- a/torch/_inductor/sizevars.py
+++ b/torch/_inductor/sizevars.py
@@ -231,7 +231,7 @@
     # Note - [On Statically Known]
     #
     # The statically_known_* family of functions below replaces a prior system, called maybe_guard_*. The prior system
-    # operated by providing esentially a question, where the size hinted values were evaluted. If the condition was
+    # operated by providing essentially a question, where the size hinted values were evaluated. If the condition was
     # true, we add a guard and return True, otherwise, False.
     #
     # def maybe_guard_foo(args):
diff --git a/torch/_inductor/triton_heuristics.py b/torch/_inductor/triton_heuristics.py
index 109e1b3..c8b8ebf 100644
--- a/torch/_inductor/triton_heuristics.py
+++ b/torch/_inductor/triton_heuristics.py
@@ -1064,7 +1064,7 @@
                 triton_config_reduction(size_hints, 64, 64),
                 triton_config_reduction(size_hints, 8, 512),
                 # halve the XBLOCK/RBLOCK compared to outer_config
-                # TODO: this may only be beneficial when each iteration of the reduciton
+                # TODO: this may only be beneficial when each iteration of the reduction
                 # is quite heavy. E.g. https://gist.github.com/shunting314/189a8ef69f90db9d614a823385147a72
                 triton_config_reduction(size_hints, 64, 4, num_warps=8),
             ],
diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
index aea8ea8..97d8504 100644
--- a/torch/_inductor/utils.py
+++ b/torch/_inductor/utils.py
@@ -840,7 +840,7 @@
 @contextlib.contextmanager
 def override_lowering(aten_op, override_fn):
     """
-    Override the lowering of aten_op with overide_fn.
+    Override the lowering of aten_op with override_fn.
     The first argument of override_fn is the original lowering fn.
     """
     from torch._inductor import lowering
diff --git a/torch/_inductor/wrapper_benchmark.py b/torch/_inductor/wrapper_benchmark.py
index 93b95a5..66b2b71 100644
--- a/torch/_inductor/wrapper_benchmark.py
+++ b/torch/_inductor/wrapper_benchmark.py
@@ -30,7 +30,7 @@
 def get_kernel_category(kernel_mod):
     """
     Given the module defining a triton kernel, return the category of the kernel.
-    Cateogry can be one of:
+    Category can be one of:
     - pointwise
     - reduction
     - persistent_reduction