[Inductor] Setting kernel launch and exit callbacks for inductor generated triton kernels (#119450)

`CompiledKernel.launch_enter_hook` and `CompiledKernel.launch_exit_hook` are hooks that let external tools monitor the execution of Triton kernels and read each kernel's metadata. Both hooks default to `None`.

Triton's own kernel launcher passes these hooks and the kernel metadata by default, while Inductor's launcher passes `None` for them. This PR unifies the parameters passed to both launchers so that tools can collect information from both handwritten Triton kernels and Inductor-generated Triton kernels.
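
As a rough illustration (not part of this patch), a profiling tool could register the hooks as sketched below; the exact arguments Triton passes to the hooks are version dependent, so the sketch treats them as opaque:

```python
from triton.compiler import CompiledKernel

launch_records = []

def enter_hook(*args, **kwargs):
    # Record whatever launch metadata Triton passes in; its exact shape
    # is Triton-version dependent, so it is kept opaque here.
    launch_records.append(("enter", args, kwargs))

def exit_hook(*args, **kwargs):
    launch_records.append(("exit", args, kwargs))

CompiledKernel.launch_enter_hook = enter_hook
CompiledKernel.launch_exit_hook = exit_hook
```

With this change, kernels generated by `torch.compile` invoke these hooks as well, which is what the new test below asserts.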

Pull Request resolved: https://github.com/pytorch/pytorch/pull/119450
Approved by: https://github.com/jansel
diff --git a/test/inductor/test_profiler.py b/test/inductor/test_profiler.py
index 0e85b94..401097e 100644
--- a/test/inductor/test_profiler.py
+++ b/test/inductor/test_profiler.py
@@ -118,6 +118,34 @@
 
             self._test_profiling_kernel_names(fn, args, "_for_")
 
+    @unittest.skipIf(not HAS_TRITON, "requires cuda & triton")
+    def test_inductor_profiling_triton_hooks(self):
+        from triton.compiler import CompiledKernel
+
+        hooks_called = {"enter": False, "exit": False}
+
+        def launch_enter_hook(*args):
+            hooks_called["enter"] = True
+
+        def launch_exit_hook(*args):
+            hooks_called["exit"] = True
+
+        CompiledKernel.launch_enter_hook = launch_enter_hook
+        CompiledKernel.launch_exit_hook = launch_exit_hook
+
+        def fn(x, y):
+            return torch._foreach_add(x, y)
+
+        x = [torch.rand((4, 4), device="cuda") for _ in range(3)]
+        y = [torch.rand((4, 4), device="cuda") for _ in range(3)]
+
+        args = (x, y)
+        fn_opt = torch.compile(fn)
+        fn_opt(*args)
+
+        self.assertTrue(hooks_called["enter"])
+        self.assertTrue(hooks_called["exit"])
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_inductor/triton_heuristics.py b/torch/_inductor/triton_heuristics.py
index 8330615..b40a5de 100644
--- a/torch/_inductor/triton_heuristics.py
+++ b/torch/_inductor/triton_heuristics.py
@@ -399,7 +399,10 @@
 
                 runner(grid_0, grid_1, grid_2, num_warps,
                             *cta_args, shared,
-                            stream, function, None, None, None,
+                            stream, function,
+                            bin.launch_enter_hook,
+                            bin.launch_exit_hook,
+                            bin.metadata,
                             {', '.join(call_args)})
                 return bin
             """.lstrip(),