[PyTorch] Record Sequence Number to Match Forward and Backward Operators (2nd try) (#79753)

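Summary: Record each operator's sequence number (`fn.seqNr()`) as a new `seq_id` field in the execution graph observer's JSON node output, so that forward and backward operators can be matched. Nodes written with fixed values rather than from a recorded function get `seq_id` -1.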

Test Plan:
```
buck build mode/dev-nosan cea/ml_perf_model/gpu/scripts: --show-output
buck-out/gen/caffe2/test/profiler#binary.par test_profiler.TestExecutionGraph.test_execution_graph_start_stop
```
Outputs with seq_id: P505545974

Differential Revision: D37223871

Pull Request resolved: https://github.com/pytorch/pytorch/pull/79753
Approved by: https://github.com/robieta
diff --git a/test/test_profiler.py b/test/test_profiler.py
index 32ca3fb..98b9bf0 100644
--- a/test/test_profiler.py
+++ b/test/test_profiler.py
@@ -210,13 +210,14 @@
         u = torch.randn(3, 4, 5, requires_grad=True)
         with record_function("## TEST 1 ##", "1, 2, 3"):
             rf_handle = _record_function_with_args_enter("## TEST 2 ##", 1, False, 2.5, [u, u], (u, u), "hello", u)
-            x = torch.randn(10, 10)
+            x = torch.randn(10, 10, requires_grad=True)
             if use_cuda:
                 x = x.cuda()
-            y = torch.randn(10, 10)
+            y = torch.randn(10, 10, requires_grad=True)
             if use_cuda:
                 y = y.cuda()
             z = x + y + x * y + x * y
+            z.backward(z)
             if use_cuda:
                 z = z.cpu()
             _record_function_with_args_exit(rf_handle)
diff --git a/torch/csrc/profiler/execution_graph_observer.cpp b/torch/csrc/profiler/execution_graph_observer.cpp
index 2a9deb4..a3f4137 100644
--- a/torch/csrc/profiler/execution_graph_observer.cpp
+++ b/torch/csrc/profiler/execution_graph_observer.cpp
@@ -245,6 +245,7 @@
     const uint64_t rf_id,
     const uint64_t parent,
     const uint64_t fw_parent,
+    const int64_t seq_id,
     const uint64_t scope,
     const uint64_t tid,
     const uint64_t fw_tid,
@@ -258,7 +259,7 @@
   out << fmt::format(
       R"JSON(
     {{
-      "name": "{}", "id": {}, "rf_id": {}, "parent": {}, "fw_parent": {}, "scope": {}, "tid": {}, "fw_tid": {}, "op_schema": "{}",
+      "name": "{}", "id": {}, "rf_id": {}, "parent": {}, "fw_parent": {}, "seq_id": {}, "scope": {}, "tid": {}, "fw_tid": {}, "op_schema": "{}",
       "inputs": {}, "input_shapes": {}, "input_types": {},
       "outputs": {}, "output_shapes": {}, "output_types": {}
     }})JSON",
@@ -267,6 +268,7 @@
       rf_id,
       parent,
       fw_parent,
+      seq_id,
       scope,
       tid,
       fw_tid,
@@ -322,6 +324,7 @@
       0, // rf_id
       root_id, // parent is self
       0, // fw_parent
+      -1, // seq_id
       static_cast<std::underlying_type_t<RecordScope>>(RecordScope::USER_SCOPE),
       0, // tid
       0); // fw_tid
@@ -430,6 +433,7 @@
           0, // rf_id
           root_id,
           0, // fw_parent
+          -1, // seq_id
           static_cast<std::underlying_type_t<RecordScope>>(
               RecordScope::USER_SCOPE),
           tid,
@@ -546,6 +550,7 @@
           fn.handle(),
           fc.parent_id,
           fc.fw_parent_id,
+          fn.seqNr(),
           static_cast<std::underlying_type_t<RecordScope>>(fn.scope()),
           fn.threadId(),
           fn.forwardThreadId(),