Run C++ testcases in parallel with pytest-xdist (#101440)

After an investigation, running C++ tests with https://github.com/pytest-dev/pytest-cpp is just slower than running them directly, plain and simple. I'm curious about the exact root cause, but that's a story for another day.

`time build/bin/test_lazy` takes half a minute to run 610 tests on `linux-bionic-cuda11.8-py3.10-gcc7 / test (default, 2, 5, linux.4xlarge.nvidia.gpu)`, while `time pytest /var/lib/jenkins/workspace/build/bin/test_lazy -v` takes 20+ minutes on the same runner. This is a very high price to pay.

The saving grace here is that https://github.com/pytest-dev/pytest-cpp supports pytest-xdist to run tests in parallel with `-n auto`, so `time pytest /var/lib/jenkins/workspace/build/bin/test_lazy -v -n auto` takes only 3 minutes. This is still not as fast as running C++ tests directly, but it's an order of magnitude faster than running them sequentially.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/101440
Approved by: https://github.com/clee2000
diff --git a/test/run_test.py b/test/run_test.py
index 52eb4fc..b0486bd 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -483,16 +483,24 @@
     unittest_args = options.additional_unittest_args.copy()
     test_file = test_module
     stepcurrent_key = test_file
+
+    use_sharded_test = False
     if isinstance(test_file, ShardedTest):
-        # C++ tests work with pytest sharding
-        unittest_args.extend(
-            [
-                f"--shard-id={test_module.shard - 1}",
-                f"--num-shards={test_module.num_shards}",
-            ]
-        )
         test_file = test_module.name
-        stepcurrent_key = f"{test_file}_{test_module.shard - 1}"
+        use_sharded_test = True
+
+    is_cpp_test = test_file.startswith(CPP_TEST_PREFIX)
+    if use_sharded_test:
+        if is_cpp_test:
+            stepcurrent_key = test_file
+        else:
+            unittest_args.extend(
+                [
+                    f"--shard-id={test_module.shard - 1}",
+                    f"--num-shards={test_module.num_shards}",
+                ]
+            )
+            stepcurrent_key = f"{test_file}_{test_module.shard - 1}"
 
     if options.verbose:
         unittest_args.append(f'-{"v"*options.verbose}')  # in case of pytest
@@ -506,7 +514,6 @@
         assert isinstance(extra_unittest_args, list)
         unittest_args.extend(extra_unittest_args)
 
-    is_cpp_test = test_file.startswith(CPP_TEST_PREFIX)
     # If using pytest, replace -f with equivalent -x
     if options.pytest:
         unittest_args.extend(
@@ -937,17 +944,20 @@
     pytest_args = [
         "-vv",
         "-rfEX",
-        "-p",
-        "no:xdist",
     ]
     if not is_cpp_test:
         # C++ tests need to be run with pytest directly, not via python
-        pytest_args.append("--use-pytest")
-    elif IS_CI:
-        # Add the option to generate XML test report here as C++ tests
-        # won't go into common_utils
-        test_report_path = get_report_path(pytest=True)
-        pytest_args.extend(["--junit-xml-reruns", test_report_path])
+        pytest_args.extend(["-p", "no:xdist", "--use-pytest"])
+    else:
+        # Use pytest-xdist to run C++ tests in parallel as running them sequentially using run_test
+        # is much slower than running them directly
+        pytest_args.extend(["-n", "auto"])
+
+        if IS_CI:
+            # Add the option to generate XML test report here as C++ tests
+            # won't go into common_utils
+            test_report_path = get_report_path(pytest=True)
+            pytest_args.extend(["--junit-xml-reruns", test_report_path])
 
     if options.pytest_k_expr:
         pytest_args.extend(["-k", options.pytest_k_expr])