Clean up test-running scripts (#65508)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/65508

This includes some miscellaneous cleanups for the code that runs before `run_test.py`:

* remove the hardcoded 2-shard assumption: `test.sh` now has a single `test_python_shard` function that takes the shard number as an argument and reads the total from `NUM_TEST_SHARDS`
* add `set -x` / `set -ex` in some places so test commands are echoed (and, in `test.sh`, failures stop the script early)
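
To illustrate the sharding change, here is a minimal standalone sketch of the pattern now used in `.jenkins/pytorch/test.sh`: one parameterized function reads the shard count from the environment instead of two separate functions hardcoding `2`. The wrapper name `run_python_shard` and the trimmed-down flags are illustrative only, not the exact script:

```bash
#!/usr/bin/env bash
set -eux

# Single parameterized shard runner instead of test_python_shard1/test_python_shard2.
# $1 is the shard to run; the total shard count comes from the CI environment.
run_python_shard() {
  if [[ -z "${NUM_TEST_SHARDS:-}" ]]; then
    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
    exit 1
  fi
  python test/run_test.py --shard "$1" "$NUM_TEST_SHARDS" --verbose
}

# The CI workflow (or, on Jenkins ROCm jobs, test.sh itself) exports NUM_TEST_SHARDS, e.g.:
export NUM_TEST_SHARDS=2
run_python_shard 1   # run the first of the two shards
```

With this shape, changing the number of shards only requires changing `NUM_TEST_SHARDS` in the workflow config rather than editing the shell script.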

Test Plan: Imported from OSS

Reviewed By: seemethere

Differential Revision: D31296509

Pulled By: driazati

fbshipit-source-id: 2df1463432846d8a4d8a579812a4e9c3b7c2b957
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index 8cb5ca1..063cae0 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -223,6 +223,8 @@
         # Time out the test phase after !{{ timeout_after }} minutes
         timeout-minutes: !{{ timeout_after }}
         run: |
+          set -x
+
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
           elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index 7cf1de0..fcaac84 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -395,6 +395,8 @@
         # Time out the test phase after 240 minutes
         timeout-minutes: 240
         run: |
+          set -x
+
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
           elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-bionic-py3.6-clang9.yml b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml
index fe066df..0f30322 100644
--- a/.github/workflows/generated-linux-bionic-py3.6-clang9.yml
+++ b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml
@@ -395,6 +395,8 @@
         # Time out the test phase after 240 minutes
         timeout-minutes: 240
         run: |
+          set -x
+
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
           elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml b/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml
index 4cc9409..302e552 100644
--- a/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml
+++ b/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml
@@ -395,6 +395,8 @@
         # Time out the test phase after 240 minutes
         timeout-minutes: 240
         run: |
+          set -x
+
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
           elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 2a7ae28..608482a 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -395,6 +395,8 @@
         # Time out the test phase after 240 minutes
         timeout-minutes: 240
         run: |
+          set -x
+
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
           elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 3684b28..3b01b79 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -395,6 +395,8 @@
         # Time out the test phase after 240 minutes
         timeout-minutes: 240
         run: |
+          set -x
+
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
           elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml b/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml
index bb70853..378f6d2 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml
@@ -395,6 +395,8 @@
         # Time out the test phase after 240 minutes
         timeout-minutes: 240
         run: |
+          set -x
+
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
           elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml b/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml
index aa62f34..7ad7609 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml
@@ -395,6 +395,8 @@
         # Time out the test phase after 240 minutes
         timeout-minutes: 240
         run: |
+          set -x
+
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
           elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index 8718b16..1579c91 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -395,6 +395,8 @@
         # Time out the test phase after 240 minutes
         timeout-minutes: 240
         run: |
+          set -x
+
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
           elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml
index b01e710..c8fa80b 100644
--- a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml
@@ -395,6 +395,8 @@
         # Time out the test phase after 240 minutes
         timeout-minutes: 240
         run: |
+          set -x
+
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
           elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml
index 1a0c122..7d54cb6 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml
@@ -393,6 +393,8 @@
         # Time out the test phase after 360 minutes
         timeout-minutes: 360
         run: |
+          set -x
+
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
           elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index c60a775..532772d 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -393,6 +393,8 @@
         # Time out the test phase after 240 minutes
         timeout-minutes: 240
         run: |
+          set -x
+
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
             TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
           elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index af6755c..2f12955 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -4,12 +4,11 @@
 # (This is set by default in the Docker images we build, so you don't
 # need to set it yourself.
 
+set -ex
+
 # shellcheck disable=SC2034
 COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}"
 
-# Get fully qualified path using realpath
-CUSTOM_TEST_ARTIFACT_BUILD_DIR=$(realpath "${CUSTOM_TEST_ARTIFACT_BUILD_DIR:-${PWD}/../}")
-
 TORCH_INSTALL_DIR=$(python -c "import site; print(site.getsitepackages()[0])")/torch
 TORCH_BIN_DIR="$TORCH_INSTALL_DIR"/bin
 TORCH_LIB_DIR="$TORCH_INSTALL_DIR"/lib
@@ -24,6 +23,12 @@
     BUILD_ENVIRONMENT="${BUILD_ENVIRONMENT}-${TEST_CONFIG}"
 fi
 
+# Get fully qualified path using realpath
+if [[ "$BUILD_ENVIRONMENT" != *bazel* ]]; then
+  CUSTOM_TEST_ARTIFACT_BUILD_DIR=$(realpath "${CUSTOM_TEST_ARTIFACT_BUILD_DIR:-${PWD}/../}")
+fi
+
+
 # shellcheck source=./common.sh
 source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
 
@@ -69,6 +74,9 @@
 if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
   # Print GPU info
   rocminfo | grep -E 'Name:.*\sgfx|Marketing'
+
+  # Manually set NUM_TEST_SHARDS since Jenkins doesn't do it
+  export NUM_TEST_SHARDS=2
 fi
 
 # --user breaks ppc64le builds and these packages are already in ppc64le docker
@@ -154,13 +162,12 @@
   assert_git_not_dirty
 }
 
-test_python_shard1() {
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --shard 1 2 --verbose --determine-from="$DETERMINE_FROM"
-  assert_git_not_dirty
-}
-
-test_python_shard2() {
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --shard 2 2 --verbose --determine-from="$DETERMINE_FROM"
+test_python_shard() {
+  if [[ -z "$NUM_TEST_SHARDS" ]]; then
+    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
+    exit 1
+  fi
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS" --verbose --determine-from="$DETERMINE_FROM"
   assert_git_not_dirty
 }
 
@@ -415,7 +422,7 @@
   assert_git_not_dirty
 }
 
-# Do NOT run this test before any other tests, like test_python_shard1, etc.
+# Do NOT run this test before any other tests, like test_python_shard, etc.
 # Because this function uninstalls the torch built from branch, and install
 # nightly version.
 test_backward_compatibility() {
@@ -521,11 +528,11 @@
   fi
   test_without_numpy
   install_torchvision
-  test_python_shard1
+  test_python_shard 1
   test_aten
 elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 || ("${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1) ]]; then
   install_torchvision
-  test_python_shard2
+  test_python_shard 2
   test_libtorch
   test_aot_compilation
   test_custom_script_ops