Clean up test-running scripts (#65508)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/65508
This makes some miscellaneous cleanups to the code that runs before `run_test.py`:
* remove the hardcoded 2-shard split: `test_python_shard1`/`test_python_shard2` collapse into a single `test_python_shard` that takes the shard index and reads the shard count from `NUM_TEST_SHARDS` (a condensed sketch follows this list)
* add `set -x` (and `set -ex` in `test.sh`) in some places so the executed commands show up in the logs
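
For reference, here is the consolidated shard runner in condensed form (a sketch of the new `test_python_shard` from the `.jenkins/pytorch/test.sh` diff below; the comments are added here for illustration):

```bash
# Run one Python test shard. The shard count is no longer hardcoded to 2:
# it comes from NUM_TEST_SHARDS, which CI exports (the ROCm path now sets
# NUM_TEST_SHARDS=2 manually because Jenkins doesn't provide it).
test_python_shard() {
  if [[ -z "$NUM_TEST_SHARDS" ]]; then
    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
    exit 1
  fi
  # "$1" is the 1-based shard index; run_test.py's --shard takes the index
  # and the total count and runs only that slice of the test files.
  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests \
    --shard "$1" "$NUM_TEST_SHARDS" --verbose --determine-from="$DETERMINE_FROM"
  assert_git_not_dirty
}

test_python_shard 1   # formerly test_python_shard1
test_python_shard 2   # formerly test_python_shard2
```

On the logging side, `set -x` makes bash echo each command before running it and `set -e` aborts on the first failing command, which is why the generated workflow steps gain `set -x` and `test.sh` gains `set -ex` near its top.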
Test Plan: Imported from OSS
Reviewed By: seemethere
Differential Revision: D31296509
Pulled By: driazati
fbshipit-source-id: 2df1463432846d8a4d8a579812a4e9c3b7c2b957
diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2
index 8cb5ca1..063cae0 100644
--- a/.github/templates/linux_ci_workflow.yml.j2
+++ b/.github/templates/linux_ci_workflow.yml.j2
@@ -223,6 +223,8 @@
# Time out the test phase after !{{ timeout_after }} minutes
timeout-minutes: !{{ timeout_after }}
run: |
+ set -x
+
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
index 7cf1de0..fcaac84 100644
--- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
+++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml
@@ -395,6 +395,8 @@
# Time out the test phase after 240 minutes
timeout-minutes: 240
run: |
+ set -x
+
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-bionic-py3.6-clang9.yml b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml
index fe066df..0f30322 100644
--- a/.github/workflows/generated-linux-bionic-py3.6-clang9.yml
+++ b/.github/workflows/generated-linux-bionic-py3.6-clang9.yml
@@ -395,6 +395,8 @@
# Time out the test phase after 240 minutes
timeout-minutes: 240
run: |
+ set -x
+
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml b/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml
index 4cc9409..302e552 100644
--- a/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml
+++ b/.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml
@@ -395,6 +395,8 @@
# Time out the test phase after 240 minutes
timeout-minutes: 240
run: |
+ set -x
+
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
index 2a7ae28..608482a 100644
--- a/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml
@@ -395,6 +395,8 @@
# Time out the test phase after 240 minutes
timeout-minutes: 240
run: |
+ set -x
+
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
index 3684b28..3b01b79 100644
--- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
+++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml
@@ -395,6 +395,8 @@
# Time out the test phase after 240 minutes
timeout-minutes: 240
run: |
+ set -x
+
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml b/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml
index bb70853..378f6d2 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml
@@ -395,6 +395,8 @@
# Time out the test phase after 240 minutes
timeout-minutes: 240
run: |
+ set -x
+
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml b/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml
index aa62f34..7ad7609 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml
@@ -395,6 +395,8 @@
# Time out the test phase after 240 minutes
timeout-minutes: 240
run: |
+ set -x
+
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
index 8718b16..1579c91 100644
--- a/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml
@@ -395,6 +395,8 @@
# Time out the test phase after 240 minutes
timeout-minutes: 240
run: |
+ set -x
+
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml
index b01e710..c8fa80b 100644
--- a/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml
+++ b/.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml
@@ -395,6 +395,8 @@
# Time out the test phase after 240 minutes
timeout-minutes: 240
run: |
+ set -x
+
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml
index 1a0c122..7d54cb6 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml
@@ -393,6 +393,8 @@
# Time out the test phase after 360 minutes
timeout-minutes: 360
run: |
+ set -x
+
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
index c60a775..532772d 100644
--- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
+++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml
@@ -393,6 +393,8 @@
# Time out the test phase after 240 minutes
timeout-minutes: 240
run: |
+ set -x
+
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index af6755c..2f12955 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -4,12 +4,11 @@
# (This is set by default in the Docker images we build, so you don't
# need to set it yourself.)
+set -ex
+
# shellcheck disable=SC2034
COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}"
-# Get fully qualified path using realpath
-CUSTOM_TEST_ARTIFACT_BUILD_DIR=$(realpath "${CUSTOM_TEST_ARTIFACT_BUILD_DIR:-${PWD}/../}")
-
TORCH_INSTALL_DIR=$(python -c "import site; print(site.getsitepackages()[0])")/torch
TORCH_BIN_DIR="$TORCH_INSTALL_DIR"/bin
TORCH_LIB_DIR="$TORCH_INSTALL_DIR"/lib
@@ -24,6 +23,12 @@
BUILD_ENVIRONMENT="${BUILD_ENVIRONMENT}-${TEST_CONFIG}"
fi
+# Get fully qualified path using realpath
+if [[ "$BUILD_ENVIRONMENT" != *bazel* ]]; then
+ CUSTOM_TEST_ARTIFACT_BUILD_DIR=$(realpath "${CUSTOM_TEST_ARTIFACT_BUILD_DIR:-${PWD}/../}")
+fi
+
+
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
@@ -69,6 +74,9 @@
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
# Print GPU info
rocminfo | grep -E 'Name:.*\sgfx|Marketing'
+
+ # Manually set NUM_TEST_SHARDS since Jenkins doesn't do it
+ export NUM_TEST_SHARDS=2
fi
# --user breaks ppc64le builds and these packages are already in ppc64le docker
@@ -154,13 +162,12 @@
assert_git_not_dirty
}
-test_python_shard1() {
- time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --shard 1 2 --verbose --determine-from="$DETERMINE_FROM"
- assert_git_not_dirty
-}
-
-test_python_shard2() {
- time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --shard 2 2 --verbose --determine-from="$DETERMINE_FROM"
+test_python_shard() {
+ if [[ -z "$NUM_TEST_SHARDS" ]]; then
+ echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
+ exit 1
+ fi
+ time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS" --verbose --determine-from="$DETERMINE_FROM"
assert_git_not_dirty
}
@@ -415,7 +422,7 @@
assert_git_not_dirty
}
-# Do NOT run this test before any other tests, like test_python_shard1, etc.
+# Do NOT run this test before any other tests, like test_python_shard, etc.
# Because this function uninstalls the torch built from the branch and installs the
# nightly version.
test_backward_compatibility() {
@@ -521,11 +528,11 @@
fi
test_without_numpy
install_torchvision
- test_python_shard1
+ test_python_shard 1
test_aten
elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 || ("${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1) ]]; then
install_torchvision
- test_python_shard2
+ test_python_shard 2
test_libtorch
test_aot_compilation
test_custom_script_ops