| name: linux-test |
| |
| on: |
| workflow_call: |
| inputs: |
| build-environment: |
| required: true |
| type: string |
| description: Top-level label for what's being built/tested. |
| test-matrix: |
| required: true |
| type: string |
| description: JSON description of what test configs to run. |
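| # For illustration only: the test matrix is expected to be a JSON object with an |
| # `include` list whose entries provide at least the fields used below |
| # (config, shard, num_shards, runner), for example: |
| # { "include": [{ "config": "default", "shard": 1, "num_shards": 2, "runner": "linux.4xlarge" }] } |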
| docker-image: |
| required: true |
| type: string |
| description: Docker image to run in. |
| sync-tag: |
| required: false |
| type: string |
| default: "" |
| description: | |
| If this is set, our linter will use this to make sure that every other |
| job with the same `sync-tag` is identical. |
| timeout-minutes: |
| required: false |
| type: number |
| default: 240 |
| description: | |
| Set the maximum time (in minutes) that the workflow is allowed to take before it is cancelled |
| use-gha: |
| required: false |
| type: string |
| default: "" |
| description: If set to any value, upload to GHA. Otherwise upload to S3. |
| |
| env: |
| GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} |
| |
| jobs: |
| test: |
| # Don't run on forked repos or when the test matrix is empty |
| if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' |
| strategy: |
| matrix: ${{ fromJSON(inputs.test-matrix) }} |
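| # fail-fast is disabled so that one failing shard does not cancel the other matrix jobs |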
| fail-fast: false |
| runs-on: ${{ matrix.runner }} |
| timeout-minutes: ${{ inputs.timeout-minutes }} |
| steps: |
| - name: Setup SSH (Click me for login details) |
| uses: pytorch/test-infra/.github/actions/setup-ssh@main |
| if: ${{ !contains(matrix.runner, 'gcp.a100') }} |
| with: |
| github-secret: ${{ secrets.GITHUB_TOKEN }} |
| instructions: | |
| All testing is done inside the container; to start an interactive session, run: |
| docker exec -it $(docker container ps --format '{{.ID}}') bash |
| |
| - name: Checkout PyTorch |
| uses: pytorch/pytorch/.github/actions/checkout-pytorch@main |
| |
| - name: Setup Linux |
| uses: ./.github/actions/setup-linux |
| |
| - name: Pull docker image |
| uses: pytorch/test-infra/.github/actions/pull-docker-image@main |
| with: |
| docker-image: ${{ inputs.docker-image }} |
| |
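| # Only needed for CUDA builds that actually exercise the GPU; 'nogpu' configs skip this step |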
| - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG |
| id: install-nvidia-driver |
| uses: pytorch/test-infra/.github/actions/setup-nvidia@main |
| if: contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') |
| |
| - name: Lock NVIDIA A100 40GB Frequency |
| run: | |
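| # Enable persistence mode and pin the application clocks (memory 1215 MHz, graphics 1410 MHz) to keep the A100 at a fixed frequency |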
| sudo nvidia-smi -pm 1 |
| sudo nvidia-smi -ac 1215,1410 |
| nvidia-smi |
| if: contains(matrix.runner, 'a100') |
| |
| - name: Start monitoring script |
| id: monitor-script |
| shell: bash |
| continue-on-error: true |
| run: | |
| python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 |
| python3 -m tools.stats.monitor > usage_log.txt 2>&1 & |
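| # Save the monitor's PID so the "Stop monitoring script" step below can kill it during cleanup |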
| echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" |
| |
| - name: Download build artifacts |
| uses: ./.github/actions/download-build-artifacts |
| with: |
| name: ${{ inputs.build-environment }} |
| |
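| # parse_ref.py exposes ref information (e.g. the `branch` output consumed by the Test step below) |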
| - name: Parse ref |
| id: parse-ref |
| run: .github/scripts/parse_ref.py |
| |
| - name: Check for keep-going label |
| # This uses the filter-test-configs action because it conveniently |
| # checks for labels. It does not actually do any filtering; all |
| # filtering is done in the build step. |
| id: keep-going |
| uses: ./.github/actions/filter-test-configs |
| with: |
| github-token: ${{ secrets.GITHUB_TOKEN }} |
| test-matrix: ${{ inputs.test-matrix }} |
| |
| - name: Set Test step time |
| id: test-timeout |
| shell: bash |
| env: |
| JOB_TIMEOUT: ${{ inputs.timeout-minutes }} |
| run: | |
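| # Give the Test step a timeout 30 minutes below the overall job timeout, leaving headroom for the post-test steps (artifact upload, teardown) |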
| echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}" |
| |
| - name: Test |
| id: test |
| timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }} |
| env: |
| BUILD_ENVIRONMENT: ${{ inputs.build-environment }} |
| PR_NUMBER: ${{ github.event.pull_request.number }} |
| BRANCH: ${{ steps.parse-ref.outputs.branch }} |
| SHA1: ${{ github.event.pull_request.head.sha || github.sha }} |
| BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }} |
| PYTORCH_RETRY_TEST_CASES: 1 |
| PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1 |
| TEST_CONFIG: ${{ matrix.config }} |
| SHARD_NUMBER: ${{ matrix.shard }} |
| NUM_TEST_SHARDS: ${{ matrix.num_shards }} |
| PR_BODY: ${{ github.event.pull_request.body }} |
| CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} |
| SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 |
| SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} |
| SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }} |
| DOCKER_IMAGE: ${{ inputs.docker-image }} |
| XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} |
| XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla |
| PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} |
| PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} |
| run: | |
| set -x |
| |
| if [[ $TEST_CONFIG == 'multigpu' ]]; then |
| TEST_COMMAND=.ci/pytorch/multigpu-test.sh |
| elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then |
| TEST_COMMAND=.ci/onnx/test.sh |
| else |
| TEST_COMMAND=.ci/pytorch/test.sh |
| fi |
| |
| COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-main}") |
| |
| # sanitize the input commit message and PR body here: |
| # |
| # trim all newlines from COMMIT_MESSAGES and PR_BODY to avoid issues with bash environment |
| # variable copying. see https://github.com/pytorch/pytorch/pull/80043#issuecomment-1167796028 |
| COMMIT_MESSAGES="${COMMIT_MESSAGES//[$'\n\r']}" |
| PR_BODY="${PR_BODY//[$'\n\r']}" |
| |
| # then strip special characters like single and double quotes so that unescaped inputs cannot |
| # wreak havoc internally |
| export COMMIT_MESSAGES="${COMMIT_MESSAGES//[\'\"]}" |
| export PR_BODY="${PR_BODY//[\'\"]}" |
| |
| # detached container should get cleaned up by teardown_ec2_linux |
| # TODO: Stop building test binaries as part of the build phase |
| # The shellcheck disable below is for the unquoted GPU_FLAG expansion, which doesn't play nice with quoting |
| # shellcheck disable=SC2086,SC2090 |
| container_name=$(docker run \ |
| ${GPU_FLAG:-} \ |
| -e BUILD_ENVIRONMENT \ |
| -e PR_NUMBER \ |
| -e GITHUB_ACTIONS \ |
| -e BASE_SHA \ |
| -e BRANCH \ |
| -e SHA1 \ |
| -e AWS_DEFAULT_REGION \ |
| -e IN_WHEEL_TEST \ |
| -e SHARD_NUMBER \ |
| -e TEST_CONFIG \ |
| -e NUM_TEST_SHARDS \ |
| -e PR_BODY \ |
| -e COMMIT_MESSAGES \ |
| -e CONTINUE_THROUGH_ERROR \ |
| -e PYTORCH_RETRY_TEST_CASES \ |
| -e PYTORCH_OVERRIDE_FLAKY_SIGNAL \ |
| -e PR_LABELS \ |
| -e MAX_JOBS="$(nproc --ignore=2)" \ |
| -e SCCACHE_BUCKET \ |
| -e SCCACHE_S3_KEY_PREFIX \ |
| -e XLA_CUDA \ |
| -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ |
| -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \ |
| -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ |
| -e SKIP_SCCACHE_INITIALIZATION=1 \ |
| --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ |
| --ulimit stack=10485760:83886080 \ |
| --security-opt seccomp=unconfined \ |
| --cap-add=SYS_PTRACE \ |
| --ipc=host \ |
| --shm-size="${SHM_SIZE}" \ |
| --tty \ |
| --detach \ |
| --name="${container_name}" \ |
| --user jenkins \ |
| -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ |
| -w /var/lib/jenkins/workspace \ |
| "${DOCKER_IMAGE}" |
| ) |
| # Propagate the download.pytorch.org IP address from the host's /etc/hosts into the container |
| grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts" |
| echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}" |
| docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}" |
| |
| - name: Print remaining test logs |
| shell: bash |
| if: always() && steps.test.conclusion |
| run: | |
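| # Best effort: the glob may match nothing, so '|| true' keeps this log dump from failing the step |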
| cat test/**/*.log || true |
| |
| - name: Get workflow job id |
| id: get-job-id |
| uses: ./.github/actions/get-workflow-job-id |
| if: always() |
| with: |
| github-token: ${{ secrets.GITHUB_TOKEN }} |
| |
| - name: Stop monitoring script |
| if: always() && steps.monitor-script.outputs.monitor-script-pid |
| shell: bash |
| continue-on-error: true |
| env: |
| MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }} |
| run: | |
| kill "$MONITOR_SCRIPT_PID" |
| |
| - name: Upload test artifacts |
| uses: ./.github/actions/upload-test-artifacts |
| if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped' |
| with: |
| file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} |
| use-gha: ${{ inputs.use-gha }} |
| |
| - name: Collect backtraces from coredumps (if any) |
| if: always() |
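| # For any core files found in the workspace, print a backtrace with gdb inside the test container |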
| run: | |
| # shellcheck disable=SC2156 |
| find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; |
| |
| - name: Store Core dumps on S3 |
| uses: seemethere/upload-artifact-s3@v5 |
| if: failure() |
| with: |
| name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} |
| retention-days: 14 |
| if-no-files-found: ignore |
| path: ./**/core.[1-9]* |
| |
| - name: Teardown Linux |
| uses: pytorch/test-infra/.github/actions/teardown-linux@main |
| if: always() |
| |
| # NB: We are currently seeing an intermittent GPU-related issue on G5 runners with |
| # A10G GPUs. When this happens, trying to reset the GPU as done in setup-nvidia does |
| # not seem to help. Here are some symptoms: |
| # * Calling nvidia-smi times out after 60 seconds |
| # * nvidia-smi fails with an "unable to determine the device handle for GPU ... |
| # unknown error" |
| # * Tests fail with a missing CUDA GPU error when initializing CUDA in PyTorch |
| # * Running docker with --gpus all fails with an error response from the daemon |
| # |
| # As both the root cause and the recovery path are unclear, let's take the runner out of |
| # service so that it doesn't get any more jobs |
| - name: Check NVIDIA driver installation step |
| if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped' |
| shell: bash |
| env: |
| RUNNER_WORKSPACE: ${{ runner.workspace }} |
| run: | |
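| # Allow commands to fail (set +e) so the script can check nvidia-smi's exit status itself |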
| set +e |
| set -x |
| |
| nvidia-smi |
| # NB: Surprisingly, the nvidia-smi command returns successfully with return code 0 even |
| # when the driver has already crashed, because it can still get the driver version |
| # and some basic information like the bus ID. However, the rest of the information |
| # is missing (ERR!), for example: |
| # |
| # +-----------------------------------------------------------------------------+ |
| # | NVIDIA-SMI 525.89.02 Driver Version: 525.89.02 CUDA Version: 12.0 | |
| # |-------------------------------+----------------------+----------------------+ |
| # | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | |
| # | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | |
| # | | | MIG M. | |
| # |===============================+======================+======================| |
| # | 0 ERR! Off | 00000000:00:1E.0 Off | ERR! | |
| # |ERR! ERR! ERR! ERR! / ERR! | 4184MiB / 23028MiB | ERR! Default | |
| # | | | ERR! | |
| # +-------------------------------+----------------------+----------------------+ |
| # |
| # +-----------------------------------------------------------------------------+ |
| # | Processes: | |
| # | GPU GI CI PID Type Process name GPU Memory | |
| # | ID ID Usage | |
| # |=============================================================================| |
| # +-----------------------------------------------------------------------------+ |
| # |
| # This should be reported as a failure instead, as Docker is then guaranteed to fail |
| # when it runs with --gpus all |
| # |
| # So, the correct check here is to query one of the missing pieces of info, like the |
| # GPU name, so that the command fails accordingly |
| nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0 |
| NVIDIA_SMI_STATUS=$? |
| |
| # These are the acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action |
| if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then |
| echo "NVIDIA driver installation has failed, shutting down the runner..." |
| .github/scripts/stop_runner_service.sh |
| fi |
| |
| # For runners with multiple GPUs, we also want to confirm that the number of GPUs is a |
| # power of 2, i.e. 1, 2, 4, or 8. This avoids flaky test issues when one GPU fails |
| # https://github.com/pytorch/test-infra/issues/4000 |
| GPU_COUNT=$(nvidia-smi --list-gpus | wc -l) |
| NVIDIA_SMI_STATUS=$? |
| |
| # These are the acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action |
| if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then |
| echo "NVIDIA driver installation has failed, shutting down the runner..." |
| .github/scripts/stop_runner_service.sh |
| fi |
| |
| # Check that the GPU count is a power of 2 (1, 2, 4, or 8) |
| if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then |
| echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..." |
| .github/scripts/stop_runner_service.sh |
| fi |