| name: linux-test |
| |
| on: |
| workflow_call: |
| inputs: |
| build-environment: |
| required: true |
| type: string |
| description: Top-level label for what's being built/tested. |
| test-matrix: |
| required: true |
| type: string |
| description: JSON description of what test configs to run. |
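| # For illustration only: the test matrix is expected to be a JSON object with an |
| # `include` list whose entries provide at least the fields used below |
| # (config, shard, num_shards, runner), for example: |
| # { "include": [{ "config": "default", "shard": 1, "num_shards": 2, "runner": "linux.4xlarge" }] } |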
| docker-image: |
| required: true |
| type: string |
| description: Docker image to run in. |
| sync-tag: |
| required: false |
| type: string |
| default: "" |
| description: | |
| If this is set, our linter will use this to make sure that every other |
| job with the same `sync-tag` is identical. |
| timeout-minutes: |
| required: false |
| type: number |
| default: 240 |
| description: | |
| Set the maximum time (in minutes) that the workflow is allowed to take before it is cancelled |
| use-gha: |
| required: false |
| type: string |
| default: "" |
| description: If set to any value, upload to GHA. Otherwise upload to S3. |
| |
| env: |
| GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} |
| |
| jobs: |
| test: |
| # Don't run on forked repos or when the test matrix is empty |
| if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' |
| strategy: |
| matrix: ${{ fromJSON(inputs.test-matrix) }} |
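| # fail-fast is disabled so that one failing shard does not cancel the other matrix jobs |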
| fail-fast: false |
| runs-on: ${{ matrix.runner }} |
| timeout-minutes: ${{ inputs.timeout-minutes }} |
| steps: |
| - name: Setup SSH (Click me for login details) |
| uses: pytorch/test-infra/.github/actions/setup-ssh@main |
| if: ${{ !contains(matrix.runner, 'gcp.a100') }} |
| with: |
| github-secret: ${{ secrets.GITHUB_TOKEN }} |
| instructions: | |
| All testing is done inside the container; to start an interactive session, run: |
| docker exec -it $(docker container ps --format '{{.ID}}') bash |
| |
| - name: Checkout PyTorch |
| uses: pytorch/pytorch/.github/actions/checkout-pytorch@main |
| |
| - name: Setup Linux |
| uses: ./.github/actions/setup-linux |
| |
| - name: Pull docker image |
| uses: pytorch/test-infra/.github/actions/pull-docker-image@main |
| with: |
| docker-image: ${{ inputs.docker-image }} |
| |
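| # Only needed for CUDA builds that actually exercise the GPU; 'nogpu' configs skip this step |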
| - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG |
| id: install-nvidia-driver |
| uses: pytorch/test-infra/.github/actions/setup-nvidia@main |
| if: contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') |
| |
| - name: Lock NVIDIA A100 40GB Frequency |
| run: | |
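| # Enable persistence mode and pin the application clocks (memory 1215 MHz, graphics 1410 MHz) to keep the A100 at a fixed frequency |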
| sudo nvidia-smi -pm 1 |
| sudo nvidia-smi -ac 1215,1410 |
| nvidia-smi |
| if: contains(matrix.runner, 'a100') |
| |
| - name: Start monitoring script |
| id: monitor-script |
| shell: bash |
| continue-on-error: true |
| run: | |
| python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 |
| python3 -m tools.stats.monitor > usage_log.txt 2>&1 & |
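| # Save the monitor's PID so the "Stop monitoring script" step below can kill it during cleanup |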
| echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" |
| |
| - name: Download build artifacts |
| uses: ./.github/actions/download-build-artifacts |
| with: |
| name: ${{ inputs.build-environment }} |
| |
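| # parse_ref.py exposes ref information (e.g. the `branch` output consumed by the Test step below) |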
| - name: Parse ref |
| id: parse-ref |
| run: .github/scripts/parse_ref.py |
| |
| - name: Check for keep-going label |
| # This uses the filter-test-configs action because it conveniently |
| # checks for labels. It does not actually do any filtering; all |
| # filtering is done in the build step. |
| id: keep-going |
| uses: ./.github/actions/filter-test-configs |
| with: |
| github-token: ${{ secrets.GITHUB_TOKEN }} |
| test-matrix: ${{ inputs.test-matrix }} |
| |
| - name: Set Test step time |
| id: test-timeout |
| shell: bash |
| env: |
| JOB_TIMEOUT: ${{ inputs.timeout-minutes }} |
| run: | |
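| # Give the Test step a timeout 30 minutes below the overall job timeout, leaving headroom for the post-test steps (artifact upload, teardown) |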
| echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}" |
| |
| - name: Test |
| id: test |
| timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }} |
| env: |
| BUILD_ENVIRONMENT: ${{ inputs.build-environment }} |
| PR_NUMBER: ${{ github.event.pull_request.number }} |
| BRANCH: ${{ steps.parse-ref.outputs.branch }} |
| SHA1: ${{ github.event.pull_request.head.sha || github.sha }} |
| BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }} |
| PYTORCH_RETRY_TEST_CASES: 1 |
| PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1 |
| TEST_CONFIG: ${{ matrix.config }} |
| SHARD_NUMBER: ${{ matrix.shard }} |
| NUM_TEST_SHARDS: ${{ matrix.num_shards }} |
| PR_BODY: ${{ github.event.pull_request.body }} |
| CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} |
| SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 |
| SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} |
| SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }} |
| DOCKER_IMAGE: ${{ inputs.docker-image }} |
| XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} |
| XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla |
| PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} |
| PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} |
| run: | |
| set -x |
| |
| if [[ $TEST_CONFIG == 'multigpu' ]]; then |
| TEST_COMMAND=.ci/pytorch/multigpu-test.sh |
| elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then |
| TEST_COMMAND=.ci/onnx/test.sh |
| else |
| TEST_COMMAND=.ci/pytorch/test.sh |
| fi |
| |
| COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-main}") |
| |
| # sanitize the input commit message and PR body here: |
| # |
| # trim all newlines from COMMIT_MESSAGES and PR_BODY to avoid issues with bash environment |
| # variable copying. see https://github.com/pytorch/pytorch/pull/80043#issuecomment-1167796028 |
| COMMIT_MESSAGES="${COMMIT_MESSAGES//[$'\n\r']}" |
| PR_BODY="${PR_BODY//[$'\n\r']}" |
| |
| # then strip special characters like single and double quotes so that unescaped inputs cannot |
| # wreak havoc internally |
| export COMMIT_MESSAGES="${COMMIT_MESSAGES//[\'\"]}" |
| export PR_BODY="${PR_BODY//[\'\"]}" |
| |
| # detached container should get cleaned up by teardown_ec2_linux |
| # TODO: Stop building test binaries as part of the build phase |
| # The shellcheck disable below is for the unquoted GPU_FLAG expansion, which doesn't play nice with quoting |
| # shellcheck disable=SC2086,SC2090 |
| container_name=$(docker run \ |
| ${GPU_FLAG:-} \ |
| -e BUILD_ENVIRONMENT \ |
| -e PR_NUMBER \ |
| -e GITHUB_ACTIONS \ |
| -e BASE_SHA \ |
| -e BRANCH \ |
| -e SHA1 \ |
| -e AWS_DEFAULT_REGION \ |
| -e IN_WHEEL_TEST \ |
| -e SHARD_NUMBER \ |
| -e TEST_CONFIG \ |
| -e NUM_TEST_SHARDS \ |
| -e PR_BODY \ |
| -e COMMIT_MESSAGES \ |
| -e CONTINUE_THROUGH_ERROR \ |
| -e PYTORCH_RETRY_TEST_CASES \ |
| -e PYTORCH_OVERRIDE_FLAKY_SIGNAL \ |
| -e PR_LABELS \ |
| -e MAX_JOBS="$(nproc --ignore=2)" \ |
| -e SCCACHE_BUCKET \ |
| -e SCCACHE_S3_KEY_PREFIX \ |
| -e XLA_CUDA \ |
| -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ |
| -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \ |
| -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ |
| -e SKIP_SCCACHE_INITIALIZATION=1 \ |
| --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ |
| --ulimit stack=10485760:83886080 \ |
| --security-opt seccomp=unconfined \ |
| --cap-add=SYS_PTRACE \ |
| --ipc=host \ |
| --shm-size="${SHM_SIZE}" \ |
| --tty \ |
| --detach \ |
| --name="${container_name}" \ |
| --user jenkins \ |
| -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ |
| -w /var/lib/jenkins/workspace \ |
| "${DOCKER_IMAGE}" |
| ) |
| # Propagate the download.pytorch.org IP address from the host's /etc/hosts into the container |
| grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts" |
| echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}" |
| docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}" |
| |
| - name: Print remaining test logs |
| shell: bash |
| if: always() && steps.test.conclusion |
| run: | |
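| # Best effort: the glob may match nothing, so '|| true' keeps this log dump from failing the step |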
| cat test/**/*.log || true |
| |
| - name: Get workflow job id |
| id: get-job-id |
| uses: ./.github/actions/get-workflow-job-id |
| if: always() |
| with: |
| github-token: ${{ secrets.GITHUB_TOKEN }} |
| |
| - name: Stop monitoring script |
| if: always() && steps.monitor-script.outputs.monitor-script-pid |
| shell: bash |
| continue-on-error: true |
| env: |
| MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }} |
| run: | |
| kill "$MONITOR_SCRIPT_PID" |
| |
| - name: Upload test artifacts |
| uses: ./.github/actions/upload-test-artifacts |
| if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped' |
| with: |
| file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} |
| use-gha: ${{ inputs.use-gha }} |
| |
| - name: Collect backtraces from coredumps (if any) |
| if: always() |
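| # For any core files found in the workspace, print a backtrace with gdb inside the test container |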
| run: | |
| # shellcheck disable=SC2156 |
| find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; |
| |
| - name: Store Core dumps on S3 |
| uses: seemethere/upload-artifact-s3@v5 |
| if: failure() |
| with: |
| name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} |
| retention-days: 14 |
| if-no-files-found: ignore |
| path: ./**/core.[1-9]* |
| |
| - name: Teardown Linux |
| uses: pytorch/test-infra/.github/actions/teardown-linux@main |
| if: always() |
| |
| # NB: We are currently seeing an intermittent GPU-related issue on G5 runners with |
| # A10G GPUs. When this happens, trying to reset the GPU as done in setup-nvidia does |
| # not seem to help. Here are some symptoms: |
| # * Calling nvidia-smi times out after 60 seconds |
| # * nvidia-smi fails with an "unable to determine the device handle for GPU ... |
| # unknown error" |
| # * Tests fail with a missing CUDA GPU error when initializing CUDA in PyTorch |
| # * Running docker with --gpus all fails with an error response from the daemon |
| # |
| # As both the root cause and the recovery path are unclear, let's take the runner out of |
| # service so that it doesn't get any more jobs |
| - name: Check NVIDIA driver installation step |
| if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped' |
| shell: bash |
| env: |
| RUNNER_WORKSPACE: ${{ runner.workspace }} |
| run: | |
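| # Allow commands to fail (set +e) so the script can check nvidia-smi's exit status itself |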
| set +e |
| set -x |
| |
| nvidia-smi |
| # NB: Surprisingly, the nvidia-smi command returns successfully with return code 0 even |
| # when the driver has already crashed, because it can still get the driver version |
| # and some basic information like the bus ID. However, the rest of the information |
| # is missing (ERR!), for example: |
| # |
| # +-----------------------------------------------------------------------------+ |
| # | NVIDIA-SMI 525.89.02 Driver Version: 525.89.02 CUDA Version: 12.0 | |
| # |-------------------------------+----------------------+----------------------+ |
| # | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | |
| # | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | |
| # | | | MIG M. | |
| # |===============================+======================+======================| |
| # | 0 ERR! Off | 00000000:00:1E.0 Off | ERR! | |
| # |ERR! ERR! ERR! ERR! / ERR! | 4184MiB / 23028MiB | ERR! Default | |
| # | | | ERR! | |
| # +-------------------------------+----------------------+----------------------+ |
| # |
| # +-----------------------------------------------------------------------------+ |
| # | Processes: | |
| # | GPU GI CI PID Type Process name GPU Memory | |
| # | ID ID Usage | |
| # |=============================================================================| |
| # +-----------------------------------------------------------------------------+ |
| # |
| # This should be reported as a failure instead, as Docker is then guaranteed to fail |
| # when it runs with --gpus all |
| # |
| # So, the correct check here is to query one of the missing pieces of info, like the |
| # GPU name, so that the command fails accordingly |
| nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0 |
| NVIDIA_SMI_STATUS=$? |
| |
| # These are the acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action |
| if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then |
| echo "NVIDIA driver installation has failed, shutting down the runner..." |
| .github/scripts/stop_runner_service.sh |
| fi |
| |
| # For runners with multiple GPUs, we also want to confirm that the number of GPUs is a |
| # power of 2, i.e. 1, 2, 4, or 8. This avoids flaky test issues when one GPU fails |
| # https://github.com/pytorch/test-infra/issues/4000 |
| GPU_COUNT=$(nvidia-smi --list-gpus | wc -l) |
| NVIDIA_SMI_STATUS=$? |
| |
| # These are the acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action |
| if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then |
| echo "NVIDIA driver installation has failed, shutting down the runner..." |
| .github/scripts/stop_runner_service.sh |
| fi |
| |
| # Check that the GPU count is a power of 2 (1, 2, 4, or 8) |
| if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then |
| echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..." |
| .github/scripts/stop_runner_service.sh |
| fi |