Add more debug information when installing NVIDIA driver (#88168)
This calls `lspci`, `lsmod`, and `modinfo nvidia` before and after the installation to gather more data about the transient "No GPU available" issue on G5 runners, e.g. https://hud.pytorch.org/pytorch/pytorch/commit/59fe272c1e698989228af5ad197bdd2985e4e9b9
This also checks the exit status of the `nvidia-smi` call and tries to re-install the driver if the first call fails, e.g. with `No devices were found`: https://hud.pytorch.org/pytorch/pytorch/commit/8ea19c802e38c061e79176360c1ecaa81ce2088a
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88168
Approved by: https://github.com/clee2000, https://github.com/malfet
diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh
index 79f5886..c5c96e0 100755
--- a/.github/scripts/install_nvidia_utils_linux.sh
+++ b/.github/scripts/install_nvidia_utils_linux.sh
@@ -26,18 +26,29 @@
# Purge any nvidia driver installed from RHEL repo
sudo yum remove -y nvidia-driver-latest-dkms
+ # Try to gather more information about the runner and its existing NVIDIA driver if any
+ echo "Before installing NVIDIA driver"
+ lspci
+ lsmod
+ modinfo nvidia || true
+
HAS_NVIDIA_DRIVER=0
# Check if NVIDIA driver has already been installed
if [ -x "$(command -v nvidia-smi)" ]; then
+ set +e
# The driver exists, check its version next
INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader)
+ NVIDIA_SMI_STATUS=$?
- if [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
+ if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
+ echo "Failed to get NVIDIA driver version ($INSTALLED_DRIVER_VERSION). Continuing"
+ elif [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
else
HAS_NVIDIA_DRIVER=1
echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
fi
+ set -e
fi
if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
@@ -51,17 +62,25 @@
sudo rm -fv /tmp/nvidia_driver
fi
+ sudo modprobe nvidia || true
+ echo "After installing NVIDIA driver"
+ lspci
+ lsmod
+ modinfo nvidia || true
+
(
set +e
nvidia-smi
- status=$?
+ NVIDIA_SMI_STATUS=$?
+
# Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
- if [ $status -eq 0 ] || [ $status -eq 14 ]; then
- echo "INFO: Ignoring allowed status ${status}"
+ if [ "$NVIDIA_SMI_STATUS" -eq 0 ] || [ "$NVIDIA_SMI_STATUS" -eq 14 ]; then
+ echo "INFO: Ignoring allowed status ${NVIDIA_SMI_STATUS}"
else
- echo "ERROR: nvidia-smi exited with unresolved status ${status}"
- exit ${status}
+ echo "ERROR: nvidia-smi exited with unresolved status ${NVIDIA_SMI_STATUS}"
+ exit ${NVIDIA_SMI_STATUS}
fi
+ set -e
)
)
}