Add more debug information when installing NVIDIA driver (#88168)

This calls `lspci`, `lsmod`, and `modinfo nvidia` before and after the installation to gather more data about the transient "No GPU available" issue on G5 runners, e.g. https://hud.pytorch.org/pytorch/pytorch/commit/59fe272c1e698989228af5ad197bdd2985e4e9b9

This also handles a failing `nvidia-smi` call when checking the installed driver version and tries to re-install the driver if that first call fails, e.g. with `No devices were found`: https://hud.pytorch.org/pytorch/pytorch/commit/8ea19c802e38c061e79176360c1ecaa81ce2088a
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88168
Approved by: https://github.com/clee2000, https://github.com/malfet
diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh
index 79f5886..c5c96e0 100755
--- a/.github/scripts/install_nvidia_utils_linux.sh
+++ b/.github/scripts/install_nvidia_utils_linux.sh
@@ -26,18 +26,29 @@
         # Purge any nvidia driver installed from RHEL repo
         sudo yum remove -y nvidia-driver-latest-dkms
 
+        # Try to gather more information about the runner and its existing NVIDIA driver if any
+        echo "Before installing NVIDIA driver"
+        lspci
+        lsmod
+        modinfo nvidia || true
+
         HAS_NVIDIA_DRIVER=0
         # Check if NVIDIA driver has already been installed
         if [ -x "$(command -v nvidia-smi)" ]; then
+            set +e
             # The driver exists, check its version next
             INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader)
+            NVIDIA_SMI_STATUS=$?
 
-            if [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
+            if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
+                echo "Failed to get NVIDIA driver version ($INSTALLED_DRIVER_VERSION). Continuing"
+            elif [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
                 echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
             else
                 HAS_NVIDIA_DRIVER=1
                 echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
             fi
+            set -e
         fi
 
         if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
@@ -51,17 +62,25 @@
             sudo rm -fv /tmp/nvidia_driver
         fi
 
+        sudo modprobe nvidia || true
+        echo "After installing NVIDIA driver"
+        lspci
+        lsmod
+        modinfo nvidia || true
+
         (
             set +e
             nvidia-smi
-            status=$?
+            NVIDIA_SMI_STATUS=$?
+
             # Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
-            if [ $status -eq 0 ] || [ $status -eq 14 ]; then
-                echo "INFO: Ignoring allowed status ${status}"
+            if [ "$NVIDIA_SMI_STATUS" -eq 0 ] || [ "$NVIDIA_SMI_STATUS" -eq 14 ]; then
+                echo "INFO: Ignoring allowed status ${NVIDIA_SMI_STATUS}"
             else
-                echo "ERROR: nvidia-smi exited with unresolved status ${status}"
-                exit ${status}
+                echo "ERROR: nvidia-smi exited with unresolved status ${NVIDIA_SMI_STATUS}"
+                exit ${NVIDIA_SMI_STATUS}
             fi
+            set -e
         )
     )
 }