#!/usr/bin/env bash
#
# Check how many NVIDIA GPUs the system sees and the state of the driver.
#
# Prints three sections:
#   1. NVIDIA display functions present on the PCIe bus (lspci)
#   2. number of GPUs the driver enumerated (nvidia-smi)
#   3. the most recent NVIDIA-related error lines from the kernel log (dmesg)
#
# Every section is best-effort: `set -e` / `pipefail` are deliberately not
# enabled, because grep exits 1 on "no match" and the remaining sections
# should still run.

# Reads `lspci` output on stdin and keeps only GPU display functions.
# NVIDIA cards may enumerate as "VGA compatible controller" or as
# "3D controller"; other functions of the same card (e.g. audio) are dropped.
filter_gpu_functions() {
  grep -iE 'vga|3d controller'
}

# Reads `nvidia-smi --list-gpus` output on stdin and prints the number of
# GPUs. Only lines that start with "GPU " are counted, so indented MIG
# sub-device lines and any error text are not mistaken for cards.
count_gpus() {
  grep -c '^GPU '
}

echo "======== PCIe 在位 =========="
lspci -d 10de: | filter_gpu_functions
echo

echo "======== 驱动认到几卡 ========"
nvidia-smi --list-gpus | count_gpus
echo

# Example of the kernel messages this section looks for:
#   nvidia: probe of 0000:04:06.0 failed with error -1
#   NVRM: The NVIDIA probe routine failed for 1 device(s).
# NOTE(review): on hosts with kernel.dmesg_restrict=1 an unprivileged dmesg
# prints nothing useful; run the script as root in that case.
echo "======== dmesg 关键报错 ======"
dmesg | grep -iE 'nvidia.*fail|nvidia.*error|Xid.*79|GSP.*timeout' | tail -10

# ---------------------------------------------------------------------------
# Recorded state
# ---------------------------------------------------------------------------
# 7 cards recognized inside Ubuntu (the list shows all 8 slots; the failing
# ones are marked):
#   card 0: 00000000:04:00.0
#   card 1: 00000000:04:02.0
#   card 2: 00000000:04:04.0
#   bad card (1st occurrence): card 3: 00000000:04:06.0
#   card 4: 00000000:05:00.0
#   bad card (2nd occurrence): card 5: 00000000:05:02.0
#   card 6: 00000000:05:04.0
#   card 7: 00000000:05:06.0
#
# 7 cards recognized by the ESXi host (same layout, host-side addresses):
#   card 0: 0000:16:00.0
#   card 1: 0000:38:00.0
#   card 2: 0000:49:00.0
#   bad slot (1st occurrence): card 3: 0000:5a:00.0
#   card 4: 0000:98:00.0
#   bad slot (2nd occurrence): card 5: 0000:b8:00.0
#   card 6: 0000:c8:00.0
#   card 7: 0000:d8:00.0

# ---------------------------------------------------------------------------
# Fixes attempted
# ---------------------------------------------------------------------------
# V1.
# 1. Turn off the graphical session
#    sudo systemctl set-default multi-user.target
#    sudo systemctl set-default multi-user.target
#    (the command appears twice in the original notes; the second one was
#    possibly meant to be `sudo systemctl isolate multi-user.target` - confirm)
# 2. Disable GSP right away and reload the driver
#    sudo modprobe -r nvidia_drm nvidia_modeset nvidia nvidia_uvm
#    sudo modprobe nvidia NVreg_EnableGpuFirmware=0
#    sudo nvidia-smi
# 3. If all 8 cards show up, GSP is the culprit; make the setting persistent:
#    echo "options nvidia NVreg_EnableGpuFirmware=0" | sudo tee /etc/modprobe.d/nvidia-disable-gsp.conf
#    sudo update-initramfs -u

# V2.
# 1. Check whether BAR space is exhausted
#    sudo dmesg | grep -i "BAR 0\|resource 0" | grep 04:06.0  # TODO: substitute the address of the failing card
#    [ 4.747643] pci 0000:04:06.0: BAR 0 [mem 0xea000000-0xeaffffff]
#    The kernel successfully assigned BAR 0 (16 MiB) to 04:06.0 and reported
#    neither "can't allocate" nor "failed", so BAR exhaustion / an
#    "Above 4G Decoding" problem can be ruled out.

# V3.
# Full cold reset:
#    from the host or the cloud console, power off for 10 seconds, then power
#    back on.