first commit
This commit is contained in:
56
Check_Graph_Card.sh
Normal file
56
Check_Graph_Card.sh
Normal file
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env bash
|
||||
# 检查系统识别到的 GPU 卡数和驱动状态
|
||||
# Section 1: NVIDIA GPUs physically present on the PCIe bus.
# `-d 10de:` restricts lspci to PCI vendor ID 10de (NVIDIA).
# GPUs without display outputs (typical for data-center cards) enumerate as
# "3D controller" instead of "VGA compatible controller", so both class names
# are matched; other NVIDIA functions (e.g. HDMI audio) are still filtered out.
echo "======== PCIe 在位 =========="
lspci -d 10de: | grep -iE 'vga|3d controller'
echo
|
||||
# Section 2: number of GPUs the NVIDIA driver enumerates.
# `nvidia-smi --list-gpus` prints one "GPU <index>: ..." line per device.
# Only those lines are counted: a plain line count would also include any
# error text written to stdout (e.g. driver not loaded, or a device handle
# that cannot be read) and MIG sub-device entries, overstating the number
# of working cards. `grep -c` prints 0 (and exits 1) when nothing matches.
echo "======== 驱动认到几卡 ========"
nvidia-smi --list-gpus | grep -c '^GPU [0-9]'
echo
|
||||
# Section 3: recent kernel-log lines that hint at NVIDIA driver trouble.
# Example messages this filter is meant to surface:
#   nvidia: probe of 0000:04:06.0 failed with error -1
#   NVRM: The NVIDIA probe routine failed for 1 device(s).
# The case-insensitive pattern matches lines where "nvidia" is followed by
# "fail" or "error", "Xid" is followed by "79", or "GSP" is followed by
# "timeout"; only the last 10 matching lines are printed.
printf '%s\n' "======== dmesg 关键报错 ======"
dmesg \
  | grep -i -E 'nvidia.*fail|nvidia.*error|Xid.*79|GSP.*timeout' \
  | tail -n 10
|
||||
|
||||
# 当前信息记录:
|
||||
# Ubuntu中识别到的7张卡
|
||||
# 卡0:00000000:04:00.0
|
||||
# 卡1:00000000:04:02.0
|
||||
# 卡2:00000000:04:04.0
|
||||
# 坏卡-第一次:卡3:00000000:04:06.0
|
||||
# 卡4:00000000:05:00.0
|
||||
# 坏卡-第二次:卡5:00000000:05:02.0
|
||||
# 卡6:00000000:05:04.0
|
||||
# 卡7:00000000:05:06.0
|
||||
# ESXi服务器识别到的7张卡
|
||||
# 卡0:0000:16:00.0
|
||||
# 卡1:0000:38:00.0
|
||||
# 卡2:0000:49:00.0
|
||||
# 坏卡卡槽-第一次:卡3:0000:5a:00.0
|
||||
# 卡4:0000:98:00.0
|
||||
# 坏卡卡槽-第二次:卡5:0000:b8:00.0
|
||||
# 卡6:0000:c8:00.0
|
||||
# 卡7:0000:d8:00.0
|
||||
|
||||
# 解决方案尝试
|
||||
# V1.
|
||||
# 1. 关闭图形界面
|
||||
# sudo systemctl set-default multi-user.target
|
||||
# sudo systemctl isolate multi-user.target
|
||||
# 2. 立刻关 GSP 重新加载驱动
|
||||
# sudo modprobe -r nvidia_drm nvidia_modeset nvidia_uvm nvidia
|
||||
# sudo modprobe nvidia NVreg_EnableGpuFirmware=0
|
||||
# sudo nvidia-smi
|
||||
# 3. 若 8 卡出现 → 就是 GSP 问题,长期生效:
|
||||
# echo "options nvidia NVreg_EnableGpuFirmware=0" | sudo tee /etc/modprobe.d/nvidia-disable-gsp.conf
|
||||
# sudo update-initramfs -u
|
||||
|
||||
# V2.
|
||||
# 1. 看是不是 BAR 空间不足
|
||||
# sudo dmesg | grep -i "BAR 0\|resource 0" | grep 04:06.0 # TODO 变为对应的显卡号
|
||||
# [ 4.747643] pci 0000:04:06.0: BAR 0 [mem 0xea000000-0xeaffffff]
|
||||
# 内核已经成功为 04:06.0 分配了 BAR 0,大小 16 MiB,没有报 “can’t allocate” 或 “failed”,因此 BAR 空间不足/Above 4G Decoding 问题可以排除
|
||||
|
||||
# V3.
|
||||
# 彻底冷复位
|
||||
# 宿主机或 云控制台 → 断电 10 秒再上电
|
||||
#
|
||||
Reference in New Issue
Block a user