Files
Seg_Data_Server/Check_Graph_Card.sh
2026-05-20 15:05:35 +08:00

56 lines
1.9 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Report how many NVIDIA GPUs the system sees and whether the driver is healthy.
#
# Three independent checks are printed in order:
#   1. NVIDIA display-class devices present on the PCIe bus (lspci)
#   2. Number of GPUs the NVIDIA driver enumerated (nvidia-smi)
#   3. The most recent NVIDIA-related kernel errors (dmesg)
#
# `set -e` / `pipefail` are deliberately NOT enabled: this is a diagnostic
# script, and a failing or missing tool in one section must not prevent the
# remaining sections from running.

# Keep only the lspci lines (read from stdin) that describe GPU functions.
# Besides "VGA compatible controller", lspci also uses the class names
# "3D controller" and "Display controller" for display devices; matching
# only "vga" would hide GPUs reported under those classes. Other functions
# of the same card (e.g. its audio device) are filtered out.
filter_gpu_functions() {
  grep -iE 'vga|3d controller|display controller'
}

# Count GPU entries in `nvidia-smi --list-gpus` output (read from stdin).
# Only lines of the form "GPU <n>: ..." are counted, so diagnostic text
# printed in place of a GPU list, and indented MIG device lines, are not
# mistaken for GPUs. Always prints a number (0 when nothing matches).
count_listed_gpus() {
  grep -c '^GPU [0-9]'
}

echo "======== PCIe 在位 =========="
# 10de is the NVIDIA PCI vendor ID.
lspci -d 10de: | filter_gpu_functions
echo
echo "======== 驱动认到几卡 ========"
nvidia-smi --list-gpus | count_listed_gpus
echo
# Examples of the kernel messages this section is meant to surface:
#   nvidia: probe of 0000:04:06.0 failed with error -1
#   NVRM: The NVIDIA probe routine failed for 1 device(s).
echo "======== dmesg 关键报错 ======"
# NOTE(review): on systems that restrict the kernel log to root, this must be
# run with sudo or dmesg will print a permission error instead of messages.
dmesg | grep -iE 'nvidia.*fail|nvidia.*error|Xid.*79|GSP.*timeout' | tail -10
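# Current state on record:
# The 7 cards recognized in Ubuntu
# Card 0  00000000:04:00.0
# Card 1  00000000:04:02.0
# Card 2  00000000:04:04.0
# Bad card (1st occurrence) - card 3  00000000:04:06.0
# Card 4  00000000:05:00.0
# Bad card (2nd occurrence) - card 5  00000000:05:02.0
# Card 6  00000000:05:04.0
# Card 7  00000000:05:06.0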
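# The 7 cards recognized by the ESXi server
# Card 0  0000:16:00.0
# Card 1  0000:38:00.0
# Card 2  0000:49:00.0
# Bad card slot (1st occurrence) - card 3  0000:5a:00.0
# Card 4  0000:98:00.0
# Bad card slot (2nd occurrence) - card 5  0000:b8:00.0
# Card 6  0000:c8:00.0
# Card 7  0000:d8:00.0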
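# Attempted fixes
# V1.
# 1. Disable the graphical interface
# sudo systemctl set-default multi-user.target
# sudo systemctl isolate multi-user.target  # NOTE(review): the original repeated set-default here; isolate (applies without a reboot) is presumably what was meant - confirm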
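# 2. Immediately disable GSP and reload the driver
# (nvidia_uvm is unloaded first because it depends on the nvidia module)
# sudo modprobe -r nvidia_uvm nvidia_drm nvidia_modeset nvidia
# sudo modprobe nvidia NVreg_EnableGpuFirmware=0
# sudo nvidia-smi
# 3. If all 8 cards show up -> it is a GSP problem; to make the setting permanent:
# echo "options nvidia NVreg_EnableGpuFirmware=0" | sudo tee /etc/modprobe.d/nvidia-disable-gsp.conf
# sudo update-initramfs -u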
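# V2.
# 1. Check whether BAR space is insufficient
# sudo dmesg | grep -i "BAR 0\|resource 0" | grep 04:06.0 # TODO: replace with the bus ID of the affected card
# [ 4.747643] pci 0000:04:06.0: BAR 0 [mem 0xea000000-0xeaffffff]
# The kernel successfully assigned BAR 0 (16 MiB) to 04:06.0 and reported neither "can't allocate" nor "failed", so insufficient BAR space / an Above 4G Decoding problem can be ruled out.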
# V3.
# Full cold reset
# On the host or in the cloud console -> power off for 10 seconds, then power back on
#