openStack配置GPU直通(zed)
//
# openStack配置GPU直通(zed)
zed集群部署方式这里不做描述,通过官方文档步骤部署
# 系统检查(检查支持gpu环境)
系统描述
OpenStack 版本:Zed;系统:Ubuntu 22.04.4 LTS;CPU:Intel 10400;显卡:GTX 1050 Ti。下面查看是否支持 iommu:
通过 dmesg 查看 CPU、GPU 是否支持虚拟化:
dmesg | grep -i "iommu"
查看内核iommu: cat /proc/cmdline
查看显卡:lspci -nn | grep -i nvidia
注意:dmesg 日志会滚动覆盖,其存储文件 /var/log/dmesg 也是临时的,所以可能会看不到相关输出。
# 内核配置grub 支持iommu
root@controller:~# grep -Ev "^$|#" /etc/default/grub
GRUB_DEFAULT=0
GRUB_TIMEOUT_STYLE=hidden
GRUB_TIMEOUT=0
GRUB_DISTRIBUTOR=`lsb_release -i -s 2> /dev/null || echo Debian`
GRUB_CMDLINE_LINUX="intel_iommu=on crashkernel=auto rhgb quiet"
# Intel CPU 通过 intel_iommu=on 开启 IOMMU;amd_iommu=on 仅针对 AMD CPU
GRUB_CMDLINE_LINUX_DEFAULT="loglevel=3 amd_iommu=on"
1
2
3
4
5
6
7
8
9
2
3
4
5
6
7
8
9
- 更新grub
grub-mkconfig -o /boot/grub/grub.cfg
如果系统是以 EFI 方式安装的,则使用如下命令更新 grub:
grub-mkconfig -o /boot/efi/EFI/ubuntu/grub.cfg
reboot
检查
记住 lspci 输出中显卡及其音频设备的"厂商ID:产品ID",后面的配置文件需要用到:[10de:1c82] [10de:0fb9]
root@controller:~# cat /proc/cmdline
BOOT_IMAGE=/vmlinuz-5.15.0-112-generic root=/dev/mapper/ubuntu--vg-ubuntu--lv ro intel_iommu=on crashkernel=auto rhgb quiet loglevel=3 amd_iommu=on
yfk@controller:~# lspci -nn | grep -i nvidia
01:00.0 VGA compatible controller [0300]: NVIDIA Corporation GP107 [GeForce GTX 1050 Ti] [10de:1c82] (rev a1)
01:00.1 Audio device [0403]: NVIDIA Corporation GP107GL High Definition Audio Controller [10de:0fb9] (rev a1)
yfk@controller:~# dmesg | grep -i "iommu"
1
2
3
4
5
6
7
8
9
10
2
3
4
5
6
7
8
9
10
# 配置vfio模块
- modprobe配置(新增)
cat >>/etc/modprobe.d/blacklist.conf<<EOF
blacklist nouveau
options nouveau modeset=0
blacklist xhci_hcd
blacklist nvidia
blacklist snd_hda_intel
blacklist nvidiafb
EOF
1
2
3
4
5
6
7
8
2
3
4
5
6
7
8
echo 'options vfio-pci ids=10de:1c82,10de:0fb9' >> /etc/modprobe.d/vfio.conf
- 配置系统加载模块(新增)
cat >/etc/modules-load.d/openstack-gpu.conf<<EOF
vfio_pci
pci_stub
vfio
vfio_iommu_type1
kvm
kvm_intel
EOF
1
2
3
4
5
6
7
8
2
3
4
5
6
7
8
重启系统: reboot
#wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
#dpkg -i cuda-keyring_1.1-1_all.deb
#apt install nvidia-cuda-toolkit
#apt-get install -y cuda-drivers-fabricmanager-535
1
2
3
4
2
3
4
# 挂载VFIO驱动
echo 10de 1c82 > /sys/bus/pci/drivers/vfio-pci/new_id
echo 10de 0fb9 > /sys/bus/pci/drivers/vfio-pci/new_id
lspci -nnk -d 10de:1c82
lspci -nnk -d 10de:0fb9
1
2
3
4
5
2
3
4
5
# openstack配置gpu配置文件
# controller控制节点
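- nova-scheduler 新增(注意:Zed 版本的 nova 中,调度过滤器通过 [filter_scheduler] 段的 available_filters / enabled_filters 配置,且 RamFilter 已被移除;以下旧式选项名请对照所用版本的官方配置参考核对)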
yfk@controller:~# vim /etc/nova/nova.conf
[scheduler]
scheduler_driver=nova.scheduler.filter_scheduler.FilterScheduler
scheduler_available_filters=nova.scheduler.filters.all_filters
scheduler_available_filters=nova.scheduler.filters.pci_passthrough_filter.PciPassthroughFilter
scheduler_default_filters=RamFilter,ComputeFilter,AvailabilityZoneFilter,ComputeCapabilitiesFilter,ImagePropertiesFilter,PciPassthroughFilter
service nova-scheduler restart
1
2
3
4
5
6
7
8
2
3
4
5
6
7
8
- nova-api 新增
yfk@controller:~# vim /etc/nova/nova.conf
[pci]
alias = { "name": "nvidia1050vga", "product_id": "1c82", "vendor_id": "10de", "device_type": "type-PCI" }
alias = { "name": "nvidia1050aud", "product_id": "0fb9", "vendor_id": "10de", "device_type": "type-PCI" }
service nova-api restart
1
2
3
4
5
6
2
3
4
5
6
# compute计算节点
- nova-compute 新增
注意和上面 lspci 获取的 id 对应
yfk@controller:~# vim /etc/nova/nova.conf
[pci]
alias = { "name": "nvidia1050vga", "product_id": "1c82", "vendor_id": "10de", "device_type": "type-PCI" }
alias = { "name": "nvidia1050aud", "product_id": "0fb9", "vendor_id": "10de", "device_type": "type-PCI" }
passthrough_whitelist = [{ "vendor_id": "10de", "product_id": "1c82" },{ "vendor_id": "10de", "product_id": "0fb9" }]
service nova-compute restart
1
2
3
4
5
6
7
2
3
4
5
6
7
# 配置实例类型
- 创建实例类型
openstack flavor create C2-2G-5G-gpu --ram 2048 --disk 5 --vcpus 2 --public
openstack flavor set C2-2G-5G --property "pci_passthrough:alias"="nvidia1050vga:1,nvidia1050aud:1"
openstack flavor set C2-2G-5G-gpu --property "pci_passthrough:alias"="nvidia1050aud:1"
openstack flavor create --id c2-4G-20G-gpu1050 --vcpus 4 --ram 4096 --disk 20 --property "pci_passthrough:alias"="nvidia1050vga:1,nvidia1050aud:1" gpu1.small
openstack flavor list
1
2
3
4
5
6
7
8
2
3
4
5
6
7
8
- 镜像处理,对显卡驱动隐藏hypervisor id
openstack image list
IMG_UUID=$(openstack image list |grep ubuntu2204 |awk '{print $2}')
openstack image set ${IMG_UUID} --property img_hide_hypervisor_id=true
1
2
3
2
3
- 创建实例
openstack server create --flavor C2-2G-5G-gpu --image ubuntu2204 --security-group 49d45943-a9f8-4b87-ba40-b9ff4340ab57 --nic net-id=977b2ea4-d178-41fd-999c-5e0472b71bf2 Ubuntu-2204GPU-22
1
2
2
--flavor 实例类型
--image 镜像名
--nic 指定网络id(net-id)
--security-group 指定安全组
# ubuntu2204登录初始化密码
#!/bin/bash
echo 'root:asdfqwer1234' | chpasswd
echo "UseDNS no" >> /etc/ssh/sshd_config
echo "PermitRootLogin yes" >> /etc/ssh/sshd_config
echo "PubkeyAcceptedAlgorithms=+ssh-rsa" >> /etc/ssh/sshd_config
sed -i 's#PasswordAuthentication no#PasswordAuthentication yes#g' /etc/ssh/sshd_config.d/60-cloudimg-settings.conf
service ssh restart
1
2
3
4
5
6
7
8
2
3
4
5
6
7
8
ssh 用户密码:root/asdfqwer1234
//
如果此文章对您有帮助,点击 -->> 请博主喝咖啡 (opens new window)
上次更新: 2024/11/20, 10:16:50