Rex-GPU
view release on metacpan or search on metacpan
lib/Rex/GPU/NVIDIA.pm view on Meta::CPAN
if ($major >= 10) {
push @packages, "kmod-nvidia-open-dkms", "nvidia-driver", "nvidia-driver-cuda";
}
else {
run "dnf module enable nvidia-driver:open-dkms -y 2>/dev/null || true", auto_die => 0;
push @packages, "nvidia-open";
}
Rex::Logger::info(" Installing: " . join(", ", @packages));
# Use run() directly: Rex::Pkg::Dnf fails when dnf exits non-zero due to
# DKMS post-install scripts (kernel module build). Verify via rpm -q instead.
my $pkg_str = join(" ", @packages);
run "dnf install -y $pkg_str", auto_die => 0;
my $check = run "rpm -q nvidia-driver 2>&1", auto_die => 0;
die "nvidia-driver not installed after dnf install â check dnf output\n"
if $? != 0;
}
# Return the major release number of a RHEL-family host (e.g. 10 for "10.1").
#
# Rex::Commands::Gather::operating_system_version() strips dots, so "10.1"
# becomes "101" and the major component can no longer be told apart from the
# minor one. The raw string from operating_system_release() is used instead
# and only its leading run of digits is kept.
#
# Returns: the major version as a number, or 0 when the release string is
# undefined or does not begin with a digit.
sub _rhel_major_version {
    my $release = Rex::Commands::Gather::operating_system_release();
    return 0 unless defined $release;

    # Capture in list context: a failed match then yields undef instead of
    # silently reusing a stale $1 left over from an earlier successful match.
    my ($major) = $release =~ /^(\d+)/;

    return defined $major ? $major + 0 : 0;
}
# ============================================================
# openSUSE Leap
# ============================================================
# Install the NVIDIA open kernel driver on openSUSE Leap from the NVIDIA GFX
# repository.
#
# Parameters:
#   $os, $running_kernel - accepted from the caller but not used by this routine.
#
# Steps: remove stale NVIDIA packages, (re)add the "nvidia-gfx" repo for the
# detected Leap release, install the signed kmp meta package, verify it with
# rpm -q, then lock the standalone libnvidia-ml / libnvidia-cfg packages.
#
# Dies if the meta package is not installed after zypper has run.
sub _install_driver_suse {
    my ($os, $running_kernel) = @_;

    # operating_system_version() strips dots ("15.6" -> "156"), so comparing
    # that value against 16 would classify every Leap 15.x host as Leap 16.
    # Parse major/minor from the raw release string instead.
    my $release = Rex::Commands::Gather::operating_system_release();
    $release = '' unless defined $release;
    my ($major, $minor) = $release =~ /^(\d+)(?:\.(\d+))?/;
    $major = 0 unless defined $major;
    $minor = 0 unless defined $minor;
    my $leap_version = "$major.$minor";    # e.g. "15.6"

    # Remove any stale NVIDIA packages first - avoids kmp/userspace version
    # mismatch caused by libnvidia-ml/libnvidia-cfg from the standard OSS
    # non-free repo lagging behind the NVIDIA GFX repo packages.
    Rex::Logger::info(" Removing any existing NVIDIA packages...");
    run(q{rpm -e $(rpm -qa | grep -E '^(nvidia|libnvidia)' | grep -v 'container') 2>/dev/null || true},
        auto_die => 0);

    # Add NVIDIA GFX repo (use direct baseurls - zypper cannot parse yum
    # .repo files). Leap 16 and later use the fixed 16.0 repository path.
    my ($repo_version, $repo_label);
    if ($major >= 16) {
        $repo_version = "16.0";
        $repo_label   = "suse16";
    }
    else {
        $repo_version = $leap_version;
        $repo_label   = "opensuse15, Leap $leap_version";
    }
    Rex::Logger::info(" Adding NVIDIA GFX repo ($repo_label)...");
    run("zypper rr nvidia-gfx 2>/dev/null || true", auto_die => 0);
    run("zypper addrepo --refresh https://download.nvidia.com/opensuse/leap/$repo_version/ nvidia-gfx 2>/dev/null",
        auto_die => 0);
    run("zypper --gpg-auto-import-keys refresh nvidia-gfx 2>/dev/null", auto_die => 0);

    # Use the meta package - it co-installs kmp-default + userspace at the
    # same version, preventing the split that causes "Driver/library version
    # mismatch" with nvidia-smi. Pre-signed kmp packages don't need
    # kernel-devel/headers.
    my $meta_pkg = $major >= 16
        ? "nvidia-open-driver-G07-signed-kmp-meta"
        : "nvidia-open-driver-G06-signed-kmp-meta";
    Rex::Logger::info(" Installing $meta_pkg...");
    run("zypper install -y $meta_pkg", auto_die => 0);

    # zypper's own exit status is ignored above; confirm the result via rpm
    # so a failed install is reported here rather than at first GPU use.
    run("rpm -q $meta_pkg 2>&1", auto_die => 0);
    die "$meta_pkg not installed after zypper install - check zypper output\n"
        if $? != 0;

    # Lock the OSS non-free standalone packages so future zypper updates don't
    # pull in a stale libnvidia-ml / libnvidia-cfg and cause a mismatch again.
    run("zypper addlock libnvidia-ml libnvidia-cfg 2>/dev/null || true", auto_die => 0);
}
# ============================================================
# Nouveau blacklisting
# ============================================================
# Write a modprobe drop-in that blacklists the nouveau module and disables its
# modesetting, then regenerate the initramfs with the tool used by the
# detected distribution family (update-initramfs on Debian, dracut on RedHat
# and SUSE). Nothing is regenerated on other families.
sub _blacklist_nouveau {
    my $conf_body = join "\n",
        "blacklist nouveau",
        "options nouveau modeset=0",
        "";
    file("/etc/modprobe.d/blacklist-nouveau.conf", content => $conf_body);

    # Pick the initramfs rebuild command; RedHat and SUSE share dracut.
    my $rebuild_cmd;
    if (is_debian()) {
        $rebuild_cmd = "update-initramfs -u 2>/dev/null";
    }
    elsif (is_redhat() || is_suse()) {
        $rebuild_cmd = "dracut --force 2>/dev/null";
    }

    run($rebuild_cmd, auto_die => 0) if defined $rebuild_cmd;
}
# ============================================================
# Container toolkit installation
# ============================================================
# Install nvidia-container-toolkit on Debian-family hosts: fetch and dearmor
# the repository signing key, write the apt source entry, refresh the package
# index and install the package. Dies if dpkg does not list the package as
# installed ("ii") afterwards.
sub _install_toolkit_debian {
    my $keyring   = "/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg";
    my $repo_base = "https://nvidia.github.io/libnvidia-container";

    # DPkg::Lock::Timeout=120: wait for apt-daily.timer lock that fires after reboot.
    my $apt_get = "apt-get -o DPkg::Lock::Timeout=120";

    pkg([qw(curl gnupg)], ensure => "present");

    run("curl -fsSL $repo_base/gpgkey | gpg --dearmor -o $keyring 2>/dev/null",
        auto_die => 0);

    # '$(ARCH)' must reach the list file literally, hence the single quotes.
    my $source_line = 'deb [signed-by=' . $keyring . '] '
        . $repo_base . '/stable/deb/$(ARCH) /' . "\n";
    file("/etc/apt/sources.list.d/nvidia-container-toolkit.list",
        content => $source_line);

    run("$apt_get update -q", auto_die => 0);
    run("DEBIAN_FRONTEND=noninteractive $apt_get install -y nvidia-container-toolkit",
        auto_die => 0);

    # The install's exit status is ignored; the dpkg query decides success.
    my $dpkg_out = run("dpkg -l nvidia-container-toolkit 2>/dev/null | grep -q '^ii'",
        auto_die => 0);
    unless ($? == 0) {
        die "nvidia-container-toolkit not installed\n";
    }
}
# Install nvidia-container-toolkit on RedHat-family hosts: download the
# upstream .repo file into /etc/yum.repos.d, expire the dnf cache and install
# the package. Dies if rpm -q does not find the package afterwards.
sub _install_toolkit_redhat {
    my $repo_src  = "https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo";
    my $repo_dest = "/etc/yum.repos.d/nvidia-container-toolkit.repo";

    my @steps = (
        "curl -s -L $repo_src | tee $repo_dest",
        "dnf clean expire-cache",
        "dnf install -y nvidia-container-toolkit",
    );
    for my $step (@steps) {
        run($step, auto_die => 0);
    }

    # Exit statuses above are ignored; the rpm query decides success.
    my $rpm_out = run("rpm -q nvidia-container-toolkit 2>&1", auto_die => 0);
    die "nvidia-container-toolkit not installed\n" unless $? == 0;
}
# Install nvidia-container-toolkit on SUSE hosts via zypper.
#
# The upstream .repo file URL is in yum/dnf format - zypper needs the baseurl
# directly, so the repository is added by URL for the machine architecture
# reported by `uname -m` (falling back to x86_64 when that yields nothing).
#
# Dies if rpm -q does not find the package after zypper has run, matching the
# Debian and RedHat installers.
sub _install_toolkit_suse {
    my $arch = run("uname -m", auto_die => 0);
    $arch = '' unless defined $arch;    # avoid chomp on undef if run() returned nothing
    chomp $arch;
    $arch ||= 'x86_64';

    # Remove any stale entry (possibly added with the wrong URL) before re-adding.
    run("zypper rr nvidia-container-toolkit 2>/dev/null || true", auto_die => 0);
    run("rpm --import https://nvidia.github.io/libnvidia-container/gpgkey 2>/dev/null",
        auto_die => 0);
    run("zypper addrepo --refresh https://nvidia.github.io/libnvidia-container/stable/rpm/$arch nvidia-container-toolkit 2>/dev/null",
        auto_die => 0);
    run("zypper --gpg-auto-import-keys refresh nvidia-container-toolkit 2>/dev/null",
        auto_die => 0);
    run("zypper install -y nvidia-container-toolkit", auto_die => 0);

    # zypper's exit status is ignored above; confirm the result via rpm so a
    # failed install is reported here instead of surfacing later.
    run("rpm -q nvidia-container-toolkit 2>&1", auto_die => 0);
    die "nvidia-container-toolkit not installed\n" if $? != 0;
}
# ============================================================
# Containerd configuration
# ============================================================
# Prepare the RKE2 agent's containerd template so it imports every TOML
# drop-in under /etc/containerd/conf.d, then write the NVIDIA drop-in there.
sub _configure_containerd_rke2 {
    my $rke2_containerd_dir = "/var/lib/rancher/rke2/agent/etc/containerd";
    my $template_body = qq{imports = ["/etc/containerd/conf.d/*.toml"]\nversion = 2\n};

    file($rke2_containerd_dir, ensure => 'directory');
    file("$rke2_containerd_dir/config.toml.tmpl", content => $template_body);

    _write_nvidia_containerd_config();
}
# Let nvidia-ctk configure the containerd runtime, then restart containerd.
# Failures of either command are tolerated (auto_die is off).
sub _configure_containerd_standalone {
    my %tolerant = (auto_die => 0);

    run("nvidia-ctk runtime configure --runtime=containerd 2>&1", %tolerant);
    return run("systemctl restart containerd 2>/dev/null", %tolerant);
}
# Ensure /etc/containerd/conf.d exists and write the 99-nvidia.toml drop-in,
# which declares an "nvidia" runtime for the CRI plugin: runc v2 runtime type
# with BinaryName pointing at /usr/bin/nvidia-container-runtime.
sub _write_nvidia_containerd_config {
    my $dropin_dir = "/etc/containerd/conf.d";

    # Non-interpolating heredoc: the TOML is written out verbatim.
    my $nvidia_toml = <<'TOML';
version = 2
[plugins]
[plugins."io.containerd.grpc.v1.cri"]
[plugins."io.containerd.grpc.v1.cri".containerd]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
BinaryName = "/usr/bin/nvidia-container-runtime"
TOML

    file($dropin_dir, ensure => 'directory');
    file("$dropin_dir/99-nvidia.toml", content => $nvidia_toml);
}
# ============================================================
# CDI spec generation
# ============================================================
# Generate the NVIDIA CDI specification at /etc/cdi/nvidia.yaml using
# `nvidia-ctk cdi generate`.
#
# The operation stays best-effort: a failing nvidia-ctk does not die, but the
# outcome is logged truthfully instead of always reporting success.
sub generate_cdi_specs {
    Rex::Logger::info("Generating NVIDIA CDI specs...");

    run("mkdir -p /etc/cdi", auto_die => 0);
    run("nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml 2>/dev/null", auto_die => 0);

    # Snapshot the exit status before any further call can overwrite $?.
    my $exit_status = $?;

    if ($exit_status == 0) {
        Rex::Logger::info(" [ok] CDI specs written to /etc/cdi/nvidia.yaml");
    }
    else {
        Rex::Logger::info(
            " [warn] nvidia-ctk cdi generate failed (exit status $exit_status);"
            . " /etc/cdi/nvidia.yaml was not updated");
    }
}
# ============================================================
# Reboot
# ============================================================
sub _reboot_and_wait {
Rex::Logger::info("Rebooting host to activate NVIDIA driver (replacing nouveau)...");
( run in 0.512 second using v1.01-cache-2.11-cpan-df04353d9ac )