Skip to content

nvml init failed: ERROR_LIBRARY_NOT_FOUND error after upgrading from 0.15.1 to 0.16.x #856

@ghost

Description

1. Quick Debug Information

  • OS/Version: Ubuntu 22.04
  • Kernel Version: 5.15.0-112-generic
  • Container Runtime Type/Version: containerd
  • K8s Flavor/Version: k3s v1.29.5+k3s1

2. Issue or feature description

After upgrading nvidia-device-plugin from 0.15.1 to 0.16.1 -- nvdp-nvidia-device-plugin-<XYZ> pod keeps CrashLoopBackOff due to error starting plugins: error creating plugin manager: unable to create plugin manager: nvml init failed: ERROR_LIBRARY_NOT_FOUND (see more logs below)

  1. 0.15.1 was originally installed with these flags:
helm upgrade --install nvdp nvdp/nvidia-device-plugin \
  --namespace nvidia-device-plugin \
  --create-namespace \
  --version 0.15.1 \
  --set runtimeClassName="nvidia" \
  --set deviceListStrategy=volume-mounts
  2. Upgraded nvidia-container-toolkit & nvidia-container-toolkit-base from 0.15.0 to 0.16.1
apt -y install nvidia-container-toolkit nvidia-container-toolkit-base
  3. Upgraded nvidia-device-plugin to 0.16.1

NEW: deviceDiscoveryStrategy set to nvml
UPDATE/FIX: as it turned out later (see comments below), setting deviceDiscoveryStrategy is not necessary — only the SYS_ADMIN capability was needed.

helm upgrade --install nvdp nvdp/nvidia-device-plugin \
  --namespace nvidia-device-plugin \
  --create-namespace \
  --version 0.16.1 \
  --set runtimeClassName="nvidia" \
  --set deviceListStrategy=volume-mounts \
  --set deviceDiscoveryStrategy=nvml

Additional configs/details

  • GPUs are V100s and the NVIDIA driver is v555
root@ubuntu-63-222-125-248:~# nvidia-smi -L
GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-bcd43393-58ae-8908-d37b-ed8bdb5477d0)
GPU 1: Tesla V100-SXM2-16GB (UUID: GPU-b4b9f6f2-6b7f-6feb-8d7a-1c7ff581ebca)
GPU 2: Tesla V100-SXM2-16GB (UUID: GPU-1cf02bb5-8101-4795-3f0a-332fbdfc5b0c)
GPU 3: Tesla V100-SXM2-16GB (UUID: GPU-cafd23bc-3339-7a84-3431-809387a71d12)
GPU 4: Tesla V100-SXM2-16GB (UUID: GPU-5ed25a6d-bd41-3884-9465-e60db94cb0a4)
GPU 5: Tesla V100-SXM2-16GB (UUID: GPU-4fb368b6-9fab-2cd9-165e-32bb045c9f42)
GPU 6: Tesla V100-SXM2-16GB (UUID: GPU-610da736-1258-1353-26e3-42aa08d654e3)
GPU 7: Tesla V100-SXM2-16GB (UUID: GPU-09090338-f844-7bd0-ea7b-d1377db4015c)

root@ubuntu-63-222-125-248:~# nvidia-smi  | head -4
Thu Aug  1 15:51:11 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.52.04              Driver Version: 555.52.04      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
root@ubuntu-63-222-125-248:~# 
  • nvidia-container-runtime config
# cat /etc/nvidia-container-runtime/config.toml 
accept-nvidia-visible-devices-as-volume-mounts = true
accept-nvidia-visible-devices-envvar-when-unprivileged = false
disable-require = false
supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,video"
#swarm-resource = "DOCKER_RESOURCE_GPU"

[nvidia-container-cli]
#debug = "/var/log/nvidia-container-toolkit.log"
environment = []
#ldcache = "/etc/ld.so.cache"
ldconfig = "@/sbin/ldconfig.real"
load-kmods = true
#no-cgroups = false
#path = "/usr/bin/nvidia-container-cli"
#root = "/run/nvidia/driver"
#user = "root:video"

[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"
log-level = "info"
mode = "auto"
runtimes = ["docker-runc", "runc", "crun"]

[nvidia-container-runtime.modes]

[nvidia-container-runtime.modes.cdi]
annotation-prefixes = ["cdi.k8s.io/"]
default-kind = "nvidia.com/gpu"
spec-dirs = ["/etc/cdi", "/var/run/cdi"]

[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime-hook]
path = "nvidia-container-runtime-hook"
skip-mode-detection = false

[nvidia-ctk]
path = "nvidia-ctk"
  • NVIDIA packages installed on the host (no CUDA packages are installed)
# dpkg -l |grep nvidia
ii  libnvidia-cfg1-555:amd64                    555.52.04-0ubuntu0~gpu22.04.1           amd64        NVIDIA binary OpenGL/GLX configuration library
ii  libnvidia-common-555                        555.52.04-0ubuntu0~gpu22.04.1           all          Shared files used by the NVIDIA libraries
rc  libnvidia-compute-535:amd64                 535.171.04-0ubuntu0.22.04.1             amd64        NVIDIA libcompute package
rc  libnvidia-compute-535-server:amd64          535.161.08-0ubuntu2.22.04.1             amd64        NVIDIA libcompute package
ii  libnvidia-compute-555:amd64                 555.52.04-0ubuntu0~gpu22.04.1           amd64        NVIDIA libcompute package
ii  libnvidia-container-tools                   1.16.1-1                                amd64        NVIDIA container runtime library (command-line tools)
ii  libnvidia-container1:amd64                  1.16.1-1                                amd64        NVIDIA container runtime library
ii  libnvidia-decode-555:amd64                  555.52.04-0ubuntu0~gpu22.04.1           amd64        NVIDIA Video Decoding runtime libraries
ii  libnvidia-encode-555:amd64                  555.52.04-0ubuntu0~gpu22.04.1           amd64        NVENC Video Encoding runtime library
ii  libnvidia-extra-555:amd64                   555.52.04-0ubuntu0~gpu22.04.1           amd64        Extra libraries for the NVIDIA driver
ii  libnvidia-fbc1-555:amd64                    555.52.04-0ubuntu0~gpu22.04.1           amd64        NVIDIA OpenGL-based Framebuffer Capture runtime library
ii  libnvidia-gl-555:amd64                      555.52.04-0ubuntu0~gpu22.04.1           amd64        NVIDIA OpenGL/GLX/EGL/GLES GLVND libraries and Vulkan ICD
rc  linux-objects-nvidia-535-5.15.0-107-generic 5.15.0-107.117                          amd64        Linux kernel nvidia modules for version 5.15.0-107 (objects)
ii  linux-objects-nvidia-535-5.15.0-112-generic 5.15.0-112.122+1                        amd64        Linux kernel nvidia modules for version 5.15.0-112 (objects)
ii  linux-signatures-nvidia-5.15.0-112-generic  5.15.0-112.122+1                        amd64        Linux kernel signatures for nvidia modules for version 5.15.0-112-generic
ii  nvidia-compute-utils-555                    555.52.04-0ubuntu0~gpu22.04.1           amd64        NVIDIA compute utilities
ii  nvidia-container-runtime                    3.14.0-1                                all          NVIDIA Container Toolkit meta-package
ii  nvidia-container-toolkit                    1.16.1-1                                amd64        NVIDIA Container toolkit
ii  nvidia-container-toolkit-base               1.16.1-1                                amd64        NVIDIA Container Toolkit Base
ii  nvidia-dkms-555                             555.52.04-0ubuntu0~gpu22.04.1           amd64        NVIDIA DKMS package
ii  nvidia-driver-555                           555.52.04-0ubuntu0~gpu22.04.1           amd64        NVIDIA driver metapackage
ii  nvidia-firmware-555-555.52.04               555.52.04-0ubuntu0~gpu22.04.1           amd64        Firmware files used by the kernel module
ii  nvidia-kernel-common-555                    555.52.04-0ubuntu0~gpu22.04.1           amd64        Shared files used with the kernel module
ii  nvidia-kernel-source-555                    555.52.04-0ubuntu0~gpu22.04.1           amd64        NVIDIA kernel source package
ii  nvidia-prime                                0.8.17.1                                all          Tools to enable NVIDIA's Prime
ii  nvidia-settings                             510.47.03-0ubuntu1                      amd64        Tool for configuring the NVIDIA graphics driver
ii  nvidia-utils-555                            555.52.04-0ubuntu0~gpu22.04.1           amd64        NVIDIA driver support binaries
ii  screen-resolution-extra                     0.18.2                                  all          Extension for the nvidia-settings control panel
ii  xserver-xorg-video-nvidia-555               555.52.04-0ubuntu0~gpu22.04.1           amd64        NVIDIA binary Xorg driver
# dpkg -l |grep cuda
# 

Logs

0.15.1 - before upgrading to 0.16.1

$ kubectl -n nvidia-device-plugin logs ds/nvdp-nvidia-device-plugin
I0801 15:26:24.778996       1 main.go:178] Starting FS watcher.
I0801 15:26:24.779113       1 main.go:185] Starting OS watcher.
I0801 15:26:24.779348       1 main.go:200] Starting Plugins.
I0801 15:26:24.779362       1 main.go:257] Loading configuration.
I0801 15:26:24.780045       1 main.go:265] Updating config with default resource matching patterns.
I0801 15:26:24.780297       1 main.go:276] 
Running with config:
{
  "version": "v1",
  "flags": {
    "migStrategy": "none",
    "failOnInitError": true,
    "mpsRoot": "/run/nvidia/mps",
    "nvidiaDriverRoot": "/",
    "gdsEnabled": false,
    "mofedEnabled": false,
    "useNodeFeatureAPI": null,
    "plugin": {
      "passDeviceSpecs": false,
      "deviceListStrategy": [
        "volume-mounts"
      ],
      "deviceIDStrategy": "uuid",
      "cdiAnnotationPrefix": "cdi.k8s.io/",
      "nvidiaCTKPath": "/usr/bin/nvidia-ctk",
      "containerDriverRoot": "/driver-root"
    }
  },
  "resources": {
    "gpus": [
      {
        "pattern": "*",
        "name": "nvidia.com/gpu"
      }
    ]
  },
  "sharing": {
    "timeSlicing": {}
  }
}
I0801 15:26:24.780307       1 main.go:279] Retrieving plugins.
I0801 15:26:24.781053       1 factory.go:104] Detected NVML platform: found NVML library
I0801 15:26:24.781096       1 factory.go:104] Detected non-Tegra platform: /sys/devices/soc0/family file not found
I0801 15:26:24.870844       1 server.go:216] Starting GRPC server for 'nvidia.com/gpu'
I0801 15:26:24.872194       1 server.go:147] Starting to serve 'nvidia.com/gpu' on /var/lib/kubelet/device-plugins/nvidia-gpu.sock
I0801 15:26:24.874758       1 server.go:154] Registered device plugin for 'nvidia.com/gpu' with Kubelet

After upgrading to 0.16.1

$ kubectl -n nvidia-device-plugin logs ds/nvdp-nvidia-device-plugin
I0801 15:27:17.495707       1 main.go:199] Starting FS watcher.
I0801 15:27:17.495790       1 main.go:206] Starting OS watcher.
I0801 15:27:17.496048       1 main.go:221] Starting Plugins.
I0801 15:27:17.496068       1 main.go:278] Loading configuration.
I0801 15:27:17.496692       1 main.go:303] Updating config with default resource matching patterns.
I0801 15:27:17.496967       1 main.go:314] 
Running with config:
{
  "version": "v1",
  "flags": {
    "migStrategy": "none",
    "failOnInitError": true,
    "mpsRoot": "/run/nvidia/mps",
    "nvidiaDriverRoot": "/",
    "nvidiaDevRoot": "/",
    "gdsEnabled": false,
    "mofedEnabled": false,
    "useNodeFeatureAPI": null,
    "deviceDiscoveryStrategy": "nvml",
    "plugin": {
      "passDeviceSpecs": false,
      "deviceListStrategy": [
        "volume-mounts"
      ],
      "deviceIDStrategy": "uuid",
      "cdiAnnotationPrefix": "cdi.k8s.io/",
      "nvidiaCTKPath": "/usr/bin/nvidia-ctk",
      "containerDriverRoot": "/driver-root"
    }
  },
  "resources": {
    "gpus": [
      {
        "pattern": "*",
        "name": "nvidia.com/gpu"
      }
    ]
  },
  "sharing": {
    "timeSlicing": {}
  }
}
I0801 15:27:17.496986       1 main.go:317] Retrieving plugins.
E0801 15:27:17.497057       1 factory.go:68] Failed to initialize NVML: ERROR_LIBRARY_NOT_FOUND.
E0801 15:27:17.497079       1 factory.go:69] If this is a GPU node, did you set the docker default runtime to `nvidia`?
E0801 15:27:17.497085       1 factory.go:70] You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites
E0801 15:27:17.497090       1 factory.go:71] You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start
E0801 15:27:17.497095       1 factory.go:72] If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes
E0801 15:27:17.511338       1 main.go:149] error starting plugins: error creating plugin manager: unable to create plugin manager: nvml init failed: ERROR_LIBRARY_NOT_FOUND

Additional thoughts

Whenever I roll back to 0.15.1, it works just fine.

Metadata

Assignees

Labels

No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions