Moves setup files into embedded package.

This commit is contained in:
2025-10-11 22:06:39 +00:00
parent 92032202f4
commit 89c6a7aa80
112 changed files with 337 additions and 0 deletions

@@ -0,0 +1,98 @@
# NVIDIA Device Plugin
The NVIDIA Device Plugin for Kubernetes enables GPU scheduling and resource management on nodes with NVIDIA GPUs.
## Overview
This service deploys the official NVIDIA Device Plugin as a DaemonSet that:
- Discovers NVIDIA GPUs on worker nodes
- Labels nodes with GPU product information (e.g., `nvidia.com/gpu.product=GeForce-RTX-4090`)
- Advertises GPU resources (`nvidia.com/gpu`) to the Kubernetes scheduler
- Enables pods to request GPU resources
## Prerequisites
Before installing the NVIDIA Device Plugin, ensure that:
1. **NVIDIA Drivers** are installed (>= 384.81)
2. **nvidia-container-toolkit** is installed (>= 1.7.0)
3. **nvidia-container-runtime** is configured as the default container runtime
4. Worker nodes have NVIDIA GPUs
### Talos Linux Requirements
For Talos Linux nodes, you need the following (an example schematic follows the list):
- NVIDIA drivers extension in the Talos schematic
- nvidia-container-toolkit extension
- Proper container runtime configuration
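A schematic covering these requirements looks roughly like the sketch below. The extension names are assumptions to verify against the Talos image factory for your Talos release (production and LTS driver branches differ):
```yaml
# Example Talos schematic (e.g. for https://factory.talos.dev)
# Extension names are assumptions; confirm them for your Talos version.
customization:
  systemExtensions:
    officialExtensions:
      - siderolabs/nonfree-kmod-nvidia-production
      - siderolabs/nvidia-container-toolkit-production
```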
## Installation
```bash
# Configure and install the service
wild-cluster-services-configure nvidia-device-plugin
wild-cluster-install nvidia-device-plugin
```
## Verification
After installation, verify the plugin is working:
```bash
# Check plugin pods are running
kubectl get pods -n kube-system | grep nvidia
# Verify GPU resources are advertised
kubectl get nodes -o json | jq '.items[].status.capacity | select(has("nvidia.com/gpu"))'
# Check GPU node labels
kubectl get nodes --show-labels | grep nvidia
```
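On a healthy single-GPU node, the capacity query returns an object like this (values are illustrative):
```json
{
  "cpu": "16",
  "memory": "65773896Ki",
  "nvidia.com/gpu": "1",
  "pods": "110"
}
```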
## Usage in Applications
Once installed, applications can request GPU resources:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-app
spec:
  selector:
    matchLabels:
      app: gpu-app
  template:
    metadata:
      labels:
        app: gpu-app
    spec:
      containers:
        - name: app
          image: nvidia/cuda:12.4.1-base-ubuntu22.04  # pin a real tag; nvidia/cuda no longer publishes "latest"
          resources:
            requests:
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1
```
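As an end-to-end check, a throwaway pod running `nvidia-smi` confirms that scheduling, the container runtime, and the driver all line up. This is a sketch: the CUDA image tag is an assumption, and `runtimeClassName: nvidia` matches the RuntimeClass shipped with this service:
```bash
cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke
spec:
  restartPolicy: Never
  runtimeClassName: nvidia
  containers:
    - name: smoke
      image: nvidia/cuda:12.4.1-base-ubuntu22.04  # assumed tag
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1
EOF
# Wait for completion, then read the nvidia-smi output
kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/gpu-smoke --timeout=120s
kubectl logs gpu-smoke
kubectl delete pod gpu-smoke
```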
## Troubleshooting
### Plugin Not Starting
- Verify NVIDIA drivers are installed on worker nodes
- Check that nvidia-container-toolkit is properly configured
- Ensure worker nodes are not tainted in a way that prevents scheduling
### No GPU Resources Advertised
- Check plugin logs: `kubectl logs -n kube-system -l name=nvidia-device-plugin-ds`
- Verify NVIDIA runtime is the default container runtime
- Ensure GPUs are detected by the driver: check node logs for GPU detection messages (the combined checks below gather these signals)
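The commands below collect those signals in one pass; `<gpu-node>` is a placeholder for one of your GPU node names:
```bash
# Plugin logs usually state why no devices were found
kubectl logs -n kube-system -l name=nvidia-device-plugin-ds --tail=50
# Did node-feature-discovery detect an NVIDIA PCI device on the node?
kubectl get node <gpu-node> -o jsonpath='{.metadata.labels.feature\.node\.kubernetes\.io/pci-0300_10de\.present}'
# Does the node advertise GPU capacity at all?
kubectl get node <gpu-node> -o jsonpath='{.status.capacity.nvidia\.com/gpu}'
```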
## Configuration
The plugin uses the following configuration:
- **Image**: `nvcr.io/nvidia/k8s-device-plugin:v0.17.1`
- **Namespace**: `kube-system`
- **Priority Class**: `system-node-critical`
- **Tolerations**: Schedules on nodes with a `nvidia.com/gpu` taint (an example taint command follows)
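Because of that toleration, GPU nodes can be tainted so that only workloads which explicitly tolerate GPUs are scheduled there; the node name and taint value here are illustrative:
```bash
kubectl taint nodes <gpu-node> nvidia.com/gpu=present:NoSchedule
```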
## References
- [Official NVIDIA Device Plugin Repository](https://github.com/NVIDIA/k8s-device-plugin)
- [Kubernetes GPU Scheduling Documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/)
- [NVIDIA Container Toolkit Documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/)

@@ -0,0 +1,66 @@
#!/bin/bash
set -e
set -o pipefail

# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
    echo "❌ ERROR: WILD_INSTANCE is not set"
    exit 1
fi

# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
    echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
    exit 1
fi

# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
    echo "❌ ERROR: KUBECONFIG is not set"
    exit 1
fi

INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
NVIDIA_PLUGIN_DIR="${CLUSTER_SETUP_DIR}/nvidia-device-plugin"

echo "🎮 === Setting up NVIDIA Device Plugin ==="
echo ""

# Check if any worker nodes exist (the device plugin only runs on worker nodes)
echo "🔍 Checking for worker nodes in the cluster..."
WORKER_NODES=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o name | wc -l)
if [ "$WORKER_NODES" -eq 0 ]; then
    echo "❌ ERROR: No worker nodes found in cluster. NVIDIA Device Plugin requires worker nodes."
    exit 1
fi
echo "✅ Found $WORKER_NODES worker node(s)"
echo ""

# Templates should already be compiled
echo "📦 Using pre-compiled NVIDIA Device Plugin templates..."
if [ ! -d "${NVIDIA_PLUGIN_DIR}/kustomize" ]; then
    echo "❌ ERROR: Compiled templates not found at ${NVIDIA_PLUGIN_DIR}/kustomize"
    echo "Templates should be compiled before deployment."
    exit 1
fi

echo "🚀 Deploying NVIDIA Device Plugin..."
kubectl apply -k "${NVIDIA_PLUGIN_DIR}/kustomize"

echo "⏳ Waiting for NVIDIA Device Plugin DaemonSet to be ready..."
kubectl rollout status daemonset/nvidia-device-plugin-daemonset -n kube-system --timeout=120s

echo ""
echo "✅ NVIDIA Device Plugin installed successfully"
echo ""
echo "💡 To verify the installation:"
echo "   kubectl get pods -n kube-system | grep nvidia"
echo "   kubectl get nodes -o json | jq '.items[].status.capacity | select(has(\"nvidia.com/gpu\"))'"
echo ""
echo "🎮 GPU nodes should now be labeled with GPU product information:"
echo "   kubectl get nodes --show-labels | grep nvidia"
echo ""

@@ -0,0 +1,91 @@
# NVIDIA Device Plugin DaemonSet
# Based on official manifest from: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.1/deployments/static/nvidia-device-plugin.yml
# Licensed under the Apache License, Version 2.0
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
  labels:
    app.kubernetes.io/name: nvidia-device-plugin
    app.kubernetes.io/component: device-plugin
    managedBy: kustomize
    partOf: wild-cloud
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
        app.kubernetes.io/name: nvidia-device-plugin
        app.kubernetes.io/component: device-plugin
    spec:
      runtimeClassName: nvidia
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
        - key: CriticalAddonsOnly
          operator: Exists
      # Only schedule onto nodes where node-feature-discovery has detected an
      # NVIDIA display controller (PCI class 0300, vendor 10de).
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: feature.node.kubernetes.io/pci-0300_10de.present
                    operator: In
                    values:
                      - "true"
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      securityContext:
        seccompProfile:
          type: RuntimeDefault
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1
          name: nvidia-device-plugin-ctr
          env:
            - name: MPS_ROOT
              value: /run/nvidia/mps
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            - name: FAIL_ON_INIT_ERROR
              value: "false"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: mps-shm
              mountPath: /dev/shm
            - name: mps-root
              mountPath: /mps
            - name: cdi-root
              mountPath: /var/run/cdi
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: mps-root
          hostPath:
            path: /run/nvidia/mps
            type: DirectoryOrCreate
        - name: mps-shm
          hostPath:
            path: /run/nvidia/mps/shm
        - name: cdi-root
          hostPath:
            path: /var/run/cdi
            type: DirectoryOrCreate

@@ -0,0 +1,12 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kube-system
resources:
  - daemonset.yaml
  - runtimeclass.yaml
labels:
  - pairs:
      app.kubernetes.io/name: nvidia-device-plugin
      app.kubernetes.io/component: device-plugin
      managedBy: kustomize
      partOf: wild-cloud

@@ -0,0 +1,5 @@
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
# "nvidia" must match the runtime handler name registered with containerd
# by the NVIDIA container toolkit.
handler: nvidia

@@ -0,0 +1,7 @@
name: nvidia-device-plugin
description: NVIDIA device plugin for Kubernetes
namespace: nvidia-device-plugin
category: infrastructure
dependencies:
- node-feature-discovery