Moves setup files into embedded package.
This commit is contained in:
@@ -0,0 +1,98 @@
# NVIDIA Device Plugin

The NVIDIA Device Plugin for Kubernetes enables GPU scheduling and resource management on nodes with NVIDIA GPUs.

## Overview

This service deploys the official NVIDIA Device Plugin as a DaemonSet that:

- Discovers NVIDIA GPUs on worker nodes
- Labels nodes with GPU product information (e.g., `nvidia.com/gpu.product=GeForce-RTX-4090`)
- Advertises GPU resources (`nvidia.com/gpu`) to the Kubernetes scheduler
- Enables pods to request GPU resources

## Prerequisites

Before installing the NVIDIA Device Plugin, ensure that:

1. **NVIDIA Drivers** are installed (>= 384.81)
2. **nvidia-container-toolkit** is installed (>= 1.7.0)
3. **nvidia-container-runtime** is configured as the default container runtime
4. Worker nodes have NVIDIA GPUs

### Talos Linux Requirements

For Talos Linux nodes, you need:

- NVIDIA drivers extension in the Talos schematic
- nvidia-container-toolkit extension
- Proper container runtime configuration

## Installation

```bash
# Configure and install the service
wild-cluster-services-configure nvidia-device-plugin
wild-cluster-install nvidia-device-plugin
```

## Verification

After installation, verify the plugin is working:

```bash
# Check plugin pods are running
kubectl get pods -n kube-system | grep nvidia

# Verify GPU resources are advertised
kubectl get nodes -o json | jq '.items[].status.capacity | select(has("nvidia.com/gpu"))'

# Check GPU node labels
kubectl get nodes --show-labels | grep nvidia
```

## Usage in Applications

Once installed, applications can request GPU resources:

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-app
spec:
  template:
    spec:
      containers:
        - name: app
          image: nvidia/cuda:latest
          resources:
            requests:
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1
```

## Troubleshooting

### Plugin Not Starting

- Verify NVIDIA drivers are installed on worker nodes
- Check that nvidia-container-toolkit is properly configured
- Ensure worker nodes are not tainted in a way that prevents scheduling

### No GPU Resources Advertised

- Check plugin logs: `kubectl logs -n kube-system -l name=nvidia-device-plugin-ds`
- Verify NVIDIA runtime is the default container runtime
- Ensure GPUs are detected by the driver: check node logs for GPU detection messages

## Configuration

The plugin uses the following configuration:

- **Image**: `nvcr.io/nvidia/k8s-device-plugin:v0.17.1`
- **Namespace**: `kube-system`
- **Priority Class**: `system-node-critical`
- **Tolerations**: Schedules on nodes with `nvidia.com/gpu` taint

## References

- [Official NVIDIA Device Plugin Repository](https://github.com/NVIDIA/k8s-device-plugin)
- [Kubernetes GPU Scheduling Documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/)
- [NVIDIA Container Toolkit Documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/)
internal/setup/cluster-services/nvidia-device-plugin/install.sh (new executable file, 66 lines)
#!/bin/bash
# Installs the NVIDIA Device Plugin cluster service.
#
# Required environment variables:
#   WILD_INSTANCE     - name of the wild-cloud instance being configured
#   WILD_CENTRAL_DATA - root data directory holding per-instance state
#   KUBECONFIG        - kubeconfig for the target cluster
#
# Expects pre-compiled kustomize templates at:
#   ${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}/setup/cluster-services/nvidia-device-plugin/kustomize
set -euo pipefail

# Print an error to stderr and abort.
die() {
  printf '❌ ERROR: %s\n' "$*" >&2
  exit 1
}

# Fail fast with a clear message if any required variable is missing.
# ":-" defaults keep 'set -u' from aborting before we can report which
# variable is absent.
[ -n "${WILD_INSTANCE:-}" ] || die "WILD_INSTANCE is not set"
[ -n "${WILD_CENTRAL_DATA:-}" ] || die "WILD_CENTRAL_DATA is not set"
[ -n "${KUBECONFIG:-}" ] || die "KUBECONFIG is not set"

INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
NVIDIA_PLUGIN_DIR="${CLUSTER_SETUP_DIR}/nvidia-device-plugin"

echo "🎮 === Setting up NVIDIA Device Plugin ==="
echo ""

# The device plugin DaemonSet only runs on worker nodes; bail out early
# if the cluster has none.
echo "🔍 Checking for worker nodes in the cluster..."
WORKER_NODES=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o name | wc -l)
if [ "$WORKER_NODES" -eq 0 ]; then
  die "No worker nodes found in cluster. NVIDIA Device Plugin requires worker nodes."
fi

echo "✅ Found $WORKER_NODES worker node(s)"
echo ""

# Templates are compiled ahead of time (wild-cluster-services-configure);
# this script only applies them.
echo "📦 Using pre-compiled NVIDIA Device Plugin templates..."
if [ ! -d "${NVIDIA_PLUGIN_DIR}/kustomize" ]; then
  echo "❌ ERROR: Compiled templates not found at ${NVIDIA_PLUGIN_DIR}/kustomize" >&2
  echo "Templates should be compiled before deployment." >&2
  exit 1
fi

echo "🚀 Deploying NVIDIA Device Plugin..."
# Quote the path: unquoted it would word-split if the data dir has spaces.
kubectl apply -k "${NVIDIA_PLUGIN_DIR}/kustomize"

echo "⏳ Waiting for NVIDIA Device Plugin DaemonSet to be ready..."
kubectl rollout status daemonset/nvidia-device-plugin-daemonset -n kube-system --timeout=120s

echo ""
echo "✅ NVIDIA Device Plugin installed successfully"
echo ""
echo "💡 To verify the installation:"
echo "   kubectl get pods -n kube-system | grep nvidia"
echo "   kubectl get nodes -o json | jq '.items[].status.capacity | select(has(\"nvidia.com/gpu\"))'"
echo ""
echo "🎮 GPU nodes should now be labeled with GPU product information:"
echo "   kubectl get nodes --show-labels | grep nvidia"
echo ""
# NVIDIA Device Plugin DaemonSet
# Based on official manifest from: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.1/deployments/static/nvidia-device-plugin.yml
# Licensed under the Apache License, Version 2.0

apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
  labels:
    app.kubernetes.io/name: nvidia-device-plugin
    app.kubernetes.io/component: device-plugin
    managedBy: kustomize
    partOf: wild-cloud
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
        app.kubernetes.io/name: nvidia-device-plugin
        app.kubernetes.io/component: device-plugin
    spec:
      # Run under the "nvidia" RuntimeClass (declared in runtimeclass.yaml)
      # so the container uses the NVIDIA container runtime handler.
      runtimeClassName: nvidia
      tolerations:
        # Allow scheduling onto nodes tainted for GPU-only workloads.
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
        - key: CriticalAddonsOnly
          operator: Exists
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # Only schedule on nodes where node-feature-discovery reported
              # an NVIDIA PCI display controller (class 0300, vendor 10de).
              # Requires the node-feature-discovery service (see manifest
              # dependencies).
              - matchExpressions:
                  - key: feature.node.kubernetes.io/pci-0300_10de.present
                    operator: In
                    values:
                      - "true"
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      securityContext:
        seccompProfile:
          type: RuntimeDefault
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1
          name: nvidia-device-plugin-ctr
          env:
            # Root directory for CUDA MPS (Multi-Process Service) pipes/logs.
            - name: MPS_ROOT
              value: /run/nvidia/mps
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            # Keep the plugin running even if GPU initialization fails on a
            # node (e.g. a non-GPU node the DaemonSet landed on).
            - name: FAIL_ON_INIT_ERROR
              value: "false"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            # Kubelet device-plugin registration socket directory.
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: mps-shm
              mountPath: /dev/shm
            - name: mps-root
              mountPath: /mps
            # CDI (Container Device Interface) spec directory.
            - name: cdi-root
              mountPath: /var/run/cdi
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: mps-root
          hostPath:
            path: /run/nvidia/mps
            type: DirectoryOrCreate
        - name: mps-shm
          hostPath:
            path: /run/nvidia/mps/shm
        - name: cdi-root
          hostPath:
            path: /var/run/cdi
            type: DirectoryOrCreate
# Kustomization for the NVIDIA Device Plugin cluster service.
# Applied by install.sh via `kubectl apply -k`.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kube-system
resources:
  - daemonset.yaml
  - runtimeclass.yaml
# Common labels stamped onto every rendered resource.
labels:
  - pairs:
      app.kubernetes.io/name: nvidia-device-plugin
      app.kubernetes.io/component: device-plugin
      managedBy: kustomize
      partOf: wild-cloud
# RuntimeClass selecting the NVIDIA container runtime handler.
# Referenced by the DaemonSet via `runtimeClassName: nvidia`; the "nvidia"
# handler must be configured in the node's container runtime (containerd/CRI-O).
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
# Service manifest for the wild-cloud cluster-services installer.
name: nvidia-device-plugin
description: NVIDIA device plugin for Kubernetes
# NOTE(review): the kustomization deploys all resources into kube-system,
# not "nvidia-device-plugin" — confirm what this field controls and whether
# it should read kube-system instead.
namespace: nvidia-device-plugin
category: infrastructure

dependencies:
  # Provides the feature.node.kubernetes.io/pci-0300_10de.present label that
  # the DaemonSet's nodeAffinity requires.
  - node-feature-discovery
Reference in New Issue
Block a user