Moves setup files into embedded package.

This commit is contained in:
2025-10-11 22:06:39 +00:00
parent 92032202f4
commit 89c6a7aa80
112 changed files with 337 additions and 0 deletions

@@ -0,0 +1,98 @@
# NVIDIA Device Plugin
The NVIDIA Device Plugin for Kubernetes enables GPU scheduling and resource management on nodes with NVIDIA GPUs.
## Overview
This service deploys the official NVIDIA Device Plugin as a DaemonSet that:
- Discovers NVIDIA GPUs on worker nodes
- Labels nodes with GPU product information (e.g., `nvidia.com/gpu.product=GeForce-RTX-4090`)
- Advertises GPU resources (`nvidia.com/gpu`) to the Kubernetes scheduler
- Enables pods to request GPU resources
## Prerequisites
Before installing the NVIDIA Device Plugin, ensure that:
1. **NVIDIA Drivers** are installed (>= 384.81)
2. **nvidia-container-toolkit** is installed (>= 1.7.0)
3. **nvidia-container-runtime** is configured as the default container runtime
4. Worker nodes have NVIDIA GPUs
### Talos Linux Requirements
For Talos Linux nodes, you need the following (an example schematic follows the list):
- NVIDIA drivers extension in the Talos schematic
- nvidia-container-toolkit extension
- Proper container runtime configuration
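A schematic covering these requirements looks roughly like the sketch below. The extension names are assumptions to verify against the Talos image factory for your Talos release (production and LTS driver branches differ):
```yaml
# Example Talos schematic (e.g. for https://factory.talos.dev)
# Extension names are assumptions; confirm them for your Talos version.
customization:
  systemExtensions:
    officialExtensions:
      - siderolabs/nonfree-kmod-nvidia-production
      - siderolabs/nvidia-container-toolkit-production
```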
## Installation
```bash
# Configure and install the service
wild-cluster-services-configure nvidia-device-plugin
wild-cluster-install nvidia-device-plugin
```
## Verification
After installation, verify the plugin is working:
```bash
# Check plugin pods are running
kubectl get pods -n kube-system | grep nvidia
# Verify GPU resources are advertised
kubectl get nodes -o json | jq '.items[].status.capacity | select(has("nvidia.com/gpu"))'
# Check GPU node labels
kubectl get nodes --show-labels | grep nvidia
```
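On a healthy single-GPU node, the capacity query returns an object like this (values are illustrative):
```json
{
  "cpu": "16",
  "memory": "65773896Ki",
  "nvidia.com/gpu": "1",
  "pods": "110"
}
```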
## Usage in Applications
Once installed, applications can request GPU resources:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-app
spec:
  selector:
    matchLabels:
      app: gpu-app
  template:
    metadata:
      labels:
        app: gpu-app
    spec:
      containers:
        - name: app
          image: nvidia/cuda:12.4.1-base-ubuntu22.04  # pin a real tag; nvidia/cuda no longer publishes "latest"
          resources:
            requests:
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1
```
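As an end-to-end check, a throwaway pod running `nvidia-smi` confirms that scheduling, the container runtime, and the driver all line up. This is a sketch: the CUDA image tag is an assumption, and `runtimeClassName: nvidia` matches the RuntimeClass shipped with this service:
```bash
cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke
spec:
  restartPolicy: Never
  runtimeClassName: nvidia
  containers:
    - name: smoke
      image: nvidia/cuda:12.4.1-base-ubuntu22.04  # assumed tag
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1
EOF
# Wait for completion, then read the nvidia-smi output
kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/gpu-smoke --timeout=120s
kubectl logs gpu-smoke
kubectl delete pod gpu-smoke
```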
## Troubleshooting
### Plugin Not Starting
- Verify NVIDIA drivers are installed on worker nodes
- Check that nvidia-container-toolkit is properly configured
- Ensure worker nodes are not tainted in a way that prevents scheduling
### No GPU Resources Advertised
- Check plugin logs: `kubectl logs -n kube-system -l name=nvidia-device-plugin-ds`
- Verify NVIDIA runtime is the default container runtime
- Ensure GPUs are detected by the driver: check node logs for GPU detection messages (the combined checks below gather these signals)
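The commands below collect those signals in one pass; `<gpu-node>` is a placeholder for one of your GPU node names:
```bash
# Plugin logs usually state why no devices were found
kubectl logs -n kube-system -l name=nvidia-device-plugin-ds --tail=50
# Did node-feature-discovery detect an NVIDIA PCI device on the node?
kubectl get node <gpu-node> -o jsonpath='{.metadata.labels.feature\.node\.kubernetes\.io/pci-0300_10de\.present}'
# Does the node advertise GPU capacity at all?
kubectl get node <gpu-node> -o jsonpath='{.status.capacity.nvidia\.com/gpu}'
```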
## Configuration
The plugin uses the following configuration:
- **Image**: `nvcr.io/nvidia/k8s-device-plugin:v0.17.1`
- **Namespace**: `kube-system`
- **Priority Class**: `system-node-critical`
- **Tolerations**: Schedules on nodes with a `nvidia.com/gpu` taint (an example taint command follows)
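Because of that toleration, GPU nodes can be tainted so that only workloads which explicitly tolerate GPUs are scheduled there; the node name and taint value here are illustrative:
```bash
kubectl taint nodes <gpu-node> nvidia.com/gpu=present:NoSchedule
```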
## References
- [Official NVIDIA Device Plugin Repository](https://github.com/NVIDIA/k8s-device-plugin)
- [Kubernetes GPU Scheduling Documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/)
- [NVIDIA Container Toolkit Documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/)

@@ -0,0 +1,66 @@
#!/bin/bash
set -e
set -o pipefail

# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
    echo "❌ ERROR: WILD_INSTANCE is not set"
    exit 1
fi

# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
    echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
    exit 1
fi

# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
    echo "❌ ERROR: KUBECONFIG is not set"
    exit 1
fi

INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
NVIDIA_PLUGIN_DIR="${CLUSTER_SETUP_DIR}/nvidia-device-plugin"

echo "🎮 === Setting up NVIDIA Device Plugin ==="
echo ""

# Check if any worker nodes exist (the device plugin only runs on worker nodes)
echo "🔍 Checking for worker nodes in the cluster..."
WORKER_NODES=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o name | wc -l)
if [ "$WORKER_NODES" -eq 0 ]; then
    echo "❌ ERROR: No worker nodes found in cluster. NVIDIA Device Plugin requires worker nodes."
    exit 1
fi
echo "✅ Found $WORKER_NODES worker node(s)"
echo ""

# Templates should already be compiled
echo "📦 Using pre-compiled NVIDIA Device Plugin templates..."
if [ ! -d "${NVIDIA_PLUGIN_DIR}/kustomize" ]; then
    echo "❌ ERROR: Compiled templates not found at ${NVIDIA_PLUGIN_DIR}/kustomize"
    echo "Templates should be compiled before deployment."
    exit 1
fi

echo "🚀 Deploying NVIDIA Device Plugin..."
kubectl apply -k "${NVIDIA_PLUGIN_DIR}/kustomize"

echo "⏳ Waiting for NVIDIA Device Plugin DaemonSet to be ready..."
kubectl rollout status daemonset/nvidia-device-plugin-daemonset -n kube-system --timeout=120s

echo ""
echo "✅ NVIDIA Device Plugin installed successfully"
echo ""
echo "💡 To verify the installation:"
echo "   kubectl get pods -n kube-system | grep nvidia"
echo "   kubectl get nodes -o json | jq '.items[].status.capacity | select(has(\"nvidia.com/gpu\"))'"
echo ""
echo "🎮 GPU nodes should now be labeled with GPU product information:"
echo "   kubectl get nodes --show-labels | grep nvidia"
echo ""

@@ -0,0 +1,91 @@
# NVIDIA Device Plugin DaemonSet
# Based on official manifest from: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.1/deployments/static/nvidia-device-plugin.yml
# Licensed under the Apache License, Version 2.0
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
  labels:
    app.kubernetes.io/name: nvidia-device-plugin
    app.kubernetes.io/component: device-plugin
    managedBy: kustomize
    partOf: wild-cloud
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
        app.kubernetes.io/name: nvidia-device-plugin
        app.kubernetes.io/component: device-plugin
    spec:
      runtimeClassName: nvidia
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
        - key: CriticalAddonsOnly
          operator: Exists
      # Only schedule onto nodes where node-feature-discovery has detected an
      # NVIDIA display controller (PCI class 0300, vendor 10de).
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: feature.node.kubernetes.io/pci-0300_10de.present
                    operator: In
                    values:
                      - "true"
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      securityContext:
        seccompProfile:
          type: RuntimeDefault
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1
          name: nvidia-device-plugin-ctr
          env:
            - name: MPS_ROOT
              value: /run/nvidia/mps
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            - name: FAIL_ON_INIT_ERROR
              value: "false"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: mps-shm
              mountPath: /dev/shm
            - name: mps-root
              mountPath: /mps
            - name: cdi-root
              mountPath: /var/run/cdi
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: mps-root
          hostPath:
            path: /run/nvidia/mps
            type: DirectoryOrCreate
        - name: mps-shm
          hostPath:
            path: /run/nvidia/mps/shm
        - name: cdi-root
          hostPath:
            path: /var/run/cdi
            type: DirectoryOrCreate

@@ -0,0 +1,12 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kube-system
resources:
  - daemonset.yaml
  - runtimeclass.yaml
labels:
  - pairs:
      app.kubernetes.io/name: nvidia-device-plugin
      app.kubernetes.io/component: device-plugin
      managedBy: kustomize
      partOf: wild-cloud

@@ -0,0 +1,5 @@
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
# "nvidia" must match the runtime handler name registered with containerd
# by the NVIDIA container toolkit.
handler: nvidia

@@ -0,0 +1,7 @@
name: nvidia-device-plugin
description: NVIDIA device plugin for Kubernetes
namespace: nvidia-device-plugin
category: infrastructure
dependencies:
- node-feature-discovery