Files
wild-directory/nvidia-device-plugin/install.sh
Paul Payne 9687fad812 feat: Move cluster services to wild-directory as unified packages
Convert all 15 cluster services from embedded API format to
wild-directory packages using the unified manifest format:
- metallb, traefik, cert-manager, longhorn, snapshot-controller
- nfs, smtp, coredns, node-feature-discovery, nvidia-device-plugin
- externaldns, docker-registry, headlamp, crowdsec, utils

Changes:
- wild-manifest.yaml → manifest.yaml with is, defaultConfig, requires
- Eliminated configReferences and serviceConfig fields
- Flattened kustomize.template/ to package root
- Template vars use flat defaultConfig keys
- install.sh paths updated for apps/ layout
- Updated 9 app manifests: cloud.smtp.* → apps.smtp.* with requires
- Removed dead install: true field from 6 app manifests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-17 02:26:46 +00:00

66 lines
2.0 KiB
Bash
Executable File

#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_API_DATA_DIR is set
if [ -z "${WILD_API_DATA_DIR}" ]; then
echo "❌ ERROR: WILD_API_DATA_DIR is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_API_DATA_DIR}/instances/${WILD_INSTANCE}"
NVIDIA_PLUGIN_DIR="${INSTANCE_DIR}/apps/nvidia-device-plugin"
echo "🎮 === Setting up NVIDIA Device Plugin ==="
echo ""
# Check if we have NVIDIA GPUs in the cluster
echo "🔍 Checking for worker nodes in the cluster..."
# Check if any worker nodes exist (device plugin only runs on worker nodes)
WORKER_NODES=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o name | wc -l)
if [ "$WORKER_NODES" -eq 0 ]; then
echo "❌ ERROR: No worker nodes found in cluster. NVIDIA Device Plugin requires worker nodes."
exit 1
fi
echo "✅ Found $WORKER_NODES worker node(s)"
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled NVIDIA Device Plugin templates..."
if [ ! -f "${NVIDIA_PLUGIN_DIR}/kustomization.yaml" ]; then
echo "❌ ERROR: Compiled templates not found at ${NVIDIA_PLUGIN_DIR}/kustomization.yaml"
echo "Templates should be compiled before deployment."
exit 1
fi
echo "🚀 Deploying NVIDIA Device Plugin..."
kubectl apply -k ${NVIDIA_PLUGIN_DIR}/
echo "⏳ Waiting for NVIDIA Device Plugin DaemonSet to be ready..."
kubectl rollout status daemonset/nvidia-device-plugin-daemonset -n kube-system --timeout=120s
echo ""
echo "✅ NVIDIA Device Plugin installed successfully"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get pods -n kube-system | grep nvidia"
echo " kubectl get nodes -o json | jq '.items[].status.capacity | select(has(\"nvidia.com/gpu\"))'"
echo ""
echo "🎮 GPU nodes should now be labeled with GPU product information:"
echo " kubectl get nodes --show-labels | grep nvidia"
echo ""