Moves setup files into embedded package.

This commit is contained in:
2025-10-11 22:06:39 +00:00
parent 92032202f4
commit 89c6a7aa80
112 changed files with 337 additions and 0 deletions

internal/setup/README.md Normal file
View File

@@ -0,0 +1,15 @@
# Setup instructions
Install dependencies:
Follow the instructions to [set up a dnsmasq machine](./dnsmasq/README.md).
Follow the instructions to [set up cluster nodes](./cluster-nodes/README.md).
Follow the instructions to set up [cluster services](./cluster-services/README.md).
Now make sure everything works:
```bash
wild-health
```

View File

@@ -0,0 +1,80 @@
#!/bin/bash
# Talos cluster initialization script
# This script performs one-time cluster setup: generates secrets, base configs, and sets up talosctl
set -euo pipefail
# Check if WC_HOME is set
if [ -z "${WC_HOME:-}" ]; then
echo "Error: WC_HOME environment variable not set. Run \`source ./env.sh\`."
exit 1
fi
NODE_SETUP_DIR="${WC_HOME}/setup/cluster-nodes"
# Get cluster configuration from config.yaml
CLUSTER_NAME=$(wild-config cluster.name)
VIP=$(wild-config cluster.nodes.control.vip)
TALOS_VERSION=$(wild-config cluster.nodes.talos.version)
echo "Initializing Talos cluster: $CLUSTER_NAME"
echo "VIP: $VIP"
echo "Talos version: $TALOS_VERSION"
# Create directories
mkdir -p generated final patch
# Check if cluster secrets already exist
if [ -f "generated/secrets.yaml" ]; then
echo ""
echo "⚠️ Cluster secrets already exist!"
echo "This will regenerate ALL cluster certificates and invalidate existing nodes."
echo ""
read -p "Do you want to continue? (y/N): " -r
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
echo "Cancelled."
exit 0
fi
echo ""
fi
# Generate fresh cluster secrets
echo "Generating cluster secrets..."
cd generated
talosctl gen secrets -o secrets.yaml --force
echo "Generating base machine configs..."
talosctl gen config --with-secrets secrets.yaml "$CLUSTER_NAME" "https://$VIP:6443" --force
cd ..
# Setup talosctl context
echo "Setting up talosctl context..."
# Remove existing context if it exists
talosctl config context "$CLUSTER_NAME" --remove 2>/dev/null || true
# Merge new configuration
talosctl config merge ./generated/talosconfig
talosctl config endpoint "$VIP"
echo ""
echo "✅ Cluster initialization complete!"
echo ""
echo "Cluster details:"
echo " - Name: $CLUSTER_NAME"
echo " - VIP: $VIP"
echo " - Secrets: generated/secrets.yaml"
echo " - Base configs: generated/controlplane.yaml, generated/worker.yaml"
echo ""
echo "Talosctl context configured:"
talosctl config info
echo ""
echo "Next steps:"
echo "1. Register nodes with hardware detection:"
echo " ./detect-node-hardware.sh <maintenance-ip> <node-number>"
echo ""
echo "2. Generate machine configurations:"
echo " ./generate-machine-configs.sh"
echo ""
echo "3. Apply configurations to nodes"

View File

@@ -0,0 +1,23 @@
machine:
  install:
    disk: {{ index .cluster.nodes.active "{{NODE_NAME}}" "disk" }}
    image: factory.talos.dev/metal-installer/{{SCHEMATIC_ID}}:{{VERSION}}
  network:
    hostname: "{{NODE_NAME}}"
    interfaces:
      - interface: {{ index .cluster.nodes.active "{{NODE_NAME}}" "interface" }}
        dhcp: false
        addresses:
          - "{{NODE_IP}}/24"
        routes:
          - network: 0.0.0.0/0
            gateway: {{ .cloud.router.ip }}
        vip:
          ip: {{ .cluster.nodes.control.vip }}
# cluster:
#   discovery:
#     enabled: true
#     registries:
#       service:
#         disabled: true
#   allowSchedulingOnControlPlanes: true

View File

@@ -0,0 +1,23 @@
machine:
  install:
    disk: {{ index .cluster.nodes.active "{{NODE_NAME}}" "disk" }}
    image: factory.talos.dev/metal-installer/{{ .cluster.nodes.talos.schematicId}}:{{ .cluster.nodes.talos.version}}
  network:
    hostname: "{{NODE_NAME}}"
    interfaces:
      - interface: {{ index .cluster.nodes.active "{{NODE_NAME}}" "interface" }}
        dhcp: true
        addresses:
          - "{{NODE_IP}}/24"
        routes:
          - network: 0.0.0.0/0
            gateway: {{ .cloud.router.ip }}
  kubelet:
    extraMounts:
      - destination: /var/lib/longhorn
        type: bind
        source: /var/lib/longhorn
        options:
          - bind
          - rshared
          - rw

View File

@@ -0,0 +1,63 @@
# Talos Version to Schematic ID Mappings
#
# This file contains mappings of Talos versions to their corresponding
# default schematic IDs for wild-cloud deployments.
#
# Schematic IDs are generated from factory.talos.dev and include
# common system extensions needed for typical hardware.
#
# To add new versions:
# 1. Go to https://factory.talos.dev/
# 2. Select the system extensions you need
# 3. Generate the schematic
# 4. Add the version and schematic ID below
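#
# The schematic ID can also be generated without the web UI by POSTing a
# schematic definition to the Image Factory API (assumed workflow; verify the
# endpoint against the factory.talos.dev documentation), for example:
#
#   curl -X POST --data-binary @schematic.yaml https://factory.talos.dev/schematics
#
# The "id" field of the JSON response is the key to add below.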
# Format: Each schematic ID is the primary key with version and definition nested
"434a0300db532066f1098e05ac068159371d00f0aba0a3103a0e826e83825c82":
schematic:
customization:
systemExtensions:
officialExtensions:
- siderolabs/gvisor
- siderolabs/intel-ucode
- siderolabs/iscsi-tools
- siderolabs/util-linux-tools
"f309e674d9ad94655e2cf8a43ea1432475c717cd1885f596bd7ec852b900bc5b":
schematic:
customization:
systemExtensions:
officialExtensions:
- siderolabs/gvisor
- siderolabs/intel-ucode
- siderolabs/iscsi-tools
- siderolabs/nvidia-container-toolkit-lts
- siderolabs/nvidia-container-toolkit-production
- siderolabs/nvidia-fabricmanager-lts
- siderolabs/nvidia-fabricmanager-production
- siderolabs/nvidia-open-gpu-kernel-modules-lts
- siderolabs/nvidia-open-gpu-kernel-modules-production
- siderolabs/util-linux-tools"
"56774e0894c8a3a3a9834a2aea65f24163cacf9506abbcbdc3ba135eaca4953f":
schematic:
customization:
systemExtensions:
officialExtensions:
- siderolabs/gvisor
- siderolabs/intel-ucode
- siderolabs/iscsi-tools
- siderolabs/nvidia-container-toolkit-production
- siderolabs/nvidia-fabricmanager-production
- siderolabs/nvidia-open-gpu-kernel-modules-production
- siderolabs/util-linux-tools
"9ac1424dbdf4b964154a36780dbf2215bf17d2752cd0847fa3b81d7da761457f":
schematic:
customization:
systemExtensions:
officialExtensions:
- siderolabs/gvisor
- siderolabs/intel-ucode
- siderolabs/iscsi-tools
- siderolabs/nonfree-kmod-nvidia-production
- siderolabs/nvidia-container-toolkit-production
- siderolabs/nvidia-fabricmanager-production
- siderolabs/util-linux-tools

View File

@@ -0,0 +1,102 @@
# Wild Cloud Cluster Services
This setup creates a fully functional personal cloud infrastructure on a bare metal Kubernetes cluster that provides:
1. **External access** to services via configured domain names (using ${DOMAIN})
2. **Internal-only access** to admin interfaces (via internal.${DOMAIN} subdomains)
3. **Secure traffic routing** with automatic TLS
4. **Reliable networking** with proper load balancing
## Service Management
Wild Cloud uses a streamlined per-service setup approach:
**Primary Command**: `wild-service-setup <service> [options]`
- **Default**: Configure and deploy service using existing templates
- **`--fetch`**: Fetch fresh templates before setup (for updates)
- **`--no-deploy`**: Configure only, skip deployment (for planning)
**Master Orchestrator**: `wild-setup-services`
- Sets up all services in proper dependency order
- Each service validates its prerequisites before deployment
- Fail-fast approach with clear recovery instructions
## Architecture
```
Internet → External DNS → MetalLB LoadBalancer → Traefik → Kubernetes Services
Internal DNS
Internal Network
```
## Key Components
- **[MetalLB](metallb/README.md)** - Provides load balancing for bare metal clusters
- **[Traefik](traefik/README.md)** - Handles ingress traffic, TLS termination, and routing
- **[cert-manager](cert-manager/README.md)** - Manages TLS certificates
- **[CoreDNS](coredns/README.md)** - Provides DNS resolution for services
- **[ExternalDNS](externaldns/README.md)** - Automatic DNS record management
- **[Longhorn](longhorn/README.md)** - Distributed storage system for persistent volumes
- **[NFS](nfs/README.md)** - Network file system for shared media storage (optional)
- **[Kubernetes Dashboard](kubernetes-dashboard/README.md)** - Web UI for cluster management (accessible via https://dashboard.internal.${DOMAIN})
- **[Docker Registry](docker-registry/README.md)** - Private container registry for custom images
- **[Utils](utils/README.md)** - Cluster utilities and debugging tools
## Common Usage Patterns
### Complete Infrastructure Setup
```bash
# All services with fresh templates (recommended for first-time setup)
wild-setup-services --fetch
# All services using existing templates (fastest)
wild-setup-services
# Configure all services but don't deploy (for planning)
wild-setup-services --no-deploy
```
### Individual Service Management
```bash
# Most common - reconfigure and deploy existing service
wild-service-setup cert-manager
# Get fresh templates and deploy (for updates)
wild-service-setup cert-manager --fetch
# Configure only, don't deploy (for planning)
wild-service-setup cert-manager --no-deploy
# Fresh templates + configure + deploy
wild-service-setup cert-manager --fetch
```
### Service Dependencies
Services are automatically deployed in dependency order:
1. **metallb** → Load balancing foundation
2. **traefik** → Ingress (requires metallb)
3. **cert-manager** → TLS certificates (requires traefik)
4. **externaldns** → DNS automation (requires cert-manager)
5. **kubernetes-dashboard** → Admin UI (requires cert-manager)
Each service validates its dependencies before deployment.
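To confirm the foundation is healthy before moving further down the chain, check each deployment directly. A quick sketch, assuming the default namespaces and deployment names used by the setup scripts:
```bash
# Core services in dependency order (namespaces/names assumed from the setup scripts)
kubectl get deployment controller -n metallb-system
kubectl get deployment traefik -n traefik
kubectl get deployment cert-manager cert-manager-webhook -n cert-manager
kubectl get deployment external-dns -n externaldns
kubectl get deployment kubernetes-dashboard -n kubernetes-dashboard
```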
## Idempotent Design
All setup is designed to be idempotent and reliable:
- **Atomic Operations**: Each service handles its complete lifecycle
- **Dependency Validation**: Services check prerequisites before deployment
- **Error Recovery**: Failed services can be individually fixed and re-run
- **Safe Retries**: Operations can be repeated without harm
- **Incremental Updates**: Configuration changes applied cleanly
Example recovery from cert-manager failure:
```bash
# Fix the issue, then resume
wild-service-setup cert-manager --fetch
# Continue with remaining services
wild-service-setup externaldns --fetch
```

View File

@@ -0,0 +1,260 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
CERT_MANAGER_DIR="${CLUSTER_SETUP_DIR}/cert-manager"
echo "🔧 === Setting up cert-manager ==="
echo ""
#######################
# Dependencies
#######################
# Check Traefik dependency
echo "🔍 Verifying Traefik is ready (required for cert-manager)..."
kubectl wait --for=condition=Available deployment/traefik -n traefik --timeout=60s 2>/dev/null || {
echo "⚠️ Traefik not ready, but continuing with cert-manager installation"
echo "💡 Note: cert-manager may not work properly without Traefik"
}
if [ ! -d "${CERT_MANAGER_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${CERT_MANAGER_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
# Note: DNS validation and Cloudflare token setup moved to configuration phase
# The configuration should be set via: wild config set cluster.certManager.cloudflare.*
########################
# Kubernetes components
########################
echo "📦 Installing cert-manager components..."
# Using stable URL for cert-manager installation
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml || \
kubectl apply -f https://github.com/jetstack/cert-manager/releases/download/v1.13.1/cert-manager.yaml
# Wait for cert-manager to be ready
echo "⏳ Waiting for cert-manager to be ready..."
kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-cainjector -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s
# Create Cloudflare API token secret
# Read token from Wild Central secrets file
echo "🔐 Creating Cloudflare API token secret..."
SECRETS_FILE="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}/secrets.yaml"
CLOUDFLARE_API_TOKEN=$(yq '.cloudflare.token' "$SECRETS_FILE" 2>/dev/null)
CLOUDFLARE_API_TOKEN=$(echo "$CLOUDFLARE_API_TOKEN" | tr -d '"')
if [ -z "$CLOUDFLARE_API_TOKEN" ] || [ "$CLOUDFLARE_API_TOKEN" = "null" ]; then
echo "❌ ERROR: Cloudflare API token not found"
echo "💡 Please set: wild secret set cloudflare.token YOUR_TOKEN"
exit 1
fi
kubectl create secret generic cloudflare-api-token \
--namespace cert-manager \
--from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \
--dry-run=client -o yaml | kubectl apply -f -
# Ensure webhook is fully operational
echo "🔍 Verifying cert-manager webhook is fully operational..."
until kubectl get validatingwebhookconfigurations cert-manager-webhook &>/dev/null; do
echo "⏳ Waiting for cert-manager webhook to register..."
sleep 5
done
# Configure cert-manager to use external DNS for challenge verification
echo "🌐 Configuring cert-manager to use external DNS servers..."
kubectl patch deployment cert-manager -n cert-manager --patch '
spec:
template:
spec:
dnsPolicy: None
dnsConfig:
nameservers:
- "1.1.1.1"
- "8.8.8.8"
searches:
- cert-manager.svc.cluster.local
- svc.cluster.local
- cluster.local
options:
- name: ndots
value: "5"'
# Wait for cert-manager to restart with new DNS config
echo "⏳ Waiting for cert-manager to restart with new DNS configuration..."
kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
########################
# Create issuers and certificates
########################
# Apply Let's Encrypt issuers and certificates using kustomize
echo "🚀 Creating Let's Encrypt issuers and certificates..."
kubectl apply -k ${CERT_MANAGER_DIR}/kustomize
# Wait for issuers to be ready
echo "⏳ Waiting for Let's Encrypt issuers to be ready..."
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-prod --timeout=60s || echo "⚠️ Production issuer not ready, proceeding anyway..."
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-staging --timeout=60s || echo "⚠️ Staging issuer not ready, proceeding anyway..."
# Give cert-manager a moment to process the certificates
sleep 5
######################################
# Fix stuck certificates and cleanup
######################################
needs_restart=false
# STEP 1: Fix certificates stuck with 404 errors
echo "🔍 Checking for certificates with failed issuance attempts..."
stuck_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | \
jq -r '.items[] | select(.status.conditions[]? | select(.type=="Issuing" and .status=="False" and (.message | contains("404")))) | "\(.metadata.namespace) \(.metadata.name)"')
if [ -n "$stuck_certs" ]; then
echo "⚠️ Found certificates stuck with non-existent orders, recreating them..."
echo "$stuck_certs" | while read ns name; do
echo "🔄 Recreating certificate $ns/$name..."
cert_spec=$(kubectl get certificate "$name" -n "$ns" -o json | jq '.spec')
kubectl delete certificate "$name" -n "$ns"
echo "{\"apiVersion\":\"cert-manager.io/v1\",\"kind\":\"Certificate\",\"metadata\":{\"name\":\"$name\",\"namespace\":\"$ns\"},\"spec\":$cert_spec}" | kubectl apply -f -
done
needs_restart=true
sleep 5
else
echo "✅ No certificates stuck with failed orders"
fi
# STEP 2: Clean up orphaned orders
echo "🔍 Checking for orphaned ACME orders..."
orphaned_orders=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null | \
grep -E "failed to retrieve the ACME order.*404" 2>/dev/null | \
sed -n 's/.*resource_name="\([^"]*\)".*/\1/p' | \
sort -u || true)
if [ -n "$orphaned_orders" ]; then
echo "⚠️ Found orphaned ACME orders from logs"
for order in $orphaned_orders; do
echo "🗑️ Deleting orphaned order: $order"
orders_found=$(kubectl get orders --all-namespaces 2>/dev/null | grep "$order" 2>/dev/null || true)
if [ -n "$orders_found" ]; then
echo "$orders_found" | while read ns name rest; do
kubectl delete order "$name" -n "$ns" 2>/dev/null || true
done
fi
done
needs_restart=true
else
echo "✅ No orphaned orders found in logs"
fi
# STEP 2.5: Check for Cloudflare DNS cleanup errors
echo "🔍 Checking for Cloudflare DNS cleanup errors..."
cloudflare_errors=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null | \
grep -c "Error: 7003.*Could not route" 2>/dev/null || true)
cloudflare_errors=${cloudflare_errors:-0}
if [ "$cloudflare_errors" -gt "0" ]; then
echo "⚠️ Found $cloudflare_errors Cloudflare DNS cleanup errors (stale DNS record references)"
echo "💡 Deleting stuck challenges and orders to allow fresh start"
# Delete all challenges and orders in cert-manager namespace
kubectl delete challenges --all -n cert-manager 2>/dev/null || true
kubectl delete orders --all -n cert-manager 2>/dev/null || true
needs_restart=true
else
echo "✅ No Cloudflare DNS cleanup errors"
fi
# STEP 3: Single restart if anything needs cleaning
if [ "$needs_restart" = true ]; then
echo "🔄 Restarting cert-manager to clear internal state..."
kubectl rollout restart deployment cert-manager -n cert-manager
kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
echo "⏳ Waiting for cert-manager to recreate fresh challenges..."
sleep 15
else
echo "✅ No restart needed - cert-manager state is clean"
fi
#########################
# Final checks
#########################
# Wait for the certificates to be issued with progress feedback
echo "⏳ Waiting for wildcard certificates to be ready (this may take several minutes)..."
# Function to wait for certificate with progress output
wait_for_cert() {
local cert_name="$1"
local timeout=300
local elapsed=0
echo " 📜 Checking $cert_name..."
while [ $elapsed -lt $timeout ]; do
if kubectl get certificate "$cert_name" -n cert-manager -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q "True"; then
echo "$cert_name is ready"
return 0
fi
# Show progress every 30 seconds
if [ $((elapsed % 30)) -eq 0 ] && [ $elapsed -gt 0 ]; then
local status=$(kubectl get certificate "$cert_name" -n cert-manager -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "Waiting...")
echo " ⏳ Still waiting for $cert_name... ($elapsed/${timeout}s) - $status"
fi
sleep 5
elapsed=$((elapsed + 5))
done
echo " ⚠️ Timeout waiting for $cert_name (will continue anyway)"
return 1
}
wait_for_cert "wildcard-internal-wild-cloud"
wait_for_cert "wildcard-wild-cloud"
# Final health check
echo "🔍 Performing final cert-manager health check..."
failed_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Ready" and .status!="True")) | "\(.metadata.namespace)/\(.metadata.name)"' | wc -l)
if [ "$failed_certs" -gt 0 ]; then
echo "⚠️ Found $failed_certs certificates not in Ready state"
echo "💡 Check certificate status with: kubectl get certificates --all-namespaces"
echo "💡 Check cert-manager logs with: kubectl logs -n cert-manager deployment/cert-manager"
else
echo "✅ All certificates are in Ready state"
fi
echo ""
echo "✅ cert-manager setup complete!"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get certificates --all-namespaces"
echo " kubectl get clusterissuers"

View File

@@ -0,0 +1,19 @@
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: wildcard-internal-wild-cloud
  namespace: cert-manager
spec:
  secretName: wildcard-internal-wild-cloud-tls
  dnsNames:
    - "*.{{ .cloud.internalDomain }}"
    - "{{ .cloud.internalDomain }}"
  issuerRef:
    name: letsencrypt-prod
    kind: ClusterIssuer
  duration: 2160h # 90 days
  renewBefore: 360h # 15 days
  privateKey:
    algorithm: RSA
    size: 2048

View File

@@ -0,0 +1,12 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- letsencrypt-staging-dns01.yaml
- letsencrypt-prod-dns01.yaml
- internal-wildcard-certificate.yaml
- wildcard-certificate.yaml
# Note: cert-manager.yaml contains the main installation manifests
# but is applied separately via URL in the install script

View File

@@ -0,0 +1,25 @@
---
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt-prod
spec:
  acme:
    email: {{ .operator.email }}
    privateKeySecretRef:
      name: letsencrypt-prod
    server: https://acme-v02.api.letsencrypt.org/directory
    solvers:
      # DNS-01 solver for wildcard certificates
      - dns01:
          cloudflare:
            apiTokenSecretRef:
              name: cloudflare-api-token
              key: api-token
        selector:
          dnsZones:
            - "{{ .cluster.certManager.cloudflare.domain }}"
      # Keep the HTTP-01 solver for non-wildcard certificates
      - http01:
          ingress:
            class: traefik

View File

@@ -0,0 +1,25 @@
---
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt-staging
spec:
  acme:
    email: {{ .operator.email }}
    privateKeySecretRef:
      name: letsencrypt-staging
    server: https://acme-staging-v02.api.letsencrypt.org/directory
    solvers:
      # DNS-01 solver for wildcard certificates
      - dns01:
          cloudflare:
            apiTokenSecretRef:
              name: cloudflare-api-token
              key: api-token
        selector:
          dnsZones:
            - "{{ .cluster.certManager.cloudflare.domain }}"
      # Keep the HTTP-01 solver for non-wildcard certificates
      - http01:
          ingress:
            class: traefik

View File

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: cert-manager

View File

@@ -0,0 +1,19 @@
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: wildcard-wild-cloud
  namespace: cert-manager
spec:
  secretName: wildcard-wild-cloud-tls
  dnsNames:
    - "*.{{ .cloud.domain }}"
    - "{{ .cloud.domain }}"
  issuerRef:
    name: letsencrypt-prod
    kind: ClusterIssuer
  duration: 2160h # 90 days
  renewBefore: 360h # 15 days
  privateKey:
    algorithm: RSA
    size: 2048

View File

@@ -0,0 +1,25 @@
name: cert-manager
description: X.509 certificate management for Kubernetes
namespace: cert-manager
category: infrastructure
dependencies:
  - traefik
configReferences:
  - cloud.domain
  - cloud.baseDomain
  - cloud.internalDomain
  - operator.email
serviceConfig:
  cloudflareDomain:
    path: cluster.certManager.cloudflare.domain
    prompt: "Enter Cloudflare domain"
    default: "{{ .cloud.baseDomain }}"
    type: string
  cloudflareZoneID:
    path: cluster.certManager.cloudflare.zoneID
    prompt: "Enter Cloudflare zone ID"
    default: ""
    type: string

View File

@@ -0,0 +1,112 @@
#!/bin/bash
# Common functions for Wild Central service installation scripts
# TODO: We should use this. :P
# Ensure required environment variables are set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE environment variable is not set"
exit 1
fi
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA environment variable is not set"
exit 1
fi
# Get the instance directory path
get_instance_dir() {
echo "${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
}
# Get the secrets file path
get_secrets_file() {
echo "$(get_instance_dir)/secrets.yaml"
}
# Get the config file path
get_config_file() {
echo "$(get_instance_dir)/config.yaml"
}
# Get a secret value from the secrets file
# Usage: get_secret "path.to.secret"
get_secret() {
local path="$1"
local secrets_file="$(get_secrets_file)"
if [ ! -f "$secrets_file" ]; then
echo ""
return 1
fi
local value=$(yq ".$path" "$secrets_file" 2>/dev/null)
# Remove quotes and return empty string if null
value=$(echo "$value" | tr -d '"')
if [ "$value" = "null" ]; then
echo ""
return 1
fi
echo "$value"
}
# Get a config value from the config file
# Usage: get_config "path.to.config"
get_config() {
local path="$1"
local config_file="$(get_config_file)"
if [ ! -f "$config_file" ]; then
echo ""
return 1
fi
local value=$(yq ".$path" "$config_file" 2>/dev/null)
# Remove quotes and return empty string if null
value=$(echo "$value" | tr -d '"')
if [ "$value" = "null" ]; then
echo ""
return 1
fi
echo "$value"
}
# Check if a secret exists and is not empty
# Usage: require_secret "path.to.secret" "Friendly Name" "wild secret set command"
require_secret() {
local path="$1"
local name="$2"
local set_command="$3"
local value=$(get_secret "$path")
if [ -z "$value" ]; then
echo "❌ ERROR: $name not found"
echo "💡 Please set: $set_command"
exit 1
fi
echo "$value"
}
# Check if a config value exists and is not empty
# Usage: require_config "path.to.config" "Friendly Name" "wild config set command"
require_config() {
local path="$1"
local name="$2"
local set_command="$3"
local value=$(get_config "$path")
if [ -z "$value" ]; then
echo "❌ ERROR: $name not found"
echo "💡 Please set: $set_command"
exit 1
fi
echo "$value"
}

View File

@@ -0,0 +1,45 @@
# CoreDNS
- https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/
- https://github.com/kubernetes/dns/blob/master/docs/specification.md
- https://coredns.io/
CoreDNS runs the `kubernetes` plugin, so it returns all Kubernetes service endpoints in the well-known format.
All services and pods are registered in CoreDNS.
- <service-name>.<namespace>.svc.cluster.local
- <service-name>.<namespace>
- <service-name> (if in the same namespace)
- <pod-ipv4-address>.<namespace>.pod.cluster.local
- <pod-ipv4-address>.<service-name>.<namespace>.svc.cluster.local
Any query for a name under the `internal.$DOMAIN` domain is answered with the IP of the Traefik proxy. We expose the CoreDNS server on the LAN via MetalLB specifically for this capability.
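To spot-check this from a machine on the LAN, query the exposed CoreDNS service directly. A minimal sketch, assuming the service is published as `coredns` in `kube-system` and your internal domain is `internal.example.com`:
```bash
# Find the LoadBalancer IP MetalLB assigned to the exposed CoreDNS service
kubectl get svc -n kube-system coredns

# Any name under the internal domain should resolve to the Traefik proxy IP
dig +short test.internal.example.com @<coredns-loadbalancer-ip>
```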
## Default CoreDNS Configuration
This is the default CoreDNS configuration, for reference:
```txt
.:53 {
    errors
    health { lameduck 5s }
    ready
    log . { class error }
    prometheus :9153
    kubernetes cluster.local in-addr.arpa ip6.arpa {
        pods insecure
        fallthrough in-addr.arpa ip6.arpa
        ttl 30
    }
    forward . /etc/resolv.conf { max_concurrent 1000 }
    cache 30 {
        disable success cluster.local
        disable denial cluster.local
    }
    loop
    reload
    loadbalance
}
```

View File

@@ -0,0 +1,57 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
COREDNS_DIR="${CLUSTER_SETUP_DIR}/coredns"
echo "🔧 === Setting up CoreDNS ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled CoreDNS templates..."
if [ ! -d "${COREDNS_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${COREDNS_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
# Apply the custom DNS override
# TODO: Is this needed now that we are no longer on k3s?
echo "🚀 Applying CoreDNS custom override configuration..."
kubectl apply -f "${COREDNS_DIR}/kustomize/coredns-custom-config.yaml"
echo "🔄 Restarting CoreDNS pods to apply changes..."
kubectl rollout restart deployment/coredns -n kube-system
echo "⏳ Waiting for CoreDNS rollout to complete..."
kubectl rollout status deployment/coredns -n kube-system
echo ""
echo "✅ CoreDNS configured successfully"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get pods -n kube-system -l k8s-app=kube-dns"
echo " kubectl get svc -n kube-system coredns"
echo " kubectl describe svc -n kube-system coredns"
echo ""
echo "📋 To view CoreDNS logs:"
echo " kubectl logs -n kube-system -l k8s-app=kube-dns -f"

View File

@@ -0,0 +1,28 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: coredns-custom
  namespace: kube-system
data:
  # Custom server block for internal domains. All internal domains should
  # resolve to the cluster proxy.
  internal.server: |
    {{ .cloud.internalDomain }} {
      errors
      cache 30
      reload
      template IN A {
        match (.*)\.{{ .cloud.internalDomain | strings.ReplaceAll "." "\\." }}\.
        answer "{{`{{ .Name }}`}} 60 IN A {{ .cluster.loadBalancerIp }}"
      }
      template IN AAAA {
        match (.*)\.{{ .cloud.internalDomain | strings.ReplaceAll "." "\\." }}\.
        rcode NXDOMAIN
      }
    }
  # Custom override to set external resolvers.
  external.override: |
    forward . {{ .cloud.dns.externalResolver }} {
      max_concurrent 1000
    }

View File

@@ -0,0 +1,15 @@
name: coredns
description: DNS server for internal cluster DNS resolution
namespace: kube-system
category: infrastructure
configReferences:
  - cloud.internalDomain
  - cluster.loadBalancerIp
serviceConfig:
  externalResolver:
    path: cloud.dns.externalResolver
    prompt: "Enter external DNS resolver"
    default: "8.8.8.8"
    type: string

View File

@@ -0,0 +1,53 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
DOCKER_REGISTRY_DIR="${CLUSTER_SETUP_DIR}/docker-registry"
echo "🔧 === Setting up Docker Registry ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled Docker Registry templates..."
if [ ! -d "${DOCKER_REGISTRY_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${DOCKER_REGISTRY_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
echo "🚀 Deploying Docker Registry..."
kubectl apply -k "${DOCKER_REGISTRY_DIR}/kustomize"
echo "⏳ Waiting for Docker Registry to be ready..."
kubectl wait --for=condition=available --timeout=300s deployment/docker-registry -n docker-registry
echo ""
echo "✅ Docker Registry installed successfully"
echo ""
echo "📊 Deployment status:"
kubectl get pods -n docker-registry
kubectl get services -n docker-registry
echo ""
echo "💡 To use the registry:"
echo " docker tag myimage registry.local/myimage"
echo " docker push registry.local/myimage"

View File

@@ -0,0 +1,36 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docker-registry
  labels:
    app: docker-registry
spec:
  replicas: 1
  selector:
    matchLabels:
      app: docker-registry
  strategy:
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: docker-registry
    spec:
      containers:
        - image: registry:3.0.0
          name: docker-registry
          ports:
            - containerPort: 5000
              protocol: TCP
          volumeMounts:
            - mountPath: /var/lib/registry
              name: docker-registry-storage
              readOnly: false
      volumes:
        - name: docker-registry-storage
          persistentVolumeClaim:
            claimName: docker-registry-pvc

View File

@@ -0,0 +1,20 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: docker-registry
spec:
  rules:
    - host: {{ .cloud.dockerRegistryHost }}
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: docker-registry
                port:
                  number: 5000
  tls:
    - hosts:
        - {{ .cloud.dockerRegistryHost }}
      secretName: wildcard-internal-wild-cloud-tls

View File

@@ -0,0 +1,14 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: docker-registry
labels:
  - includeSelectors: true
    pairs:
      app: docker-registry
      managedBy: wild-cloud
resources:
  - deployment.yaml
  - ingress.yaml
  - service.yaml
  - namespace.yaml
  - pvc.yaml

View File

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: docker-registry

View File

@@ -0,0 +1,12 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: docker-registry-pvc
spec:
  storageClassName: longhorn
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  resources:
    requests:
      storage: {{ .cluster.dockerRegistry.storage }}

View File

@@ -0,0 +1,13 @@
---
apiVersion: v1
kind: Service
metadata:
  name: docker-registry
  labels:
    app: docker-registry
spec:
  ports:
    - port: 5000
      targetPort: 5000
  selector:
    app: docker-registry

View File

@@ -0,0 +1,20 @@
name: docker-registry
description: Private Docker image registry for cluster
namespace: docker-registry
category: infrastructure
dependencies:
  - traefik
  - cert-manager
serviceConfig:
  registryHost:
    path: cloud.dockerRegistryHost
    prompt: "Enter Docker Registry hostname"
    default: "registry.{{ .cloud.internalDomain }}"
    type: string
  storage:
    path: cluster.dockerRegistry.storage
    prompt: "Enter Docker Registry storage size"
    default: "100Gi"
    type: string

View File

@@ -0,0 +1,14 @@
# External DNS
See: https://github.com/kubernetes-sigs/external-dns
ExternalDNS keeps selected DNS zones (chosen via `--domain-filter`) synchronized with Ingresses, Services of `type=LoadBalancer`, and nodes across various DNS providers.
Currently, we are only configured to use Cloudflare.
Docs: https://github.com/kubernetes-sigs/external-dns/blob/master/docs/tutorials/cloudflare.md
Any Ingress that has a `metadata.annotations` entry of
`external-dns.alpha.kubernetes.io/hostname: <something>.${DOMAIN}`
will have its Cloudflare records created by ExternalDNS.
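For example, a minimal Ingress carrying the annotation might look like this (hostname and service names are illustrative):
```yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: example-app
  annotations:
    # ExternalDNS sees this and creates the matching Cloudflare record.
    external-dns.alpha.kubernetes.io/hostname: app.example.com
spec:
  rules:
    - host: app.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: example-app
                port:
                  number: 80
```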

View File

@@ -0,0 +1,79 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
EXTERNALDNS_DIR="${CLUSTER_SETUP_DIR}/externaldns"
echo "🌐 === Setting up ExternalDNS ==="
echo ""
# Check cert-manager dependency
echo "🔍 Verifying cert-manager is ready (required for ExternalDNS)..."
kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=60s 2>/dev/null && \
kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=60s 2>/dev/null || {
echo "⚠️ cert-manager not ready, but continuing with ExternalDNS installation"
echo "💡 Note: ExternalDNS may not work properly without cert-manager"
}
# Templates should already be compiled
echo "📦 Using pre-compiled ExternalDNS templates..."
if [ ! -d "${EXTERNALDNS_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${EXTERNALDNS_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
# Apply ExternalDNS manifests using kustomize
echo "🚀 Deploying ExternalDNS..."
kubectl apply -k ${EXTERNALDNS_DIR}/kustomize
# Setup Cloudflare API token secret
echo "🔐 Creating Cloudflare API token secret..."
SECRETS_FILE="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}/secrets.yaml"
CLOUDFLARE_API_TOKEN=$(yq '.cloudflare.token' "$SECRETS_FILE" 2>/dev/null | tr -d '"')
if [ -z "$CLOUDFLARE_API_TOKEN" ] || [ "$CLOUDFLARE_API_TOKEN" = "null" ]; then
echo "❌ ERROR: Cloudflare API token not found."
echo "💡 Please set: wild secret set cloudflare.token YOUR_TOKEN"
exit 1
fi
kubectl create secret generic cloudflare-api-token \
--namespace externaldns \
--from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \
--dry-run=client -o yaml | kubectl apply -f -
# Wait for ExternalDNS to be ready
echo "⏳ Waiting for Cloudflare ExternalDNS to be ready..."
kubectl rollout status deployment/external-dns -n externaldns --timeout=60s
# echo "⏳ Waiting for CoreDNS ExternalDNS to be ready..."
# kubectl rollout status deployment/external-dns-coredns -n externaldns --timeout=60s
echo ""
echo "✅ ExternalDNS installed successfully"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get pods -n externaldns"
echo " kubectl logs -n externaldns -l app=external-dns -f"
echo " kubectl logs -n externaldns -l app=external-dns-coredns -f"
echo ""

View File

@@ -0,0 +1,39 @@
---
# Cloudflare provider for ExternalDNS
apiVersion: apps/v1
kind: Deployment
metadata:
  name: external-dns
  namespace: externaldns
spec:
  selector:
    matchLabels:
      app: external-dns
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: external-dns
    spec:
      serviceAccountName: external-dns
      containers:
        - name: external-dns
          image: registry.k8s.io/external-dns/external-dns:v0.13.4
          args:
            - --source=service
            - --source=ingress
            - --txt-owner-id={{ .cluster.externalDns.ownerId }}
            - --provider=cloudflare
            - --domain-filter=payne.io
            #- --exclude-domains=internal.${DOMAIN}
            - --cloudflare-dns-records-per-page=5000
            - --publish-internal-services
            - --no-cloudflare-proxied
            - --log-level=debug
          env:
            - name: CF_API_TOKEN
              valueFrom:
                secretKeyRef:
                  name: cloudflare-api-token
                  key: api-token

View File

@@ -0,0 +1,35 @@
---
# Common RBAC resources for all ExternalDNS deployments
apiVersion: v1
kind: ServiceAccount
metadata:
  name: external-dns
  namespace: externaldns
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: external-dns
rules:
  - apiGroups: [""]
    resources: ["services", "endpoints", "pods"]
    verbs: ["get", "watch", "list"]
  - apiGroups: ["extensions", "networking.k8s.io"]
    resources: ["ingresses"]
    verbs: ["get", "watch", "list"]
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: external-dns-viewer
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: external-dns
subjects:
  - kind: ServiceAccount
    name: external-dns
    namespace: externaldns

View File

@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- externaldns-rbac.yaml
- externaldns-cloudflare.yaml

View File

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: externaldns

View File

@@ -0,0 +1,15 @@
name: externaldns
description: Automatically configures DNS records for services
namespace: externaldns
category: infrastructure
configReferences:
  - cloud.internalDomain
  - cluster.name
serviceConfig:
  ownerId:
    path: cluster.externalDns.ownerId
    prompt: "Enter ExternalDNS owner ID (unique identifier for this cluster)"
    default: "wild-cloud-{{ .cluster.name }}"
    type: string

View File

@@ -0,0 +1,91 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
KUBERNETES_DASHBOARD_DIR="${CLUSTER_SETUP_DIR}/kubernetes-dashboard"
echo "🎮 === Setting up Kubernetes Dashboard ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled Dashboard templates..."
if [ ! -d "${KUBERNETES_DASHBOARD_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${KUBERNETES_DASHBOARD_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
NAMESPACE="kubernetes-dashboard"
# Apply the official dashboard installation
echo "🚀 Installing Kubernetes Dashboard core components..."
kubectl apply -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.7.0/aio/deploy/recommended.yaml
# Wait for cert-manager certificates to be ready
echo "🔐 Waiting for cert-manager certificates to be ready..."
kubectl wait --for=condition=Ready certificate wildcard-internal-wild-cloud -n cert-manager --timeout=300s || echo "⚠️ Warning: Internal wildcard certificate not ready yet"
kubectl wait --for=condition=Ready certificate wildcard-wild-cloud -n cert-manager --timeout=300s || echo "⚠️ Warning: Wildcard certificate not ready yet"
# Copying cert-manager secrets to the dashboard namespace (if available)
echo "📋 Copying cert-manager secrets to dashboard namespace..."
if kubectl get secret wildcard-internal-wild-cloud-tls -n cert-manager >/dev/null 2>&1; then
kubectl get secret wildcard-internal-wild-cloud-tls -n cert-manager -o yaml | \
sed "s/namespace: cert-manager/namespace: ${NAMESPACE}/" | \
kubectl apply -f -
else
echo "⚠️ Warning: wildcard-internal-wild-cloud-tls secret not yet available"
fi
if kubectl get secret wildcard-wild-cloud-tls -n cert-manager >/dev/null 2>&1; then
kubectl get secret wildcard-wild-cloud-tls -n cert-manager -o yaml | \
sed "s/namespace: cert-manager/namespace: ${NAMESPACE}/" | \
kubectl apply -f -
else
echo "⚠️ Warning: wildcard-wild-cloud-tls secret not yet available"
fi
# Apply dashboard customizations using kustomize
echo "🔧 Applying dashboard customizations..."
kubectl apply -k "${KUBERNETES_DASHBOARD_DIR}/kustomize"
# Restart CoreDNS to pick up the changes
# echo "🔄 Restarting CoreDNS to pick up DNS changes..."
# kubectl delete pods -n kube-system -l k8s-app=kube-dns
# Wait for dashboard to be ready
echo "⏳ Waiting for Kubernetes Dashboard to be ready..."
kubectl rollout status deployment/kubernetes-dashboard -n $NAMESPACE --timeout=60s
echo ""
echo "✅ Kubernetes Dashboard installed successfully"
echo ""
# INTERNAL_DOMAIN should be available in environment (set from config before deployment)
if [ -n "${INTERNAL_DOMAIN}" ]; then
echo "🌐 Access the dashboard at: https://dashboard.${INTERNAL_DOMAIN}"
else
echo "💡 Access the dashboard via the configured internal domain"
fi
echo ""
echo "💡 To get the authentication token:"
echo " kubectl create token admin-user -n kubernetes-dashboard"
echo ""

View File

@@ -0,0 +1,32 @@
---
# Service Account and RBAC for Dashboard admin access
apiVersion: v1
kind: ServiceAccount
metadata:
  name: dashboard-admin
  namespace: kubernetes-dashboard
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: dashboard-admin
subjects:
  - kind: ServiceAccount
    name: dashboard-admin
    namespace: kubernetes-dashboard
roleRef:
  kind: ClusterRole
  name: cluster-admin
  apiGroup: rbac.authorization.k8s.io
---
# Token for dashboard-admin
apiVersion: v1
kind: Secret
metadata:
  name: dashboard-admin-token
  namespace: kubernetes-dashboard
  annotations:
    kubernetes.io/service-account.name: dashboard-admin
type: kubernetes.io/service-account-token

View File

@@ -0,0 +1,84 @@
---
# Internal-only middleware
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  name: internal-only
  namespace: kubernetes-dashboard
spec:
  ipWhiteList:
    # Restrict to local private network ranges
    sourceRange:
      - 127.0.0.1/32 # localhost
      - 10.0.0.0/8 # Private network
      - 172.16.0.0/12 # Private network
      - 192.168.0.0/16 # Private network
---
# HTTPS redirect middleware
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  name: dashboard-redirect-scheme
  namespace: kubernetes-dashboard
spec:
  redirectScheme:
    scheme: https
    permanent: true
---
# IngressRoute for Dashboard
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: kubernetes-dashboard-https
  namespace: kubernetes-dashboard
spec:
  entryPoints:
    - websecure
  routes:
    - match: Host(`dashboard.{{ .cloud.internalDomain }}`)
      kind: Rule
      middlewares:
        - name: internal-only
          namespace: kubernetes-dashboard
      services:
        - name: kubernetes-dashboard
          port: 443
          serversTransport: dashboard-transport
  tls:
    secretName: wildcard-internal-wild-cloud-tls
---
# HTTP to HTTPS redirect.
# FIXME: Is this needed?
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: kubernetes-dashboard-http
  namespace: kubernetes-dashboard
spec:
  entryPoints:
    - web
  routes:
    - match: Host(`dashboard.{{ .cloud.internalDomain }}`)
      kind: Rule
      middlewares:
        - name: dashboard-redirect-scheme
          namespace: kubernetes-dashboard
      services:
        - name: kubernetes-dashboard
          port: 443
          serversTransport: dashboard-transport
---
# ServersTransport for HTTPS backend with skip verify.
# FIXME: Is this needed?
apiVersion: traefik.io/v1alpha1
kind: ServersTransport
metadata:
  name: dashboard-transport
  namespace: kubernetes-dashboard
spec:
  insecureSkipVerify: true
  serverName: dashboard.{{ .cloud.internalDomain }}

View File

@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- dashboard-admin-rbac.yaml
- dashboard-kube-system.yaml

View File

@@ -0,0 +1,11 @@
name: kubernetes-dashboard
description: Web-based Kubernetes user interface
namespace: kubernetes-dashboard
category: infrastructure
dependencies:
- traefik
- cert-manager
configReferences:
- cloud.internalDomain

View File

@@ -0,0 +1,20 @@
# Longhorn Storage
See: [Longhorn Docs v 1.8.1](https://longhorn.io/docs/1.8.1/deploy/install/install-with-kubectl/)
## Installation Notes
- Manifest copied from https://raw.githubusercontent.com/longhorn/longhorn/v1.8.1/deploy/longhorn.yaml
- Using kustomize to apply custom configuration (see `kustomization.yaml`)
## Important Settings
- **Number of Replicas**: Set to 1 (default is 3) to accommodate smaller clusters
- This avoids "degraded" volumes when fewer than 3 nodes are available
- For production with 3+ nodes, consider changing back to 3 for better availability
## Common Operations
- View volumes: `kubectl get volumes.longhorn.io -n longhorn-system`
- Check volume status: `kubectl describe volumes.longhorn.io <volume-name> -n longhorn-system`
- Access Longhorn UI: Set up port-forwarding with `kubectl -n longhorn-system port-forward service/longhorn-frontend 8080:80`
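## Example PVC
Applications request Longhorn-backed storage with a standard PVC using `storageClassName: longhorn`. A minimal sketch (name and size are illustrative):
```yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: example-data
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: longhorn
  resources:
    requests:
      storage: 10Gi
```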

View File

@@ -0,0 +1,52 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
LONGHORN_DIR="${CLUSTER_SETUP_DIR}/longhorn"
echo "🔧 === Setting up Longhorn ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled Longhorn templates..."
if [ ! -d "${LONGHORN_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${LONGHORN_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
echo "🚀 Deploying Longhorn..."
kubectl apply -k ${LONGHORN_DIR}/kustomize/
echo "⏳ Waiting for Longhorn to be ready..."
kubectl wait --for=condition=available --timeout=300s deployment/longhorn-driver-deployer -n longhorn-system || true
echo ""
echo "✅ Longhorn installed successfully"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get pods -n longhorn-system"
echo " kubectl get storageclass"
echo ""
echo "🌐 To access the Longhorn UI:"
echo " kubectl port-forward -n longhorn-system svc/longhorn-frontend 8080:80"

View File

@@ -0,0 +1,21 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: longhorn-ingress
  namespace: longhorn-system
spec:
  rules:
    - host: "longhorn.{{ .cloud.internalDomain }}"
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: longhorn-frontend
                port:
                  number: 80
  tls:
    - secretName: wildcard-internal-wild-cloud-tls
      hosts:
        - "longhorn.{{ .cloud.internalDomain }}"

View File

@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- longhorn.yaml
- ingress.yaml

File diff suppressed because it is too large

View File

@@ -0,0 +1,7 @@
name: longhorn
description: Cloud-native distributed block storage for Kubernetes
namespace: longhorn-system
category: infrastructure
dependencies:
- traefik

View File

@@ -0,0 +1,56 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
METALLB_DIR="${CLUSTER_SETUP_DIR}/metallb"
echo "🔧 === Setting up MetalLB ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled MetalLB templates..."
if [ ! -d "${METALLB_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${METALLB_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
echo "🚀 Deploying MetalLB installation..."
kubectl apply -k ${METALLB_DIR}/kustomize/installation
echo "⏳ Waiting for MetalLB controller to be ready..."
kubectl wait --for=condition=Available deployment/controller -n metallb-system --timeout=60s
echo "⏳ Extra buffer for webhook initialization..."
sleep 10
echo "⚙️ Applying MetalLB configuration..."
kubectl apply -k ${METALLB_DIR}/kustomize/configuration
echo ""
echo "✅ MetalLB installed and configured successfully"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get pods -n metallb-system"
echo " kubectl get ipaddresspools.metallb.io -n metallb-system"
echo ""
echo "🌐 MetalLB will now provide LoadBalancer IPs for your services"

View File

@@ -0,0 +1,3 @@
namespace: metallb-system
resources:
- pool.yaml

View File

@@ -0,0 +1,19 @@
---
apiVersion: metallb.io/v1beta1
kind: IPAddressPool
metadata:
  name: first-pool
  namespace: metallb-system
spec:
  addresses:
    - {{ .cluster.ipAddressPool }}
---
apiVersion: metallb.io/v1beta1
kind: L2Advertisement
metadata:
  name: l2-advertisement
  namespace: metallb-system
spec:
  ipAddressPools:
    - first-pool

View File

@@ -0,0 +1,3 @@
namespace: metallb-system
resources:
- github.com/metallb/metallb/config/native?ref=v0.15.0

View File

@@ -0,0 +1,19 @@
name: metallb
description: Bare metal load-balancer for Kubernetes
namespace: metallb-system
category: infrastructure
configReferences:
  - cluster.name
serviceConfig:
  ipRange:
    path: cluster.ipAddressPool
    prompt: "Enter IP range for MetalLB (e.g., 192.168.1.240-192.168.1.250)"
    default: "192.168.1.240-192.168.1.250"
    type: string
  loadBalancerIp:
    path: cluster.loadBalancerIp
    prompt: "Enter primary load balancer IP"
    default: "192.168.1.240"
    type: string

View File

@@ -0,0 +1,60 @@
# NFS Setup (Optional)
The infrastructure supports optional NFS (Network File System) for shared media storage across the cluster. If your config.yaml contains the `cloud.nfs` section, the NFS server will be set up automatically.
## Host Setup
First, set up the NFS server on your chosen host.
```bash
./setup-nfs-host.sh <host> <media-path>
```
Example:
```bash
./setup-nfs-host.sh box-01 /srv/nfs
```
## Cluster Integration
Add to your `config.yaml`:
```yaml
cloud:
  nfs:
    host: box-01
    mediaPath: /srv/nfs
    storageCapacity: 250Gi # Max size for PersistentVolume
```
Now you can run the NFS cluster setup:
```bash
setup/setup-nfs-host.sh
```
## Features
- Automatic IP detection - Uses network IP even when hostname resolves to localhost
- Cluster-wide access - Any pod can mount the NFS share regardless of node placement
- Configurable capacity - Set PersistentVolume size via `NFS_STORAGE_CAPACITY`
- ReadWriteMany - Multiple pods can simultaneously access the same storage
## Usage
Applications can use NFS storage by setting `storageClassName: nfs` in their PVCs:
```yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: media-pvc
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: nfs
  resources:
    requests:
      storage: 100Gi
```
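A pod then mounts the claim like any other volume; a minimal sketch (names are illustrative):
```yaml
apiVersion: v1
kind: Pod
metadata:
  name: media-reader
spec:
  containers:
    - name: app
      image: busybox:1.36
      command: ["sh", "-c", "sleep 3600"]
      volumeMounts:
        - name: media
          mountPath: /media
  volumes:
    - name: media
      persistentVolumeClaim:
        claimName: media-pvc
```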

View File

@@ -0,0 +1,255 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CONFIG_FILE="${INSTANCE_DIR}/config.yaml"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
NFS_DIR="${CLUSTER_SETUP_DIR}/nfs"
echo "💾 === Registering NFS Server with Kubernetes Cluster ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled NFS templates..."
if [ ! -d "${NFS_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${NFS_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
NFS_HOST="$(yq '.cloud.nfs.host' "${CONFIG_FILE}" 2>/dev/null | tr -d '"')"
NFS_MEDIA_PATH="$(yq '.cloud.nfs.mediaPath' "${CONFIG_FILE}" 2>/dev/null | tr -d '"')"
NFS_STORAGE_CAPACITY="$(yq '.cloud.nfs.storageCapacity' "${CONFIG_FILE}" 2>/dev/null | tr -d '"')"
echo "📋 NFS Configuration:"
echo " Host: ${NFS_HOST}"
echo " Media path: ${NFS_MEDIA_PATH}"
echo " Storage capacity: ${NFS_STORAGE_CAPACITY}"
echo ""
# Validate required config values
if [ -z "${NFS_HOST}" ] || [ "${NFS_HOST}" = "null" ]; then
echo "❌ ERROR: cloud.nfs.host not set in config"
exit 1
fi
if [ -z "${NFS_MEDIA_PATH}" ] || [ "${NFS_MEDIA_PATH}" = "null" ]; then
echo "❌ ERROR: cloud.nfs.mediaPath not set in config"
exit 1
fi
if [ -z "${NFS_STORAGE_CAPACITY}" ] || [ "${NFS_STORAGE_CAPACITY}" = "null" ]; then
echo "❌ ERROR: cloud.nfs.storageCapacity not set in config"
exit 1
fi
# Function to resolve NFS host to IP
resolve_nfs_host() {
echo "🌐 Resolving NFS host: ${NFS_HOST}"
if [[ "${NFS_HOST}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
# NFS_HOST is already an IP address
NFS_HOST_IP="${NFS_HOST}"
echo " Host is already an IP address"
else
# Resolve hostname to IP
echo " 🔍 Looking up hostname..."
NFS_HOST_IP=$(getent hosts "${NFS_HOST}" 2>/dev/null | awk '{print $1}' | head -n1 || true)
echo " 📍 Resolved to: ${NFS_HOST_IP}"
if [[ -z "${NFS_HOST_IP}" ]]; then
echo "❌ ERROR: Unable to resolve hostname ${NFS_HOST} to IP address"
echo "💡 Make sure ${NFS_HOST} is resolvable from this cluster"
exit 1
fi
# Check if resolved IP is localhost - auto-detect network IP instead
if [[ "${NFS_HOST_IP}" =~ ^127\. ]]; then
echo "⚠️ Warning: ${NFS_HOST} resolves to localhost (${NFS_HOST_IP})"
echo "🔍 Auto-detecting network IP for cluster access..."
# Try to find the primary network interface IP (exclude docker/k8s networks)
local network_ip=$(ip route get 8.8.8.8 | grep -oP 'src \K\S+' 2>/dev/null)
if [[ -n "${network_ip}" && ! "${network_ip}" =~ ^127\. ]]; then
echo "✅ Using detected network IP: ${network_ip}"
NFS_HOST_IP="${network_ip}"
else
echo "❌ Could not auto-detect network IP. Available IPs:"
ip addr show | grep "inet " | grep -v "127.0.0.1" | grep -v "10.42" | grep -v "172." | awk '{print " " $2}' | cut -d/ -f1
echo "💡 Please set NFS_HOST to the correct IP address manually."
exit 1
fi
fi
fi
echo "🌐 NFS server IP: ${NFS_HOST_IP}"
export NFS_HOST_IP
}
# Function to test NFS accessibility
test_nfs_accessibility() {
echo ""
echo "🔍 Testing NFS accessibility from cluster..."
# Check if showmount is available
if ! command -v showmount >/dev/null 2>&1; then
echo "📦 Installing NFS client tools..."
if command -v apt-get >/dev/null 2>&1; then
sudo apt-get update && sudo apt-get install -y nfs-common
elif command -v yum >/dev/null 2>&1; then
sudo yum install -y nfs-utils
elif command -v dnf >/dev/null 2>&1; then
sudo dnf install -y nfs-utils
else
echo "⚠️ Warning: Unable to install NFS client tools. Skipping accessibility test."
return 0
fi
fi
# Test if we can reach the NFS server
echo "🌐 Testing connection to NFS server..."
if timeout 10 showmount -e "${NFS_HOST_IP}" >/dev/null 2>&1; then
echo "✅ NFS server is accessible"
echo "📋 Available exports:"
showmount -e "${NFS_HOST_IP}"
else
echo "❌ Cannot connect to NFS server at ${NFS_HOST_IP}"
echo "💡 Make sure:"
echo " 1. NFS server is running on ${NFS_HOST}"
echo " 2. Network connectivity exists between cluster and NFS host"
echo " 3. Firewall allows NFS traffic (port 2049)"
exit 1
fi
# Test specific export
if showmount -e "${NFS_HOST_IP}" | grep -q "${NFS_MEDIA_PATH}"; then
echo "✅ Media path ${NFS_MEDIA_PATH} is exported"
else
echo "❌ Media path ${NFS_MEDIA_PATH} is not found in exports"
echo "📋 Available exports:"
showmount -e "${NFS_HOST_IP}"
echo ""
echo "💡 Run setup-nfs-host.sh on ${NFS_HOST} to configure the export"
exit 1
fi
}
# Function to create test mount
test_nfs_mount() {
echo ""
echo "🔧 Testing NFS mount functionality..."
local test_mount="/tmp/nfs-test-$$"
mkdir -p "${test_mount}"
# Try to mount the NFS export
if timeout 30 sudo mount -t nfs4 "${NFS_HOST_IP}:${NFS_MEDIA_PATH}" "${test_mount}"; then
echo "✅ NFS mount successful"
# Test read access
if ls "${test_mount}" >/dev/null 2>&1; then
echo "✅ NFS read access working"
else
echo "❌ NFS read access failed"
fi
# Unmount
sudo umount "${test_mount}" || echo "⚠️ Warning: Failed to unmount test directory"
else
echo "❌ NFS mount failed"
echo "💡 Check NFS server configuration and network connectivity"
exit 1
fi
# Clean up
rmdir "${test_mount}" 2>/dev/null || true
}
# Function to create Kubernetes resources
create_k8s_resources() {
echo ""
echo "🚀 Creating Kubernetes NFS resources..."
# Apply the NFS Kubernetes manifests using kustomize (templates already processed)
echo "📦 Applying NFS manifests..."
kubectl apply -k "${NFS_DIR}/kustomize"
echo "✅ NFS PersistentVolume and StorageClass created"
# Verify resources were created
echo "🔍 Verifying Kubernetes resources..."
if kubectl get storageclass nfs >/dev/null 2>&1; then
echo "✅ StorageClass 'nfs' created"
else
echo "❌ StorageClass 'nfs' not found"
exit 1
fi
if kubectl get pv nfs-media-pv >/dev/null 2>&1; then
echo "✅ PersistentVolume 'nfs-media-pv' created"
kubectl get pv nfs-media-pv
else
echo "❌ PersistentVolume 'nfs-media-pv' not found"
exit 1
fi
}
# Function to show usage instructions
show_usage_instructions() {
echo ""
echo "✅ === NFS Kubernetes Setup Complete ==="
echo ""
echo "💾 NFS server ${NFS_HOST} (${NFS_HOST_IP}) has been registered with the cluster"
echo ""
echo "📋 Kubernetes resources created:"
echo " - StorageClass: nfs"
echo " - PersistentVolume: nfs-media-pv (${NFS_STORAGE_CAPACITY}, ReadWriteMany)"
echo ""
echo "💡 To use NFS storage in your applications:"
echo " 1. Set storageClassName: nfs in your PVC"
echo " 2. Use accessMode: ReadWriteMany for shared access"
echo ""
echo "📝 Example PVC:"
echo "---"
echo "apiVersion: v1"
echo "kind: PersistentVolumeClaim"
echo "metadata:"
echo " name: my-nfs-pvc"
echo "spec:"
echo " accessModes:"
echo " - ReadWriteMany"
echo " storageClassName: nfs"
echo " resources:"
echo " requests:"
echo " storage: 10Gi"
echo ""
}
# Main execution
main() {
resolve_nfs_host
test_nfs_accessibility
test_nfs_mount
create_k8s_resources
show_usage_instructions
}
# Run main function
echo "🔧 Starting NFS setup process..."
main "$@"

View File

@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- persistent-volume.yaml
- storage-class.yaml

View File

@@ -0,0 +1,23 @@
apiVersion: v1
kind: PersistentVolume
metadata:
name: nfs-media-pv
labels:
storage: nfs-media
spec:
capacity:
storage: {{ .cloud.nfs.storageCapacity }}
accessModes:
- ReadWriteMany
persistentVolumeReclaimPolicy: Retain
storageClassName: nfs
nfs:
server: {{ .cloud.nfs.host }}
path: {{ .cloud.nfs.mediaPath }}
mountOptions:
- nfsvers=4.1
- rsize=1048576
- wsize=1048576
- hard
- intr
- timeo=600

View File

@@ -0,0 +1,10 @@
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: nfs
provisioner: nfs
parameters:
server: {{ .cloud.nfs.host }}
path: {{ .cloud.nfs.mediaPath }}
reclaimPolicy: Retain
allowVolumeExpansion: true

View File

@@ -0,0 +1,306 @@
#!/bin/bash
set -e
set -o pipefail
# Navigate to script directory
SCRIPT_PATH="$(realpath "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(dirname "$SCRIPT_PATH")"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
usage() {
echo "Usage: setup-nfs-host.sh [server] [media-path] [options]"
echo ""
echo "Set up NFS server on the specified host."
echo ""
echo "Examples:"
echo " setup-nfs-host.sh box-01 /data/media"
echo ""
echo "Options:"
echo " -h, --help Show this help message"
echo " -e, --export-options Set the NFS export options"
}
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
-h|--help)
usage
exit 0
;;
-e|--export-options)
if [[ -z "$2" ]]; then
echo "Error: --export-options requires a value"
exit 1
else
NFS_EXPORT_OPTIONS="$2"
fi
shift 2
;;
-*)
echo "Unknown option $1"
usage
exit 1
;;
*)
# First non-option argument is server
if [[ -z "$NFS_HOST" ]]; then
export NFS_HOST="$1"
# Second non-option argument is media path
elif [[ -z "$NFS_MEDIA_PATH" ]]; then
export NFS_MEDIA_PATH="$1"
else
echo "Too many arguments"
usage
exit 1
fi
shift
;;
esac
done
echo "Setting up NFS server on this host..."
# Check if required NFS variables are configured
if [[ -z "${NFS_HOST}" ]]; then
echo "NFS_HOST not set. Please set NFS_HOST=<hostname> in your environment"
echo "Example: export NFS_HOST=box-01"
exit 1
fi
# Ensure NFS_MEDIA_PATH is explicitly set
if [[ -z "${NFS_MEDIA_PATH}" ]]; then
echo "Error: NFS_MEDIA_PATH not set. Please set it in your environment"
echo "Example: export NFS_MEDIA_PATH=/data/media"
exit 1
fi
# Set default for NFS_EXPORT_OPTIONS if not already set
if [[ -z "${NFS_EXPORT_OPTIONS}" ]]; then
export NFS_EXPORT_OPTIONS="*(rw,sync,no_subtree_check,no_root_squash)"
echo "Using default NFS_EXPORT_OPTIONS: ${NFS_EXPORT_OPTIONS}"
fi
echo "Target NFS host: ${NFS_HOST}"
echo "Media path: ${NFS_MEDIA_PATH}"
echo "Export options: ${NFS_EXPORT_OPTIONS}"
# Function to check if we're running on the correct host
check_host() {
local current_hostname=$(hostname)
if [[ "${current_hostname}" != "${NFS_HOST}" ]]; then
echo "Warning: Current host (${current_hostname}) differs from NFS_HOST (${NFS_HOST})"
echo "This script should be run on ${NFS_HOST}"
read -p "Continue anyway? (y/N): " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
exit 1
fi
fi
}
# Function to install NFS server and SMB/CIFS
install_nfs_server() {
echo "Installing NFS server and SMB/CIFS packages..."
# Detect package manager and install NFS server + Samba
if command -v apt-get >/dev/null 2>&1; then
# Debian/Ubuntu
sudo apt-get update
sudo apt-get install -y nfs-kernel-server nfs-common samba samba-common-bin
elif command -v yum >/dev/null 2>&1; then
# RHEL/CentOS
sudo yum install -y nfs-utils samba samba-client
elif command -v dnf >/dev/null 2>&1; then
# Fedora
sudo dnf install -y nfs-utils samba samba-client
else
echo "Error: Unable to detect package manager. Please install NFS server and Samba manually."
exit 1
fi
}
# Function to create media directory
create_media_directory() {
echo "Creating media directory: ${NFS_MEDIA_PATH}"
# Create directory if it doesn't exist
sudo mkdir -p "${NFS_MEDIA_PATH}"
# Set appropriate permissions
# Using 755 for directory, allowing read/execute for all, write for owner
sudo chmod 755 "${NFS_MEDIA_PATH}"
echo "Media directory created with appropriate permissions"
echo "Directory info:"
ls -la "${NFS_MEDIA_PATH}/"
}
# Function to configure NFS exports
configure_nfs_exports() {
echo "Configuring NFS exports..."
local export_line="${NFS_MEDIA_PATH} ${NFS_EXPORT_OPTIONS}"
local exports_file="/etc/exports"
# Backup existing exports file
sudo cp "${exports_file}" "${exports_file}.backup.$(date +%Y%m%d-%H%M%S)" 2>/dev/null || true
# Check if export already exists
if sudo grep -q "^${NFS_MEDIA_PATH}" "${exports_file}" 2>/dev/null; then
echo "Export for ${NFS_MEDIA_PATH} already exists, updating..."
sudo sed -i "s|^${NFS_MEDIA_PATH}.*|${export_line}|" "${exports_file}"
else
echo "Adding new export for ${NFS_MEDIA_PATH}..."
echo "${export_line}" | sudo tee -a "${exports_file}"
fi
# Export the filesystems
sudo exportfs -rav
echo "NFS exports configured:"
sudo exportfs -v
}
# Function to start and enable NFS services
start_nfs_services() {
echo "Starting NFS services..."
# Start and enable NFS server
sudo systemctl enable nfs-server
sudo systemctl start nfs-server
# Also enable related services
sudo systemctl enable rpcbind
sudo systemctl start rpcbind
echo "NFS services started and enabled"
# Show service status
sudo systemctl status nfs-server --no-pager --lines=5
}
# Function to configure SMB/CIFS sharing
configure_smb_sharing() {
echo "Configuring SMB/CIFS sharing..."
local smb_config="/etc/samba/smb.conf"
local share_name="media"
# Backup existing config
sudo cp "${smb_config}" "${smb_config}.backup.$(date +%Y%m%d-%H%M%S)" 2>/dev/null || true
# Check if share already exists
if sudo grep -q "^\[${share_name}\]" "${smb_config}" 2>/dev/null; then
echo "SMB share '${share_name}' already exists, updating..."
# Remove existing share section
sudo sed -i "/^\[${share_name}\]/,/^\[/{ /^\[${share_name}\]/d; /^\[/!d; }" "${smb_config}"
fi
# Add media share configuration
cat << EOF | sudo tee -a "${smb_config}"
[${share_name}]
comment = Media files for Wild Cloud
path = ${NFS_MEDIA_PATH}
browseable = yes
read only = no
guest ok = yes
create mask = 0664
directory mask = 0775
force user = $(whoami)
    force group = $(id -gn)
EOF
echo "SMB share configuration added"
# Test configuration
if sudo testparm -s >/dev/null 2>&1; then
echo "✓ SMB configuration is valid"
else
echo "✗ SMB configuration has errors"
sudo testparm
exit 1
fi
}
# Function to start SMB services
start_smb_services() {
echo "Starting SMB services..."
# Enable and start Samba services
sudo systemctl enable smbd
sudo systemctl start smbd
sudo systemctl enable nmbd
sudo systemctl start nmbd
echo "SMB services started and enabled"
# Show service status
sudo systemctl status smbd --no-pager --lines=3
}
# Function to test NFS setup
test_nfs_setup() {
echo "Testing NFS setup..."
# Test if NFS is responding
if command -v showmount >/dev/null 2>&1; then
echo "Available NFS exports:"
showmount -e localhost || echo "Warning: showmount failed, but NFS may still be working"
fi
# Check if the export directory is accessible
if [[ -d "${NFS_MEDIA_PATH}" ]]; then
echo "✓ Media directory exists and is accessible"
else
echo "✗ Media directory not accessible"
exit 1
fi
}
# Function to show usage instructions
show_usage_instructions() {
echo
echo "=== NFS/SMB Host Setup Complete ==="
echo
echo "NFS and SMB servers are now running on this host with media directory: ${NFS_MEDIA_PATH}"
echo
echo "Access methods:"
echo "1. NFS (for Kubernetes): Use setup-nfs-k8s.sh to register with cluster"
echo "2. SMB/CIFS (for Windows): \\\\${NFS_HOST}\\media"
echo
echo "To add media files:"
echo "- Copy directly to: ${NFS_MEDIA_PATH}"
echo "- Or mount SMB share from Windows and copy there"
echo
echo "Windows SMB mount:"
echo "- Open File Explorer"
echo "- Map network drive to: \\\\${NFS_HOST}\\media"
echo "- Or use: \\\\$(hostname -I | awk '{print $1}')\\media"
echo
echo "To verify services:"
echo "- NFS: showmount -e ${NFS_HOST}"
echo "- SMB: smbclient -L ${NFS_HOST} -N"
echo "- Status: systemctl status nfs-server smbd"
echo
echo "Current NFS exports:"
sudo exportfs -v
echo
}
# Main execution
main() {
check_host
install_nfs_server
create_media_directory
configure_nfs_exports
start_nfs_services
configure_smb_sharing
start_smb_services
test_nfs_setup
show_usage_instructions
}
# Run main function
main "$@"

View File

@@ -0,0 +1,21 @@
name: nfs
description: NFS client provisioner for external NFS storage
namespace: nfs-system
category: infrastructure
serviceConfig:
nfsHost:
path: cloud.nfs.host
prompt: "Enter NFS server hostname or IP address"
default: "192.168.1.100"
type: string
mediaPath:
path: cloud.nfs.mediaPath
prompt: "Enter NFS export path for media storage"
default: "/mnt/storage/media"
type: string
storageCapacity:
path: cloud.nfs.storageCapacity
prompt: "Enter NFS storage capacity (e.g., 1Ti, 500Gi)"
default: "1Ti"
type: string

View File

@@ -0,0 +1,52 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
NFD_DIR="${CLUSTER_SETUP_DIR}/node-feature-discovery"
echo "🔧 === Setting up Node Feature Discovery ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled Node Feature Discovery templates..."
if [ ! -d "${NFD_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${NFD_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
echo "🚀 Deploying Node Feature Discovery..."
kubectl apply -k "${NFD_DIR}/kustomize"
echo "⏳ Waiting for Node Feature Discovery DaemonSet to be ready..."
kubectl rollout status daemonset/node-feature-discovery-worker -n node-feature-discovery --timeout=300s
echo ""
echo "✅ Node Feature Discovery installed successfully"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get pods -n node-feature-discovery"
echo " kubectl get nodes --show-labels | grep feature.node.kubernetes.io"
echo ""
echo "🎮 GPU nodes should now be labeled with GPU device information:"
echo " kubectl get nodes --show-labels | grep pci-10de"

View File

@@ -0,0 +1,711 @@
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
name: nodefeatures.nfd.k8s-sigs.io
spec:
group: nfd.k8s-sigs.io
names:
kind: NodeFeature
listKind: NodeFeatureList
plural: nodefeatures
singular: nodefeature
scope: Namespaced
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: |-
NodeFeature resource holds the features discovered for one node in the
cluster.
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Specification of the NodeFeature, containing features discovered
for a node.
properties:
features:
description: Features is the full "raw" features data that has been
discovered.
properties:
attributes:
additionalProperties:
description: AttributeFeatureSet is a set of features having
string value.
properties:
elements:
additionalProperties:
type: string
description: Individual features of the feature set.
type: object
required:
- elements
type: object
description: Attributes contains all the attribute-type features
of the node.
type: object
flags:
additionalProperties:
description: FlagFeatureSet is a set of simple features only
containing names without values.
properties:
elements:
additionalProperties:
description: |-
Nil is a dummy empty struct for protobuf compatibility.
NOTE: protobuf definitions have been removed but this is kept for API compatibility.
type: object
description: Individual features of the feature set.
type: object
required:
- elements
type: object
description: Flags contains all the flag-type features of the
node.
type: object
instances:
additionalProperties:
description: InstanceFeatureSet is a set of features each of
which is an instance having multiple attributes.
properties:
elements:
description: Individual features of the feature set.
items:
description: InstanceFeature represents one instance of
a complex features, e.g. a device.
properties:
attributes:
additionalProperties:
type: string
description: Attributes of the instance feature.
type: object
required:
- attributes
type: object
type: array
required:
- elements
type: object
description: Instances contains all the instance-type features
of the node.
type: object
type: object
labels:
additionalProperties:
type: string
description: Labels is the set of node labels that are requested to
be created.
type: object
type: object
required:
- spec
type: object
served: true
storage: true
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
name: nodefeaturegroups.nfd.k8s-sigs.io
spec:
group: nfd.k8s-sigs.io
names:
kind: NodeFeatureGroup
listKind: NodeFeatureGroupList
plural: nodefeaturegroups
shortNames:
- nfg
singular: nodefeaturegroup
scope: Namespaced
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: NodeFeatureGroup resource holds Node pools by featureGroup
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Spec defines the rules to be evaluated.
properties:
featureGroupRules:
description: List of rules to evaluate to determine nodes that belong
in this group.
items:
description: GroupRule defines a rule for nodegroup filtering.
properties:
matchAny:
description: MatchAny specifies a list of matchers one of which
must match.
items:
description: MatchAnyElem specifies one sub-matcher of MatchAny.
properties:
matchFeatures:
description: MatchFeatures specifies a set of matcher
terms all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature
set to match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
required:
- matchFeatures
type: object
type: array
matchFeatures:
description: MatchFeatures specifies a set of matcher terms
all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature set to
match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
name:
description: Name of the rule.
type: string
required:
- name
type: object
type: array
required:
- featureGroupRules
type: object
status:
description: |-
Status of the NodeFeatureGroup after the most recent evaluation of the
specification.
properties:
nodes:
description: Nodes is a list of FeatureGroupNode in the cluster that
match the featureGroupRules
items:
properties:
name:
description: Name of the node.
type: string
required:
- name
type: object
type: array
x-kubernetes-list-map-keys:
- name
x-kubernetes-list-type: map
type: object
required:
- spec
type: object
served: true
storage: true
subresources:
status: {}
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
name: nodefeaturerules.nfd.k8s-sigs.io
spec:
group: nfd.k8s-sigs.io
names:
kind: NodeFeatureRule
listKind: NodeFeatureRuleList
plural: nodefeaturerules
shortNames:
- nfr
singular: nodefeaturerule
scope: Cluster
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: |-
NodeFeatureRule resource specifies a configuration for feature-based
customization of node objects, such as node labeling.
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Spec defines the rules to be evaluated.
properties:
rules:
description: Rules is a list of node customization rules.
items:
description: Rule defines a rule for node customization such as
labeling.
properties:
annotations:
additionalProperties:
type: string
description: Annotations to create if the rule matches.
type: object
extendedResources:
additionalProperties:
type: string
description: ExtendedResources to create if the rule matches.
type: object
labels:
additionalProperties:
type: string
description: Labels to create if the rule matches.
type: object
labelsTemplate:
description: |-
LabelsTemplate specifies a template to expand for dynamically generating
multiple labels. Data (after template expansion) must be keys with an
optional value (<key>[=<value>]) separated by newlines.
type: string
matchAny:
description: MatchAny specifies a list of matchers one of which
must match.
items:
description: MatchAnyElem specifies one sub-matcher of MatchAny.
properties:
matchFeatures:
description: MatchFeatures specifies a set of matcher
terms all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature
set to match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
required:
- matchFeatures
type: object
type: array
matchFeatures:
description: MatchFeatures specifies a set of matcher terms
all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature set to
match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
name:
description: Name of the rule.
type: string
taints:
description: Taints to create if the rule matches.
items:
description: |-
The node this Taint is attached to has the "effect" on
any pod that does not tolerate the Taint.
properties:
effect:
description: |-
Required. The effect of the taint on pods
that do not tolerate the taint.
Valid effects are NoSchedule, PreferNoSchedule and NoExecute.
type: string
key:
description: Required. The taint key to be applied to
a node.
type: string
timeAdded:
description: |-
TimeAdded represents the time at which the taint was added.
It is only written for NoExecute taints.
format: date-time
type: string
value:
description: The taint value corresponding to the taint
key.
type: string
required:
- effect
- key
type: object
type: array
vars:
additionalProperties:
type: string
description: |-
Vars is the variables to store if the rule matches. Variables do not
directly inflict any changes in the node object. However, they can be
referenced from other rules enabling more complex rule hierarchies,
without exposing intermediary output values as labels.
type: object
varsTemplate:
description: |-
VarsTemplate specifies a template to expand for dynamically generating
multiple variables. Data (after template expansion) must be keys with an
optional value (<key>[=<value>]) separated by newlines.
type: string
required:
- name
type: object
type: array
required:
- rules
type: object
required:
- spec
type: object
served: true
storage: true

View File

@@ -0,0 +1,86 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-feature-discovery-worker
namespace: node-feature-discovery
spec:
selector:
matchLabels:
name: node-feature-discovery-worker
template:
metadata:
labels:
name: node-feature-discovery-worker
spec:
serviceAccountName: node-feature-discovery
securityContext:
seccompProfile:
type: RuntimeDefault
containers:
- name: worker
image: registry.k8s.io/nfd/node-feature-discovery:v0.17.3
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
readOnlyRootFilesystem: true
runAsNonRoot: true
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
resources:
limits:
memory: 512Mi
requests:
cpu: 5m
memory: 64Mi
command:
- "nfd-worker"
args:
- "-metrics=8081"
- "-grpc-health=8082"
ports:
- containerPort: 8081
name: metrics
- containerPort: 8082
name: health
volumeMounts:
- name: host-boot
mountPath: "/host-boot"
readOnly: true
- name: host-os-release
mountPath: "/host-etc/os-release"
readOnly: true
- name: host-sys
mountPath: "/host-sys"
readOnly: true
- name: host-usr-lib
mountPath: "/host-usr/lib"
readOnly: true
- name: host-lib
mountPath: "/host-lib"
readOnly: true
- name: host-proc-swaps
mountPath: "/host-proc/swaps"
readOnly: true
volumes:
- name: host-boot
hostPath:
path: "/boot"
- name: host-os-release
hostPath:
path: "/etc/os-release"
- name: host-sys
hostPath:
path: "/sys"
- name: host-usr-lib
hostPath:
path: "/usr/lib"
- name: host-lib
hostPath:
path: "/lib"
- name: host-proc-swaps
hostPath:
path: "/proc/swaps"

View File

@@ -0,0 +1,14 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: node-feature-discovery
labels:
- pairs:
app.kubernetes.io/name: node-feature-discovery
managedBy: kustomize
partOf: wild-cloud
resources:
- namespace.yaml
- crds.yaml
- rbac.yaml
- daemonset.yaml
- master.yaml

View File

@@ -0,0 +1,49 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: node-feature-discovery-master
namespace: node-feature-discovery
spec:
replicas: 1
selector:
matchLabels:
name: node-feature-discovery-master
template:
metadata:
labels:
name: node-feature-discovery-master
spec:
serviceAccountName: node-feature-discovery
securityContext:
seccompProfile:
type: RuntimeDefault
containers:
- name: master
image: registry.k8s.io/nfd/node-feature-discovery:v0.17.3
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
readOnlyRootFilesystem: true
runAsNonRoot: true
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
command:
- "nfd-master"
args:
- "-metrics=8081"
- "-grpc-health=8082"
ports:
- containerPort: 8081
name: metrics
- containerPort: 8082
name: health
resources:
requests:
cpu: 10m
memory: 64Mi
limits:
memory: 128Mi

View File

@@ -0,0 +1,8 @@
apiVersion: v1
kind: Namespace
metadata:
name: node-feature-discovery
labels:
pod-security.kubernetes.io/enforce: privileged
pod-security.kubernetes.io/audit: privileged
pod-security.kubernetes.io/warn: privileged

View File

@@ -0,0 +1,55 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: node-feature-discovery
namespace: node-feature-discovery
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: node-feature-discovery
rules:
- apiGroups:
- ""
resources:
- nodes
- nodes/status
verbs:
- get
- patch
- update
- list
- apiGroups:
- ""
resources:
- namespaces
verbs:
- get
- list
- watch
- apiGroups:
- nfd.k8s-sigs.io
resources:
- nodefeatures
- nodefeaturerules
- nodefeaturegroups
verbs:
- get
- list
- watch
- create
- update
- patch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: node-feature-discovery
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: node-feature-discovery
subjects:
- kind: ServiceAccount
name: node-feature-discovery
namespace: node-feature-discovery

View File

@@ -0,0 +1,4 @@
name: node-feature-discovery
description: Detects hardware features available on each node
namespace: node-feature-discovery
category: infrastructure

View File

@@ -0,0 +1,98 @@
# NVIDIA Device Plugin
The NVIDIA Device Plugin for Kubernetes enables GPU scheduling and resource management on nodes with NVIDIA GPUs.
## Overview
This service deploys the official NVIDIA Device Plugin as a DaemonSet that:
- Discovers NVIDIA GPUs on worker nodes
- Labels nodes with GPU product information (e.g., `nvidia.com/gpu.product=GeForce-RTX-4090`)
- Advertises GPU resources (`nvidia.com/gpu`) to the Kubernetes scheduler
- Enables pods to request GPU resources
## Prerequisites
Before installing the NVIDIA Device Plugin, ensure that:
1. **NVIDIA Drivers** are installed (>= 384.81)
2. **nvidia-container-toolkit** is installed (>= 1.7.0)
3. **nvidia-container-runtime** is configured as the default container runtime
4. Worker nodes have NVIDIA GPUs
### Talos Linux Requirements
For Talos Linux nodes, you need:
- NVIDIA drivers extension in the Talos schematic
- nvidia-container-toolkit extension
- Proper container runtime configuration
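As a rough illustration (not part of this repository), a Talos Image Factory schematic that bundles the NVIDIA extensions might look like the sketch below; the extension names are assumptions and vary by Talos version, so check the Image Factory for the exact ones.
```bash
# Hypothetical schematic request against the Talos Image Factory.
# The extension names below are assumptions; verify them for your Talos version.
cat > nvidia-schematic.yaml <<'EOF'
customization:
  systemExtensions:
    officialExtensions:
      - siderolabs/nonfree-kmod-nvidia
      - siderolabs/nvidia-container-toolkit
EOF
# Returns JSON containing the schematic ID to use in the installer image URL.
curl -s -X POST --data-binary @nvidia-schematic.yaml https://factory.talos.dev/schematics
```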
## Installation
```bash
# Configure and install the service
wild-cluster-services-configure nvidia-device-plugin
wild-cluster-install nvidia-device-plugin
```
## Verification
After installation, verify the plugin is working:
```bash
# Check plugin pods are running
kubectl get pods -n kube-system | grep nvidia
# Verify GPU resources are advertised
kubectl get nodes -o json | jq '.items[].status.capacity | select(has("nvidia.com/gpu"))'
# Check GPU node labels
kubectl get nodes --show-labels | grep nvidia
```
## Usage in Applications
Once installed, applications can request GPU resources:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: gpu-app
spec:
template:
spec:
containers:
- name: app
image: nvidia/cuda:latest
resources:
requests:
nvidia.com/gpu: 1
limits:
nvidia.com/gpu: 1
```
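To confirm scheduling works end to end, a throwaway pod can request one GPU and run `nvidia-smi`. This is a minimal sketch; the CUDA image tag is an assumption, and any image that ships `nvidia-smi` will do:
```bash
cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
spec:
  restartPolicy: Never
  runtimeClassName: nvidia
  containers:
    - name: cuda
      image: nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1
EOF
# Wait for completion, print the nvidia-smi output, then clean up.
kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/gpu-smoke-test --timeout=180s
kubectl logs pod/gpu-smoke-test
kubectl delete pod gpu-smoke-test
```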
## Troubleshooting
### Plugin Not Starting
- Verify NVIDIA drivers are installed on worker nodes
- Check that nvidia-container-toolkit is properly configured
- Ensure worker nodes are not tainted in a way that prevents scheduling
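On Talos nodes, one way to confirm the driver extension actually loaded is to inspect the node directly (a sketch; requires `talosctl` access to the node):
```bash
# The driver version file only exists when the NVIDIA kernel module is loaded.
talosctl -n <node-ip> read /proc/driver/nvidia/version
# The extension list should include the NVIDIA extensions baked into the schematic.
talosctl -n <node-ip> get extensions
```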
### No GPU Resources Advertised
- Check plugin logs: `kubectl logs -n kube-system -l name=nvidia-device-plugin-ds`
- Verify NVIDIA runtime is the default container runtime
- Ensure GPUs are detected by the driver: check node logs for GPU detection messages
## Configuration
The plugin uses the following configuration:
- **Image**: `nvcr.io/nvidia/k8s-device-plugin:v0.17.1`
- **Namespace**: `kube-system`
- **Priority Class**: `system-node-critical`
- **Tolerations**: Schedules on nodes with `nvidia.com/gpu` taint
## References
- [Official NVIDIA Device Plugin Repository](https://github.com/NVIDIA/k8s-device-plugin)
- [Kubernetes GPU Scheduling Documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/)
- [NVIDIA Container Toolkit Documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/)

View File

@@ -0,0 +1,66 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
NVIDIA_PLUGIN_DIR="${CLUSTER_SETUP_DIR}/nvidia-device-plugin"
echo "🎮 === Setting up NVIDIA Device Plugin ==="
echo ""
# The device plugin only schedules on worker nodes, so make sure the cluster has some
echo "🔍 Checking for worker nodes in the cluster..."
WORKER_NODES=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o name | wc -l)
if [ "$WORKER_NODES" -eq 0 ]; then
echo "❌ ERROR: No worker nodes found in cluster. NVIDIA Device Plugin requires worker nodes."
exit 1
fi
echo "✅ Found $WORKER_NODES worker node(s)"
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled NVIDIA Device Plugin templates..."
if [ ! -d "${NVIDIA_PLUGIN_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${NVIDIA_PLUGIN_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
echo "🚀 Deploying NVIDIA Device Plugin..."
kubectl apply -k "${NVIDIA_PLUGIN_DIR}/kustomize"
echo "⏳ Waiting for NVIDIA Device Plugin DaemonSet to be ready..."
kubectl rollout status daemonset/nvidia-device-plugin-daemonset -n kube-system --timeout=120s
echo ""
echo "✅ NVIDIA Device Plugin installed successfully"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get pods -n kube-system | grep nvidia"
echo " kubectl get nodes -o json | jq '.items[].status.capacity | select(has(\"nvidia.com/gpu\"))'"
echo ""
echo "🎮 GPU nodes should now be labeled with GPU product information:"
echo " kubectl get nodes --show-labels | grep nvidia"
echo ""

View File

@@ -0,0 +1,91 @@
# NVIDIA Device Plugin DaemonSet
# Based on official manifest from: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.1/deployments/static/nvidia-device-plugin.yml
# Licensed under the Apache License, Version 2.0
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-device-plugin-daemonset
namespace: kube-system
labels:
app.kubernetes.io/name: nvidia-device-plugin
app.kubernetes.io/component: device-plugin
managedBy: kustomize
partOf: wild-cloud
spec:
selector:
matchLabels:
name: nvidia-device-plugin-ds
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: nvidia-device-plugin-ds
app.kubernetes.io/name: nvidia-device-plugin
app.kubernetes.io/component: device-plugin
spec:
runtimeClassName: nvidia
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
- key: CriticalAddonsOnly
operator: Exists
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: feature.node.kubernetes.io/pci-0300_10de.present
operator: In
values:
- "true"
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
securityContext:
seccompProfile:
type: RuntimeDefault
containers:
- image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1
name: nvidia-device-plugin-ctr
env:
- name: MPS_ROOT
value: /run/nvidia/mps
- name: NVIDIA_VISIBLE_DEVICES
value: all
- name: NVIDIA_DRIVER_CAPABILITIES
value: compute,utility
- name: FAIL_ON_INIT_ERROR
value: "false"
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: mps-shm
mountPath: /dev/shm
- name: mps-root
mountPath: /mps
- name: cdi-root
mountPath: /var/run/cdi
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: mps-root
hostPath:
path: /run/nvidia/mps
type: DirectoryOrCreate
- name: mps-shm
hostPath:
path: /run/nvidia/mps/shm
- name: cdi-root
hostPath:
path: /var/run/cdi
type: DirectoryOrCreate

View File

@@ -0,0 +1,12 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kube-system
resources:
- daemonset.yaml
- runtimeclass.yaml
labels:
- pairs:
app.kubernetes.io/name: nvidia-device-plugin
app.kubernetes.io/component: device-plugin
managedBy: kustomize
partOf: wild-cloud

View File

@@ -0,0 +1,5 @@
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
name: nvidia
handler: nvidia

View File

@@ -0,0 +1,7 @@
name: nvidia-device-plugin
description: NVIDIA device plugin for Kubernetes
namespace: nvidia-device-plugin
category: infrastructure
dependencies:
- node-feature-discovery

View File

@@ -0,0 +1,51 @@
# SMTP Configuration Service
This service configures SMTP settings for Wild Cloud applications to send transactional emails.
## Overview
The SMTP service doesn't deploy any Kubernetes resources. Instead, it helps configure global SMTP settings that can be used by Wild Cloud applications like Ghost, Gitea, and others for sending:
- Password reset emails
- User invitation emails
- Notification emails
- Other transactional emails
## Installation
```bash
./setup/cluster-services/smtp/install.sh
```
## Configuration
The setup script will prompt for:
- **SMTP Host**: Your email provider's SMTP server (e.g., `email-smtp.us-east-2.amazonaws.com` for AWS SES)
- **SMTP Port**: Usually `465` for SSL or `587` for STARTTLS
- **SMTP User**: Username or access key for authentication
- **From Address**: Default sender email address
- **SMTP Password**: Your password, secret key, or API key (entered securely)
## Supported Providers
- **AWS SES**: Use your Access Key ID as user and Secret Access Key as password
- **Gmail/Google Workspace**: Use your email as user and an App Password as password
- **SendGrid**: Use `apikey` as user and your API key as password
- **Mailgun**: Use your Mailgun username and password
- **Other SMTP providers**: Use your standard SMTP credentials
## Applications That Use SMTP
- **Ghost**: User management, password resets, notifications
- **Gitea**: User registration, password resets, notifications
- **OpenProject**: User invitations, notifications
- **Future applications**: Any app that needs to send emails
## Testing
After configuration, test SMTP by:
1. Deploying an application that uses email (like Ghost)
2. Using password reset or user invitation features
3. Checking application logs for SMTP connection issues
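Before involving an application, a quick in-cluster check can confirm the SMTP host and port are reachable. A minimal sketch, assuming port 465 (implicit TLS); substitute your own `cloud.smtp.host` value:
```bash
# One-off pod that opens a TLS connection to the SMTP server and prints a handshake summary.
# For port 587 (STARTTLS), add `-starttls smtp` to the openssl command.
kubectl run smtp-test --rm -it --restart=Never --image=alpine:3.20 -- \
  sh -c 'apk add --no-cache openssl >/dev/null && \
         openssl s_client -connect email-smtp.us-east-2.amazonaws.com:465 -brief </dev/null'
```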

View File

@@ -0,0 +1,36 @@
name: smtp
description: SMTP relay service for cluster applications
namespace: smtp-system
category: infrastructure
serviceConfig:
smtpHost:
path: cloud.smtp.host
prompt: "Enter SMTP host (e.g., email-smtp.us-east-2.amazonaws.com for AWS SES)"
default: ""
type: string
smtpPort:
path: cloud.smtp.port
prompt: "Enter SMTP port (usually 465 for SSL, 587 for STARTTLS)"
default: "465"
type: string
smtpUser:
path: cloud.smtp.user
prompt: "Enter SMTP username/access key"
default: ""
type: string
smtpFrom:
path: cloud.smtp.from
prompt: "Enter default 'from' email address"
default: "no-reply@{{ .cloud.domain }}"
type: string
smtpTls:
path: cloud.smtp.tls
prompt: "Enable TLS? (true/false)"
default: "true"
type: string
smtpStartTls:
path: cloud.smtp.startTls
prompt: "Enable STARTTLS? (true/false)"
default: "true"
type: string

View File

@@ -0,0 +1,31 @@
# Traefik
- https://doc.traefik.io/traefik/providers/kubernetes-ingress/
Ingress resources can be created for any service. The routes specified in the Ingress are added automatically to the Traefik proxy.
Traefik serves all incoming network traffic on ports 80 and 443 to their appropriate services based on the route.
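For example, a plain Ingress pointing at an in-cluster Service is picked up without any Traefik-specific configuration. A minimal sketch, assuming a Service named `whoami` on port 80 and a hostname you control:
```bash
cat <<'EOF' | kubectl apply -f -
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: whoami
  namespace: default
spec:
  ingressClassName: traefik
  rules:
    - host: whoami.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: whoami
                port:
                  number: 80
EOF
```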
## Notes
These kustomize templates were created with:
```bash
helm-chart-to-kustomize traefik/traefik traefik traefik values.yaml
```
With values.yaml being:
```yaml
ingressRoute:
dashboard:
enabled: true
matchRule: Host(`dashboard.localhost`)
entryPoints:
- web
providers:
kubernetesGateway:
enabled: true
gateway:
namespacePolicy: All
```
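Since the dashboard is only routed on the `web` entrypoint for `dashboard.localhost`, one way to reach it without exposing it externally is a port-forward (a sketch; most systems resolve `*.localhost` to loopback, otherwise add an /etc/hosts entry):
```bash
# Forward the web entrypoint (container port 8000) to your machine,
# then browse http://dashboard.localhost:8000/dashboard/ .
kubectl -n traefik port-forward deploy/traefik 8000:8000
```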

View File

@@ -0,0 +1,72 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
TRAEFIK_DIR="${CLUSTER_SETUP_DIR}/traefik"
echo "🌐 === Setting up Traefik Ingress Controller ==="
echo ""
# Check MetalLB dependency
echo "🔍 Verifying MetalLB is ready (required for Traefik LoadBalancer service)..."
kubectl wait --for=condition=Ready pod -l component=controller -n metallb-system --timeout=60s 2>/dev/null || {
echo "⚠️ MetalLB controller not ready, but continuing with Traefik installation"
echo "💡 Note: Traefik LoadBalancer service may not get external IP without MetalLB"
}
# Install required CRDs first
echo "📦 Installing Gateway API CRDs..."
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.0.0/standard-install.yaml
echo "📦 Installing Traefik CRDs..."
kubectl apply -f https://raw.githubusercontent.com/traefik/traefik/v3.4/docs/content/reference/dynamic-configuration/kubernetes-crd-definition-v1.yml
echo "⏳ Waiting for CRDs to be established..."
kubectl wait --for condition=established crd/gateways.gateway.networking.k8s.io --timeout=60s
kubectl wait --for condition=established crd/gatewayclasses.gateway.networking.k8s.io --timeout=60s
kubectl wait --for condition=established crd/ingressroutes.traefik.io --timeout=60s
kubectl wait --for condition=established crd/middlewares.traefik.io --timeout=60s
# Templates should already be compiled
echo "📦 Using pre-compiled Traefik templates..."
if [ ! -d "${TRAEFIK_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${TRAEFIK_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
# Apply Traefik using kustomize
echo "🚀 Deploying Traefik..."
kubectl apply -k "${TRAEFIK_DIR}/kustomize"
# Wait for Traefik to be ready
echo "⏳ Waiting for Traefik to be ready..."
kubectl wait --for=condition=Available deployment/traefik -n traefik --timeout=120s
echo ""
echo "✅ Traefik installed successfully"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get pods -n traefik"
echo " kubectl get svc -n traefik"
echo ""

View File

@@ -0,0 +1,13 @@
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
name: internal-only
namespace: kube-system
spec:
  ipAllowList:
# Restrict to local private network ranges - adjust these to match your network
sourceRange:
- 127.0.0.1/32 # localhost
- 10.0.0.0/8 # Private network
- 172.16.0.0/12 # Private network
- 192.168.0.0/16 # Private network

View File

@@ -0,0 +1,13 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- templates/deployment.yaml
- templates/gatewayclass.yaml
- templates/gateway.yaml
- templates/ingressclass.yaml
- templates/ingressroute.yaml
- templates/rbac/clusterrolebinding.yaml
- templates/rbac/clusterrole.yaml
- templates/rbac/serviceaccount.yaml
- templates/service.yaml

View File

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: traefik

View File

@@ -0,0 +1,130 @@
---
# Source: traefik/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: traefik
namespace: traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
annotations:
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
strategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 0
maxSurge: 1
minReadySeconds: 0
template:
metadata:
annotations:
prometheus.io/scrape: "true"
prometheus.io/path: "/metrics"
prometheus.io/port: "9100"
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
spec:
serviceAccountName: traefik
automountServiceAccountToken: true
terminationGracePeriodSeconds: 60
hostNetwork: false
containers:
- image: docker.io/traefik:v3.4.1
imagePullPolicy: IfNotPresent
name: traefik
resources:
readinessProbe:
httpGet:
path: /ping
port: 8080
scheme: HTTP
failureThreshold: 1
initialDelaySeconds: 2
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 2
livenessProbe:
httpGet:
path: /ping
port: 8080
scheme: HTTP
failureThreshold: 3
initialDelaySeconds: 2
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 2
lifecycle:
ports:
- name: metrics
containerPort: 9100
protocol: TCP
- name: traefik
containerPort: 8080
protocol: TCP
- name: web
containerPort: 8000
protocol: TCP
- name: websecure
containerPort: 8443
protocol: TCP
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
volumeMounts:
- name: data
mountPath: /data
- name: tmp
mountPath: /tmp
args:
- "--global.checkNewVersion"
- "--entryPoints.metrics.address=:9100/tcp"
- "--entryPoints.traefik.address=:8080/tcp"
- "--entryPoints.web.address=:8000/tcp"
- "--entryPoints.websecure.address=:8443/tcp"
- "--api.dashboard=true"
- "--ping=true"
- "--metrics.prometheus=true"
- "--metrics.prometheus.entrypoint=metrics"
- "--providers.kubernetescrd"
- "--providers.kubernetescrd.allowEmptyServices=true"
- "--providers.kubernetesingress"
- "--providers.kubernetesingress.allowEmptyServices=true"
- "--providers.kubernetesingress.ingressendpoint.publishedservice=traefik/traefik"
- "--providers.kubernetesgateway"
- "--providers.kubernetesgateway.statusaddress.service.name=traefik"
- "--providers.kubernetesgateway.statusaddress.service.namespace=traefik"
- "--entryPoints.websecure.http.tls=true"
- "--log.level=INFO"
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
volumes:
- name: data
emptyDir: {}
- name: tmp
emptyDir: {}
securityContext:
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532

View File

@@ -0,0 +1,18 @@
---
# Source: traefik/templates/gateway.yaml
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
name: traefik-gateway
namespace: traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
spec:
gatewayClassName: traefik
listeners:
- name: web
port: 8000
protocol: HTTP

View File

@@ -0,0 +1,13 @@
---
# Source: traefik/templates/gatewayclass.yaml
apiVersion: gateway.networking.k8s.io/v1
kind: GatewayClass
metadata:
name: traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
spec:
controllerName: traefik.io/gateway-controller

View File

@@ -0,0 +1,15 @@
---
# Source: traefik/templates/ingressclass.yaml
apiVersion: networking.k8s.io/v1
kind: IngressClass
metadata:
annotations:
ingressclass.kubernetes.io/is-default-class: "true"
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
name: traefik
spec:
controller: traefik.io/ingress-controller

View File

@@ -0,0 +1,21 @@
---
# Source: traefik/templates/ingressroute.yaml
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: traefik-dashboard
namespace: traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
spec:
entryPoints:
- web
routes:
- match: Host(`dashboard.localhost`)
kind: Rule
services:
- kind: TraefikService
name: api@internal

View File

@@ -0,0 +1,108 @@
---
# Source: traefik/templates/rbac/clusterrole.yaml
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: traefik-traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
rules:
- apiGroups:
- ""
resources:
- configmaps
- nodes
- services
verbs:
- get
- list
- watch
- apiGroups:
- discovery.k8s.io
resources:
- endpointslices
verbs:
- list
- watch
- apiGroups:
- ""
resources:
- secrets
verbs:
- get
- list
- watch
- apiGroups:
- extensions
- networking.k8s.io
resources:
- ingressclasses
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- extensions
- networking.k8s.io
resources:
- ingresses/status
verbs:
- update
- apiGroups:
- traefik.io
resources:
- ingressroutes
- ingressroutetcps
- ingressrouteudps
- middlewares
- middlewaretcps
- serverstransports
- serverstransporttcps
- tlsoptions
- tlsstores
- traefikservices
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- namespaces
- secrets
- configmaps
verbs:
- get
- list
- watch
- apiGroups:
- gateway.networking.k8s.io
resources:
- backendtlspolicies
- gatewayclasses
- gateways
- grpcroutes
- httproutes
- referencegrants
- tcproutes
- tlsroutes
verbs:
- get
- list
- watch
- apiGroups:
- gateway.networking.k8s.io
resources:
- backendtlspolicies/status
- gatewayclasses/status
- gateways/status
- grpcroutes/status
- httproutes/status
- tcproutes/status
- tlsroutes/status
verbs:
- update

View File

@@ -0,0 +1,19 @@
---
# Source: traefik/templates/rbac/clusterrolebinding.yaml
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: traefik-traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: traefik-traefik
subjects:
- kind: ServiceAccount
name: traefik
namespace: traefik

View File

@@ -0,0 +1,14 @@
---
# Source: traefik/templates/rbac/serviceaccount.yaml
kind: ServiceAccount
apiVersion: v1
metadata:
name: traefik
namespace: traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
annotations:
automountServiceAccountToken: false

View File

@@ -0,0 +1,27 @@
---
# Source: traefik/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
name: traefik
namespace: traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
annotations:
spec:
type: LoadBalancer
selector:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
ports:
- port: 80
name: web
targetPort: web
protocol: TCP
- port: 443
name: websecure
targetPort: websecure
protocol: TCP

View File

@@ -0,0 +1,28 @@
---
# Traefik service configuration with static LoadBalancer IP
apiVersion: v1
kind: Service
metadata:
name: traefik
namespace: kube-system
annotations:
# Get a stable IP from MetalLB
metallb.universe.tf/address-pool: production
metallb.universe.tf/allow-shared-ip: traefik-lb
labels:
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik
spec:
type: LoadBalancer
loadBalancerIP: {{ .cluster.loadBalancerIp }}
selector:
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik
ports:
- name: web
port: 80
targetPort: web
- name: websecure
port: 443
targetPort: websecure
externalTrafficPolicy: Local

View File

@@ -0,0 +1,10 @@
name: traefik
description: Cloud-native reverse proxy and ingress controller
namespace: traefik
category: infrastructure
dependencies:
- metallb
configReferences:
- cluster.loadBalancerIp

View File

@@ -0,0 +1,44 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
UTILS_DIR="${CLUSTER_SETUP_DIR}/utils"
echo "🔧 === Setting up Cluster Utilities ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled utils templates..."
if [ ! -d "${UTILS_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${UTILS_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
echo "🚀 Applying utility manifests..."
kubectl apply -f "${UTILS_DIR}/kustomize/"
echo ""
echo "✅ Cluster utilities installed successfully"
echo ""
echo "💡 Utility resources have been deployed to the cluster"

View File

@@ -0,0 +1,71 @@
---
apiVersion: v1
kind: Namespace
metadata:
name: debug
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: netdebug
namespace: debug
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: netdebug
subjects:
- kind: ServiceAccount
name: netdebug
namespace: debug
roleRef:
kind: ClusterRole
name: cluster-admin
apiGroup: rbac.authorization.k8s.io
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: netdebug
namespace: debug
labels:
app: netdebug
spec:
replicas: 1
selector:
matchLabels:
app: netdebug
template:
metadata:
labels:
app: netdebug
spec:
serviceAccountName: netdebug
containers:
- name: netdebug
image: nicolaka/netshoot:latest
command: ["/bin/bash"]
args: ["-c", "while true; do sleep 3600; done"]
resources:
limits:
cpu: 200m
memory: 256Mi
requests:
cpu: 100m
memory: 128Mi
securityContext:
privileged: true
---
apiVersion: v1
kind: Service
metadata:
name: netdebug
namespace: debug
spec:
selector:
app: netdebug
ports:
- port: 22
targetPort: 22
name: ssh
type: ClusterIP

View File

@@ -0,0 +1,4 @@
name: utils
description: Utility tools and scripts for cluster administration
namespace: utils-system
category: infrastructure

1
internal/setup/dnsmasq/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
setup-bundle/

Some files were not shown because too many files have changed in this diff