Moves setup files into embedded package.

This commit is contained in:
2025-10-11 22:06:39 +00:00
parent 92032202f4
commit 89c6a7aa80
112 changed files with 337 additions and 0 deletions

internal/setup/README.md Normal file
View File

@@ -0,0 +1,15 @@
# Setup instructions
Install dependencies:
Follow the instructions to [set up a dnsmasq machine](./dnsmasq/README.md).
Follow the instructions to [set up cluster nodes](./cluster-nodes/README.md).
Follow the instructions to set up [cluster services](./cluster-services/README.md).
Now make sure everything works:
```bash
wild-health
```

View File

@@ -0,0 +1,80 @@
#!/bin/bash
# Talos cluster initialization script
# This script performs one-time cluster setup: generates secrets, base configs, and sets up talosctl
set -euo pipefail
# Check if WC_HOME is set
if [ -z "${WC_HOME:-}" ]; then
echo "Error: WC_HOME environment variable not set. Run \`source ./env.sh\`."
exit 1
fi
NODE_SETUP_DIR="${WC_HOME}/setup/cluster-nodes"
# Get cluster configuration from config.yaml
CLUSTER_NAME=$(wild-config cluster.name)
VIP=$(wild-config cluster.nodes.control.vip)
TALOS_VERSION=$(wild-config cluster.nodes.talos.version)
echo "Initializing Talos cluster: $CLUSTER_NAME"
echo "VIP: $VIP"
echo "Talos version: $TALOS_VERSION"
# Create directories
mkdir -p generated final patch
# Check if cluster secrets already exist
if [ -f "generated/secrets.yaml" ]; then
echo ""
echo "⚠️ Cluster secrets already exist!"
echo "This will regenerate ALL cluster certificates and invalidate existing nodes."
echo ""
read -p "Do you want to continue? (y/N): " -r
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
echo "Cancelled."
exit 0
fi
echo ""
fi
# Generate fresh cluster secrets
echo "Generating cluster secrets..."
cd generated
talosctl gen secrets -o secrets.yaml --force
echo "Generating base machine configs..."
talosctl gen config --with-secrets secrets.yaml "$CLUSTER_NAME" "https://$VIP:6443" --force
cd ..
# Setup talosctl context
echo "Setting up talosctl context..."
# Remove existing context if it exists
talosctl config context "$CLUSTER_NAME" --remove 2>/dev/null || true
# Merge new configuration
talosctl config merge ./generated/talosconfig
talosctl config endpoint "$VIP"
echo ""
echo "✅ Cluster initialization complete!"
echo ""
echo "Cluster details:"
echo " - Name: $CLUSTER_NAME"
echo " - VIP: $VIP"
echo " - Secrets: generated/secrets.yaml"
echo " - Base configs: generated/controlplane.yaml, generated/worker.yaml"
echo ""
echo "Talosctl context configured:"
talosctl config info
echo ""
echo "Next steps:"
echo "1. Register nodes with hardware detection:"
echo " ./detect-node-hardware.sh <maintenance-ip> <node-number>"
echo ""
echo "2. Generate machine configurations:"
echo " ./generate-machine-configs.sh"
echo ""
echo "3. Apply configurations to nodes"

View File

@@ -0,0 +1,23 @@
machine:
  install:
    disk: {{ index .cluster.nodes.active "{{NODE_NAME}}" "disk" }}
    image: factory.talos.dev/metal-installer/{{SCHEMATIC_ID}}:{{VERSION}}
  network:
    hostname: "{{NODE_NAME}}"
    interfaces:
      - interface: {{ index .cluster.nodes.active "{{NODE_NAME}}" "interface" }}
        dhcp: false
        addresses:
          - "{{NODE_IP}}/24"
        routes:
          - network: 0.0.0.0/0
            gateway: {{ .cloud.router.ip }}
        vip:
          ip: {{ .cluster.nodes.control.vip }}
# cluster:
#   discovery:
#     enabled: true
#     registries:
#       service:
#         disabled: true
#   allowSchedulingOnControlPlanes: true

View File

@@ -0,0 +1,23 @@
machine:
  install:
    disk: {{ index .cluster.nodes.active "{{NODE_NAME}}" "disk" }}
    image: factory.talos.dev/metal-installer/{{ .cluster.nodes.talos.schematicId}}:{{ .cluster.nodes.talos.version}}
  network:
    hostname: "{{NODE_NAME}}"
    interfaces:
      - interface: {{ index .cluster.nodes.active "{{NODE_NAME}}" "interface" }}
        dhcp: true
        addresses:
          - "{{NODE_IP}}/24"
        routes:
          - network: 0.0.0.0/0
            gateway: {{ .cloud.router.ip }}
  kubelet:
    extraMounts:
      - destination: /var/lib/longhorn
        type: bind
        source: /var/lib/longhorn
        options:
          - bind
          - rshared
          - rw

View File

@@ -0,0 +1,63 @@
# Talos Version to Schematic ID Mappings
#
# This file contains mappings of Talos versions to their corresponding
# default schematic IDs for wild-cloud deployments.
#
# Schematic IDs are generated from factory.talos.dev and include
# common system extensions needed for typical hardware.
#
# To add new versions:
# 1. Go to https://factory.talos.dev/
# 2. Select the system extensions you need
# 3. Generate the schematic
# 4. Add the version and schematic ID below
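#
# The schematic ID can also be generated without the web UI by POSTing a
# schematic definition to the Image Factory API (assumed workflow; verify the
# endpoint against the factory.talos.dev documentation), for example:
#
#   curl -X POST --data-binary @schematic.yaml https://factory.talos.dev/schematics
#
# The "id" field of the JSON response is the key to add below.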
# Format: Each schematic ID is the primary key with version and definition nested
"434a0300db532066f1098e05ac068159371d00f0aba0a3103a0e826e83825c82":
schematic:
customization:
systemExtensions:
officialExtensions:
- siderolabs/gvisor
- siderolabs/intel-ucode
- siderolabs/iscsi-tools
- siderolabs/util-linux-tools
"f309e674d9ad94655e2cf8a43ea1432475c717cd1885f596bd7ec852b900bc5b":
schematic:
customization:
systemExtensions:
officialExtensions:
- siderolabs/gvisor
- siderolabs/intel-ucode
- siderolabs/iscsi-tools
- siderolabs/nvidia-container-toolkit-lts
- siderolabs/nvidia-container-toolkit-production
- siderolabs/nvidia-fabricmanager-lts
- siderolabs/nvidia-fabricmanager-production
- siderolabs/nvidia-open-gpu-kernel-modules-lts
- siderolabs/nvidia-open-gpu-kernel-modules-production
- siderolabs/util-linux-tools"
"56774e0894c8a3a3a9834a2aea65f24163cacf9506abbcbdc3ba135eaca4953f":
schematic:
customization:
systemExtensions:
officialExtensions:
- siderolabs/gvisor
- siderolabs/intel-ucode
- siderolabs/iscsi-tools
- siderolabs/nvidia-container-toolkit-production
- siderolabs/nvidia-fabricmanager-production
- siderolabs/nvidia-open-gpu-kernel-modules-production
- siderolabs/util-linux-tools
"9ac1424dbdf4b964154a36780dbf2215bf17d2752cd0847fa3b81d7da761457f":
schematic:
customization:
systemExtensions:
officialExtensions:
- siderolabs/gvisor
- siderolabs/intel-ucode
- siderolabs/iscsi-tools
- siderolabs/nonfree-kmod-nvidia-production
- siderolabs/nvidia-container-toolkit-production
- siderolabs/nvidia-fabricmanager-production
- siderolabs/util-linux-tools

View File

@@ -0,0 +1,102 @@
# Wild Cloud Cluster Services
This setup creates a fully functional personal cloud infrastructure on a bare metal Kubernetes cluster that provides:
1. **External access** to services via configured domain names (using ${DOMAIN})
2. **Internal-only access** to admin interfaces (via internal.${DOMAIN} subdomains)
3. **Secure traffic routing** with automatic TLS
4. **Reliable networking** with proper load balancing
## Service Management
Wild Cloud uses a streamlined per-service setup approach:
**Primary Command**: `wild-service-setup <service> [options]`
- **Default**: Configure and deploy service using existing templates
- **`--fetch`**: Fetch fresh templates before setup (for updates)
- **`--no-deploy`**: Configure only, skip deployment (for planning)
**Master Orchestrator**: `wild-setup-services`
- Sets up all services in proper dependency order
- Each service validates its prerequisites before deployment
- Fail-fast approach with clear recovery instructions
## Architecture
```
Internet → External DNS → MetalLB LoadBalancer → Traefik → Kubernetes Services
Internal DNS
Internal Network
```
## Key Components
- **[MetalLB](metallb/README.md)** - Provides load balancing for bare metal clusters
- **[Traefik](traefik/README.md)** - Handles ingress traffic, TLS termination, and routing
- **[cert-manager](cert-manager/README.md)** - Manages TLS certificates
- **[CoreDNS](coredns/README.md)** - Provides DNS resolution for services
- **[ExternalDNS](externaldns/README.md)** - Automatic DNS record management
- **[Longhorn](longhorn/README.md)** - Distributed storage system for persistent volumes
- **[NFS](nfs/README.md)** - Network file system for shared media storage (optional)
- **[Kubernetes Dashboard](kubernetes-dashboard/README.md)** - Web UI for cluster management (accessible via https://dashboard.internal.${DOMAIN})
- **[Docker Registry](docker-registry/README.md)** - Private container registry for custom images
- **[Utils](utils/README.md)** - Cluster utilities and debugging tools
## Common Usage Patterns
### Complete Infrastructure Setup
```bash
# All services with fresh templates (recommended for first-time setup)
wild-setup-services --fetch
# All services using existing templates (fastest)
wild-setup-services
# Configure all services but don't deploy (for planning)
wild-setup-services --no-deploy
```
### Individual Service Management
```bash
# Most common - reconfigure and deploy existing service
wild-service-setup cert-manager
# Get fresh templates and deploy (for updates)
wild-service-setup cert-manager --fetch
# Configure only, don't deploy (for planning)
wild-service-setup cert-manager --no-deploy
# Fresh templates + configure + deploy
wild-service-setup cert-manager --fetch
```
### Service Dependencies
Services are automatically deployed in dependency order:
1. **metallb** → Load balancing foundation
2. **traefik** → Ingress (requires metallb)
3. **cert-manager** → TLS certificates (requires traefik)
4. **externaldns** → DNS automation (requires cert-manager)
5. **kubernetes-dashboard** → Admin UI (requires cert-manager)
Each service validates its dependencies before deployment.
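To confirm the foundation is healthy before moving further down the chain, check each deployment directly. A quick sketch, assuming the default namespaces and deployment names used by the setup scripts:
```bash
# Core services in dependency order (namespaces/names assumed from the setup scripts)
kubectl get deployment controller -n metallb-system
kubectl get deployment traefik -n traefik
kubectl get deployment cert-manager cert-manager-webhook -n cert-manager
kubectl get deployment external-dns -n externaldns
kubectl get deployment kubernetes-dashboard -n kubernetes-dashboard
```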
## Idempotent Design
All setup is designed to be idempotent and reliable:
- **Atomic Operations**: Each service handles its complete lifecycle
- **Dependency Validation**: Services check prerequisites before deployment
- **Error Recovery**: Failed services can be individually fixed and re-run
- **Safe Retries**: Operations can be repeated without harm
- **Incremental Updates**: Configuration changes applied cleanly
Example recovery from cert-manager failure:
```bash
# Fix the issue, then resume
wild-service-setup cert-manager --fetch
# Continue with remaining services
wild-service-setup externaldns --fetch
```

View File

@@ -0,0 +1,260 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
CERT_MANAGER_DIR="${CLUSTER_SETUP_DIR}/cert-manager"
echo "🔧 === Setting up cert-manager ==="
echo ""
#######################
# Dependencies
#######################
# Check Traefik dependency
echo "🔍 Verifying Traefik is ready (required for cert-manager)..."
kubectl wait --for=condition=Available deployment/traefik -n traefik --timeout=60s 2>/dev/null || {
echo "⚠️ Traefik not ready, but continuing with cert-manager installation"
echo "💡 Note: cert-manager may not work properly without Traefik"
}
if [ ! -d "${CERT_MANAGER_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${CERT_MANAGER_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
# Note: DNS validation and Cloudflare token setup moved to configuration phase
# The configuration should be set via: wild config set cluster.certManager.cloudflare.*
########################
# Kubernetes components
########################
echo "📦 Installing cert-manager components..."
# Using stable URL for cert-manager installation
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml || \
kubectl apply -f https://github.com/jetstack/cert-manager/releases/download/v1.13.1/cert-manager.yaml
# Wait for cert-manager to be ready
echo "⏳ Waiting for cert-manager to be ready..."
kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-cainjector -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s
# Create Cloudflare API token secret
# Read token from Wild Central secrets file
echo "🔐 Creating Cloudflare API token secret..."
SECRETS_FILE="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}/secrets.yaml"
CLOUDFLARE_API_TOKEN=$(yq '.cloudflare.token' "$SECRETS_FILE" 2>/dev/null)
CLOUDFLARE_API_TOKEN=$(echo "$CLOUDFLARE_API_TOKEN" | tr -d '"')
if [ -z "$CLOUDFLARE_API_TOKEN" ] || [ "$CLOUDFLARE_API_TOKEN" = "null" ]; then
echo "❌ ERROR: Cloudflare API token not found"
echo "💡 Please set: wild secret set cloudflare.token YOUR_TOKEN"
exit 1
fi
kubectl create secret generic cloudflare-api-token \
--namespace cert-manager \
--from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \
--dry-run=client -o yaml | kubectl apply -f -
# Ensure webhook is fully operational
echo "🔍 Verifying cert-manager webhook is fully operational..."
until kubectl get validatingwebhookconfigurations cert-manager-webhook &>/dev/null; do
echo "⏳ Waiting for cert-manager webhook to register..."
sleep 5
done
# Configure cert-manager to use external DNS for challenge verification
echo "🌐 Configuring cert-manager to use external DNS servers..."
kubectl patch deployment cert-manager -n cert-manager --patch '
spec:
template:
spec:
dnsPolicy: None
dnsConfig:
nameservers:
- "1.1.1.1"
- "8.8.8.8"
searches:
- cert-manager.svc.cluster.local
- svc.cluster.local
- cluster.local
options:
- name: ndots
value: "5"'
# Wait for cert-manager to restart with new DNS config
echo "⏳ Waiting for cert-manager to restart with new DNS configuration..."
kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
########################
# Create issuers and certificates
########################
# Apply Let's Encrypt issuers and certificates using kustomize
echo "🚀 Creating Let's Encrypt issuers and certificates..."
kubectl apply -k ${CERT_MANAGER_DIR}/kustomize
# Wait for issuers to be ready
echo "⏳ Waiting for Let's Encrypt issuers to be ready..."
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-prod --timeout=60s || echo "⚠️ Production issuer not ready, proceeding anyway..."
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-staging --timeout=60s || echo "⚠️ Staging issuer not ready, proceeding anyway..."
# Give cert-manager a moment to process the certificates
sleep 5
######################################
# Fix stuck certificates and cleanup
######################################
needs_restart=false
# STEP 1: Fix certificates stuck with 404 errors
echo "🔍 Checking for certificates with failed issuance attempts..."
stuck_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | \
jq -r '.items[] | select(.status.conditions[]? | select(.type=="Issuing" and .status=="False" and (.message | contains("404")))) | "\(.metadata.namespace) \(.metadata.name)"')
if [ -n "$stuck_certs" ]; then
echo "⚠️ Found certificates stuck with non-existent orders, recreating them..."
echo "$stuck_certs" | while read ns name; do
echo "🔄 Recreating certificate $ns/$name..."
cert_spec=$(kubectl get certificate "$name" -n "$ns" -o json | jq '.spec')
kubectl delete certificate "$name" -n "$ns"
echo "{\"apiVersion\":\"cert-manager.io/v1\",\"kind\":\"Certificate\",\"metadata\":{\"name\":\"$name\",\"namespace\":\"$ns\"},\"spec\":$cert_spec}" | kubectl apply -f -
done
needs_restart=true
sleep 5
else
echo "✅ No certificates stuck with failed orders"
fi
# STEP 2: Clean up orphaned orders
echo "🔍 Checking for orphaned ACME orders..."
orphaned_orders=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null | \
grep -E "failed to retrieve the ACME order.*404" 2>/dev/null | \
sed -n 's/.*resource_name="\([^"]*\)".*/\1/p' | \
sort -u || true)
if [ -n "$orphaned_orders" ]; then
echo "⚠️ Found orphaned ACME orders from logs"
for order in $orphaned_orders; do
echo "🗑️ Deleting orphaned order: $order"
orders_found=$(kubectl get orders --all-namespaces 2>/dev/null | grep "$order" 2>/dev/null || true)
if [ -n "$orders_found" ]; then
echo "$orders_found" | while read ns name rest; do
kubectl delete order "$name" -n "$ns" 2>/dev/null || true
done
fi
done
needs_restart=true
else
echo "✅ No orphaned orders found in logs"
fi
# STEP 2.5: Check for Cloudflare DNS cleanup errors
echo "🔍 Checking for Cloudflare DNS cleanup errors..."
cloudflare_errors=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null | \
grep -c "Error: 7003.*Could not route" 2>/dev/null || true)
cloudflare_errors=${cloudflare_errors:-0}
if [ "$cloudflare_errors" -gt "0" ]; then
echo "⚠️ Found $cloudflare_errors Cloudflare DNS cleanup errors (stale DNS record references)"
echo "💡 Deleting stuck challenges and orders to allow fresh start"
# Delete all challenges and orders in cert-manager namespace
kubectl delete challenges --all -n cert-manager 2>/dev/null || true
kubectl delete orders --all -n cert-manager 2>/dev/null || true
needs_restart=true
else
echo "✅ No Cloudflare DNS cleanup errors"
fi
# STEP 3: Single restart if anything needs cleaning
if [ "$needs_restart" = true ]; then
echo "🔄 Restarting cert-manager to clear internal state..."
kubectl rollout restart deployment cert-manager -n cert-manager
kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
echo "⏳ Waiting for cert-manager to recreate fresh challenges..."
sleep 15
else
echo "✅ No restart needed - cert-manager state is clean"
fi
#########################
# Final checks
#########################
# Wait for the certificates to be issued with progress feedback
echo "⏳ Waiting for wildcard certificates to be ready (this may take several minutes)..."
# Function to wait for certificate with progress output
wait_for_cert() {
local cert_name="$1"
local timeout=300
local elapsed=0
echo " 📜 Checking $cert_name..."
while [ $elapsed -lt $timeout ]; do
if kubectl get certificate "$cert_name" -n cert-manager -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q "True"; then
echo "$cert_name is ready"
return 0
fi
# Show progress every 30 seconds
if [ $((elapsed % 30)) -eq 0 ] && [ $elapsed -gt 0 ]; then
local status=$(kubectl get certificate "$cert_name" -n cert-manager -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "Waiting...")
echo " ⏳ Still waiting for $cert_name... ($elapsed/${timeout}s) - $status"
fi
sleep 5
elapsed=$((elapsed + 5))
done
echo " ⚠️ Timeout waiting for $cert_name (will continue anyway)"
return 1
}
wait_for_cert "wildcard-internal-wild-cloud"
wait_for_cert "wildcard-wild-cloud"
# Final health check
echo "🔍 Performing final cert-manager health check..."
failed_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Ready" and .status!="True")) | "\(.metadata.namespace)/\(.metadata.name)"' | wc -l)
if [ "$failed_certs" -gt 0 ]; then
echo "⚠️ Found $failed_certs certificates not in Ready state"
echo "💡 Check certificate status with: kubectl get certificates --all-namespaces"
echo "💡 Check cert-manager logs with: kubectl logs -n cert-manager deployment/cert-manager"
else
echo "✅ All certificates are in Ready state"
fi
echo ""
echo "✅ cert-manager setup complete!"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get certificates --all-namespaces"
echo " kubectl get clusterissuers"

View File

@@ -0,0 +1,19 @@
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: wildcard-internal-wild-cloud
  namespace: cert-manager
spec:
  secretName: wildcard-internal-wild-cloud-tls
  dnsNames:
    - "*.{{ .cloud.internalDomain }}"
    - "{{ .cloud.internalDomain }}"
  issuerRef:
    name: letsencrypt-prod
    kind: ClusterIssuer
  duration: 2160h # 90 days
  renewBefore: 360h # 15 days
  privateKey:
    algorithm: RSA
    size: 2048

View File

@@ -0,0 +1,12 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- letsencrypt-staging-dns01.yaml
- letsencrypt-prod-dns01.yaml
- internal-wildcard-certificate.yaml
- wildcard-certificate.yaml
# Note: cert-manager.yaml contains the main installation manifests
# but is applied separately via URL in the install script

View File

@@ -0,0 +1,25 @@
---
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt-prod
spec:
  acme:
    email: {{ .operator.email }}
    privateKeySecretRef:
      name: letsencrypt-prod
    server: https://acme-v02.api.letsencrypt.org/directory
    solvers:
      # DNS-01 solver for wildcard certificates
      - dns01:
          cloudflare:
            apiTokenSecretRef:
              name: cloudflare-api-token
              key: api-token
        selector:
          dnsZones:
            - "{{ .cluster.certManager.cloudflare.domain }}"
      # Keep the HTTP-01 solver for non-wildcard certificates
      - http01:
          ingress:
            class: traefik

View File

@@ -0,0 +1,25 @@
---
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt-staging
spec:
  acme:
    email: {{ .operator.email }}
    privateKeySecretRef:
      name: letsencrypt-staging
    server: https://acme-staging-v02.api.letsencrypt.org/directory
    solvers:
      # DNS-01 solver for wildcard certificates
      - dns01:
          cloudflare:
            apiTokenSecretRef:
              name: cloudflare-api-token
              key: api-token
        selector:
          dnsZones:
            - "{{ .cluster.certManager.cloudflare.domain }}"
      # Keep the HTTP-01 solver for non-wildcard certificates
      - http01:
          ingress:
            class: traefik

View File

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: cert-manager

View File

@@ -0,0 +1,19 @@
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: wildcard-wild-cloud
  namespace: cert-manager
spec:
  secretName: wildcard-wild-cloud-tls
  dnsNames:
    - "*.{{ .cloud.domain }}"
    - "{{ .cloud.domain }}"
  issuerRef:
    name: letsencrypt-prod
    kind: ClusterIssuer
  duration: 2160h # 90 days
  renewBefore: 360h # 15 days
  privateKey:
    algorithm: RSA
    size: 2048

View File

@@ -0,0 +1,25 @@
name: cert-manager
description: X.509 certificate management for Kubernetes
namespace: cert-manager
category: infrastructure
dependencies:
  - traefik
configReferences:
  - cloud.domain
  - cloud.baseDomain
  - cloud.internalDomain
  - operator.email
serviceConfig:
  cloudflareDomain:
    path: cluster.certManager.cloudflare.domain
    prompt: "Enter Cloudflare domain"
    default: "{{ .cloud.baseDomain }}"
    type: string
  cloudflareZoneID:
    path: cluster.certManager.cloudflare.zoneID
    prompt: "Enter Cloudflare zone ID"
    default: ""
    type: string

View File

@@ -0,0 +1,112 @@
#!/bin/bash
# Common functions for Wild Central service installation scripts
# TODO: We should use this. :P
# Ensure required environment variables are set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE environment variable is not set"
exit 1
fi
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA environment variable is not set"
exit 1
fi
# Get the instance directory path
get_instance_dir() {
echo "${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
}
# Get the secrets file path
get_secrets_file() {
echo "$(get_instance_dir)/secrets.yaml"
}
# Get the config file path
get_config_file() {
echo "$(get_instance_dir)/config.yaml"
}
# Get a secret value from the secrets file
# Usage: get_secret "path.to.secret"
get_secret() {
local path="$1"
local secrets_file="$(get_secrets_file)"
if [ ! -f "$secrets_file" ]; then
echo ""
return 1
fi
local value=$(yq ".$path" "$secrets_file" 2>/dev/null)
# Remove quotes and return empty string if null
value=$(echo "$value" | tr -d '"')
if [ "$value" = "null" ]; then
echo ""
return 1
fi
echo "$value"
}
# Get a config value from the config file
# Usage: get_config "path.to.config"
get_config() {
local path="$1"
local config_file="$(get_config_file)"
if [ ! -f "$config_file" ]; then
echo ""
return 1
fi
local value=$(yq ".$path" "$config_file" 2>/dev/null)
# Remove quotes and return empty string if null
value=$(echo "$value" | tr -d '"')
if [ "$value" = "null" ]; then
echo ""
return 1
fi
echo "$value"
}
# Check if a secret exists and is not empty
# Usage: require_secret "path.to.secret" "Friendly Name" "wild secret set command"
require_secret() {
local path="$1"
local name="$2"
local set_command="$3"
local value=$(get_secret "$path")
if [ -z "$value" ]; then
echo "❌ ERROR: $name not found"
echo "💡 Please set: $set_command"
exit 1
fi
echo "$value"
}
# Check if a config value exists and is not empty
# Usage: require_config "path.to.config" "Friendly Name" "wild config set command"
require_config() {
local path="$1"
local name="$2"
local set_command="$3"
local value=$(get_config "$path")
if [ -z "$value" ]; then
echo "❌ ERROR: $name not found"
echo "💡 Please set: $set_command"
exit 1
fi
echo "$value"
}

View File

@@ -0,0 +1,45 @@
# CoreDNS
- https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/
- https://github.com/kubernetes/dns/blob/master/docs/specification.md
- https://coredns.io/
CoreDNS runs the `kubernetes` plugin, so it returns all Kubernetes service endpoints in the well-known format.
All services and pods are registered in CoreDNS.
- <service-name>.<namespace>.svc.cluster.local
- <service-name>.<namespace>
- <service-name> (if in the same namespace)
- <pod-ipv4-address>.<namespace>.pod.cluster.local
- <pod-ipv4-address>.<service-name>.<namespace>.svc.cluster.local
Any query for a name under the `internal.$DOMAIN` domain is answered with the IP of the Traefik proxy. We expose the CoreDNS server on the LAN via MetalLB specifically for this capability.
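To spot-check this from a machine on the LAN, query the exposed CoreDNS service directly. A minimal sketch, assuming the service is published as `coredns` in `kube-system` and your internal domain is `internal.example.com`:
```bash
# Find the LoadBalancer IP MetalLB assigned to the exposed CoreDNS service
kubectl get svc -n kube-system coredns

# Any name under the internal domain should resolve to the Traefik proxy IP
dig +short test.internal.example.com @<coredns-loadbalancer-ip>
```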
## Default CoreDNS Configuration
This is the default CoreDNS configuration, for reference:
```txt
.:53 {
    errors
    health { lameduck 5s }
    ready
    log . { class error }
    prometheus :9153
    kubernetes cluster.local in-addr.arpa ip6.arpa {
        pods insecure
        fallthrough in-addr.arpa ip6.arpa
        ttl 30
    }
    forward . /etc/resolv.conf { max_concurrent 1000 }
    cache 30 {
        disable success cluster.local
        disable denial cluster.local
    }
    loop
    reload
    loadbalance
}
```

View File

@@ -0,0 +1,57 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
COREDNS_DIR="${CLUSTER_SETUP_DIR}/coredns"
echo "🔧 === Setting up CoreDNS ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled CoreDNS templates..."
if [ ! -d "${COREDNS_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${COREDNS_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
# Apply the custom DNS override
# TODO: Is this needed now that we are no longer on k3s?
echo "🚀 Applying CoreDNS custom override configuration..."
kubectl apply -f "${COREDNS_DIR}/kustomize/coredns-custom-config.yaml"
echo "🔄 Restarting CoreDNS pods to apply changes..."
kubectl rollout restart deployment/coredns -n kube-system
echo "⏳ Waiting for CoreDNS rollout to complete..."
kubectl rollout status deployment/coredns -n kube-system
echo ""
echo "✅ CoreDNS configured successfully"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get pods -n kube-system -l k8s-app=kube-dns"
echo " kubectl get svc -n kube-system coredns"
echo " kubectl describe svc -n kube-system coredns"
echo ""
echo "📋 To view CoreDNS logs:"
echo " kubectl logs -n kube-system -l k8s-app=kube-dns -f"

View File

@@ -0,0 +1,28 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: coredns-custom
  namespace: kube-system
data:
  # Custom server block for internal domains. All internal domains should
  # resolve to the cluster proxy.
  internal.server: |
    {{ .cloud.internalDomain }} {
      errors
      cache 30
      reload
      template IN A {
        match (.*)\.{{ .cloud.internalDomain | strings.ReplaceAll "." "\\." }}\.
        answer "{{`{{ .Name }}`}} 60 IN A {{ .cluster.loadBalancerIp }}"
      }
      template IN AAAA {
        match (.*)\.{{ .cloud.internalDomain | strings.ReplaceAll "." "\\." }}\.
        rcode NXDOMAIN
      }
    }
  # Custom override to set external resolvers.
  external.override: |
    forward . {{ .cloud.dns.externalResolver }} {
      max_concurrent 1000
    }

View File

@@ -0,0 +1,15 @@
name: coredns
description: DNS server for internal cluster DNS resolution
namespace: kube-system
category: infrastructure
configReferences:
  - cloud.internalDomain
  - cluster.loadBalancerIp
serviceConfig:
  externalResolver:
    path: cloud.dns.externalResolver
    prompt: "Enter external DNS resolver"
    default: "8.8.8.8"
    type: string

View File

@@ -0,0 +1,53 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
DOCKER_REGISTRY_DIR="${CLUSTER_SETUP_DIR}/docker-registry"
echo "🔧 === Setting up Docker Registry ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled Docker Registry templates..."
if [ ! -d "${DOCKER_REGISTRY_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${DOCKER_REGISTRY_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
echo "🚀 Deploying Docker Registry..."
kubectl apply -k "${DOCKER_REGISTRY_DIR}/kustomize"
echo "⏳ Waiting for Docker Registry to be ready..."
kubectl wait --for=condition=available --timeout=300s deployment/docker-registry -n docker-registry
echo ""
echo "✅ Docker Registry installed successfully"
echo ""
echo "📊 Deployment status:"
kubectl get pods -n docker-registry
kubectl get services -n docker-registry
echo ""
echo "💡 To use the registry:"
echo " docker tag myimage registry.local/myimage"
echo " docker push registry.local/myimage"

View File

@@ -0,0 +1,36 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docker-registry
  labels:
    app: docker-registry
spec:
  replicas: 1
  selector:
    matchLabels:
      app: docker-registry
  strategy:
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: docker-registry
    spec:
      containers:
        - image: registry:3.0.0
          name: docker-registry
          ports:
            - containerPort: 5000
              protocol: TCP
          volumeMounts:
            - mountPath: /var/lib/registry
              name: docker-registry-storage
              readOnly: false
      volumes:
        - name: docker-registry-storage
          persistentVolumeClaim:
            claimName: docker-registry-pvc

View File

@@ -0,0 +1,20 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: docker-registry
spec:
  rules:
    - host: {{ .cloud.dockerRegistryHost }}
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: docker-registry
                port:
                  number: 5000
  tls:
    - hosts:
        - {{ .cloud.dockerRegistryHost }}
      secretName: wildcard-internal-wild-cloud-tls

View File

@@ -0,0 +1,14 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: docker-registry
labels:
  - includeSelectors: true
    pairs:
      app: docker-registry
      managedBy: wild-cloud
resources:
  - deployment.yaml
  - ingress.yaml
  - service.yaml
  - namespace.yaml
  - pvc.yaml

View File

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: docker-registry

View File

@@ -0,0 +1,12 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: docker-registry-pvc
spec:
  storageClassName: longhorn
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  resources:
    requests:
      storage: {{ .cluster.dockerRegistry.storage }}

View File

@@ -0,0 +1,13 @@
---
apiVersion: v1
kind: Service
metadata:
  name: docker-registry
  labels:
    app: docker-registry
spec:
  ports:
    - port: 5000
      targetPort: 5000
  selector:
    app: docker-registry

View File

@@ -0,0 +1,20 @@
name: docker-registry
description: Private Docker image registry for cluster
namespace: docker-registry
category: infrastructure
dependencies:
  - traefik
  - cert-manager
serviceConfig:
  registryHost:
    path: cloud.dockerRegistryHost
    prompt: "Enter Docker Registry hostname"
    default: "registry.{{ .cloud.internalDomain }}"
    type: string
  storage:
    path: cluster.dockerRegistry.storage
    prompt: "Enter Docker Registry storage size"
    default: "100Gi"
    type: string

View File

@@ -0,0 +1,14 @@
# External DNS
See: https://github.com/kubernetes-sigs/external-dns
ExternalDNS keeps selected DNS zones (chosen via `--domain-filter`) synchronized with Ingresses, Services of `type=LoadBalancer`, and nodes across various DNS providers.
Currently, we are only configured to use Cloudflare.
Docs: https://github.com/kubernetes-sigs/external-dns/blob/master/docs/tutorials/cloudflare.md
Any Ingress that has a `metadata.annotations` entry of
`external-dns.alpha.kubernetes.io/hostname: <something>.${DOMAIN}`
will have its Cloudflare records created by ExternalDNS.
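For example, a minimal Ingress carrying the annotation might look like this (hostname and service names are illustrative):
```yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: example-app
  annotations:
    # ExternalDNS sees this and creates the matching Cloudflare record.
    external-dns.alpha.kubernetes.io/hostname: app.example.com
spec:
  rules:
    - host: app.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: example-app
                port:
                  number: 80
```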

View File

@@ -0,0 +1,79 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
EXTERNALDNS_DIR="${CLUSTER_SETUP_DIR}/externaldns"
echo "🌐 === Setting up ExternalDNS ==="
echo ""
# Check cert-manager dependency
echo "🔍 Verifying cert-manager is ready (required for ExternalDNS)..."
kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=60s 2>/dev/null && \
kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=60s 2>/dev/null || {
echo "⚠️ cert-manager not ready, but continuing with ExternalDNS installation"
echo "💡 Note: ExternalDNS may not work properly without cert-manager"
}
# Templates should already be compiled
echo "📦 Using pre-compiled ExternalDNS templates..."
if [ ! -d "${EXTERNALDNS_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${EXTERNALDNS_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
# Apply ExternalDNS manifests using kustomize
echo "🚀 Deploying ExternalDNS..."
kubectl apply -k ${EXTERNALDNS_DIR}/kustomize
# Setup Cloudflare API token secret
echo "🔐 Creating Cloudflare API token secret..."
SECRETS_FILE="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}/secrets.yaml"
CLOUDFLARE_API_TOKEN=$(yq '.cloudflare.token' "$SECRETS_FILE" 2>/dev/null | tr -d '"')
if [ -z "$CLOUDFLARE_API_TOKEN" ] || [ "$CLOUDFLARE_API_TOKEN" = "null" ]; then
echo "❌ ERROR: Cloudflare API token not found."
echo "💡 Please set: wild secret set cloudflare.token YOUR_TOKEN"
exit 1
fi
kubectl create secret generic cloudflare-api-token \
--namespace externaldns \
--from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \
--dry-run=client -o yaml | kubectl apply -f -
# Wait for ExternalDNS to be ready
echo "⏳ Waiting for Cloudflare ExternalDNS to be ready..."
kubectl rollout status deployment/external-dns -n externaldns --timeout=60s
# echo "⏳ Waiting for CoreDNS ExternalDNS to be ready..."
# kubectl rollout status deployment/external-dns-coredns -n externaldns --timeout=60s
echo ""
echo "✅ ExternalDNS installed successfully"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get pods -n externaldns"
echo " kubectl logs -n externaldns -l app=external-dns -f"
echo " kubectl logs -n externaldns -l app=external-dns-coredns -f"
echo ""

View File

@@ -0,0 +1,39 @@
---
# Cloudflare provider for ExternalDNS
apiVersion: apps/v1
kind: Deployment
metadata:
  name: external-dns
  namespace: externaldns
spec:
  selector:
    matchLabels:
      app: external-dns
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: external-dns
    spec:
      serviceAccountName: external-dns
      containers:
        - name: external-dns
          image: registry.k8s.io/external-dns/external-dns:v0.13.4
          args:
            - --source=service
            - --source=ingress
            - --txt-owner-id={{ .cluster.externalDns.ownerId }}
            - --provider=cloudflare
            - --domain-filter=payne.io
            #- --exclude-domains=internal.${DOMAIN}
            - --cloudflare-dns-records-per-page=5000
            - --publish-internal-services
            - --no-cloudflare-proxied
            - --log-level=debug
          env:
            - name: CF_API_TOKEN
              valueFrom:
                secretKeyRef:
                  name: cloudflare-api-token
                  key: api-token

View File

@@ -0,0 +1,35 @@
---
# Common RBAC resources for all ExternalDNS deployments
apiVersion: v1
kind: ServiceAccount
metadata:
  name: external-dns
  namespace: externaldns
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: external-dns
rules:
  - apiGroups: [""]
    resources: ["services", "endpoints", "pods"]
    verbs: ["get", "watch", "list"]
  - apiGroups: ["extensions", "networking.k8s.io"]
    resources: ["ingresses"]
    verbs: ["get", "watch", "list"]
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: external-dns-viewer
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: external-dns
subjects:
  - kind: ServiceAccount
    name: external-dns
    namespace: externaldns

View File

@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- externaldns-rbac.yaml
- externaldns-cloudflare.yaml

View File

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: externaldns

View File

@@ -0,0 +1,15 @@
name: externaldns
description: Automatically configures DNS records for services
namespace: externaldns
category: infrastructure
configReferences:
  - cloud.internalDomain
  - cluster.name
serviceConfig:
  ownerId:
    path: cluster.externalDns.ownerId
    prompt: "Enter ExternalDNS owner ID (unique identifier for this cluster)"
    default: "wild-cloud-{{ .cluster.name }}"
    type: string

View File

@@ -0,0 +1,91 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
KUBERNETES_DASHBOARD_DIR="${CLUSTER_SETUP_DIR}/kubernetes-dashboard"
echo "🎮 === Setting up Kubernetes Dashboard ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled Dashboard templates..."
if [ ! -d "${KUBERNETES_DASHBOARD_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${KUBERNETES_DASHBOARD_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
NAMESPACE="kubernetes-dashboard"
# Apply the official dashboard installation
echo "🚀 Installing Kubernetes Dashboard core components..."
kubectl apply -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.7.0/aio/deploy/recommended.yaml
# Wait for cert-manager certificates to be ready
echo "🔐 Waiting for cert-manager certificates to be ready..."
kubectl wait --for=condition=Ready certificate wildcard-internal-wild-cloud -n cert-manager --timeout=300s || echo "⚠️ Warning: Internal wildcard certificate not ready yet"
kubectl wait --for=condition=Ready certificate wildcard-wild-cloud -n cert-manager --timeout=300s || echo "⚠️ Warning: Wildcard certificate not ready yet"
# Copying cert-manager secrets to the dashboard namespace (if available)
echo "📋 Copying cert-manager secrets to dashboard namespace..."
if kubectl get secret wildcard-internal-wild-cloud-tls -n cert-manager >/dev/null 2>&1; then
kubectl get secret wildcard-internal-wild-cloud-tls -n cert-manager -o yaml | \
sed "s/namespace: cert-manager/namespace: ${NAMESPACE}/" | \
kubectl apply -f -
else
echo "⚠️ Warning: wildcard-internal-wild-cloud-tls secret not yet available"
fi
if kubectl get secret wildcard-wild-cloud-tls -n cert-manager >/dev/null 2>&1; then
kubectl get secret wildcard-wild-cloud-tls -n cert-manager -o yaml | \
sed "s/namespace: cert-manager/namespace: ${NAMESPACE}/" | \
kubectl apply -f -
else
echo "⚠️ Warning: wildcard-wild-cloud-tls secret not yet available"
fi
# Apply dashboard customizations using kustomize
echo "🔧 Applying dashboard customizations..."
kubectl apply -k "${KUBERNETES_DASHBOARD_DIR}/kustomize"
# Restart CoreDNS to pick up the changes
# echo "🔄 Restarting CoreDNS to pick up DNS changes..."
# kubectl delete pods -n kube-system -l k8s-app=kube-dns
# Wait for dashboard to be ready
echo "⏳ Waiting for Kubernetes Dashboard to be ready..."
kubectl rollout status deployment/kubernetes-dashboard -n $NAMESPACE --timeout=60s
echo ""
echo "✅ Kubernetes Dashboard installed successfully"
echo ""
# INTERNAL_DOMAIN should be available in environment (set from config before deployment)
if [ -n "${INTERNAL_DOMAIN}" ]; then
echo "🌐 Access the dashboard at: https://dashboard.${INTERNAL_DOMAIN}"
else
echo "💡 Access the dashboard via the configured internal domain"
fi
echo ""
echo "💡 To get the authentication token:"
echo " kubectl create token admin-user -n kubernetes-dashboard"
echo ""

View File

@@ -0,0 +1,32 @@
---
# Service Account and RBAC for Dashboard admin access
apiVersion: v1
kind: ServiceAccount
metadata:
  name: dashboard-admin
  namespace: kubernetes-dashboard
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: dashboard-admin
subjects:
  - kind: ServiceAccount
    name: dashboard-admin
    namespace: kubernetes-dashboard
roleRef:
  kind: ClusterRole
  name: cluster-admin
  apiGroup: rbac.authorization.k8s.io
---
# Token for dashboard-admin
apiVersion: v1
kind: Secret
metadata:
  name: dashboard-admin-token
  namespace: kubernetes-dashboard
  annotations:
    kubernetes.io/service-account.name: dashboard-admin
type: kubernetes.io/service-account-token

View File

@@ -0,0 +1,84 @@
---
# Internal-only middleware
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  name: internal-only
  namespace: kubernetes-dashboard
spec:
  ipWhiteList:
    # Restrict to local private network ranges
    sourceRange:
      - 127.0.0.1/32 # localhost
      - 10.0.0.0/8 # Private network
      - 172.16.0.0/12 # Private network
      - 192.168.0.0/16 # Private network
---
# HTTPS redirect middleware
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  name: dashboard-redirect-scheme
  namespace: kubernetes-dashboard
spec:
  redirectScheme:
    scheme: https
    permanent: true
---
# IngressRoute for Dashboard
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: kubernetes-dashboard-https
  namespace: kubernetes-dashboard
spec:
  entryPoints:
    - websecure
  routes:
    - match: Host(`dashboard.{{ .cloud.internalDomain }}`)
      kind: Rule
      middlewares:
        - name: internal-only
          namespace: kubernetes-dashboard
      services:
        - name: kubernetes-dashboard
          port: 443
          serversTransport: dashboard-transport
  tls:
    secretName: wildcard-internal-wild-cloud-tls
---
# HTTP to HTTPS redirect.
# FIXME: Is this needed?
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: kubernetes-dashboard-http
  namespace: kubernetes-dashboard
spec:
  entryPoints:
    - web
  routes:
    - match: Host(`dashboard.{{ .cloud.internalDomain }}`)
      kind: Rule
      middlewares:
        - name: dashboard-redirect-scheme
          namespace: kubernetes-dashboard
      services:
        - name: kubernetes-dashboard
          port: 443
          serversTransport: dashboard-transport
---
# ServersTransport for HTTPS backend with skip verify.
# FIXME: Is this needed?
apiVersion: traefik.io/v1alpha1
kind: ServersTransport
metadata:
  name: dashboard-transport
  namespace: kubernetes-dashboard
spec:
  insecureSkipVerify: true
  serverName: dashboard.{{ .cloud.internalDomain }}

View File

@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- dashboard-admin-rbac.yaml
- dashboard-kube-system.yaml

View File

@@ -0,0 +1,11 @@
name: kubernetes-dashboard
description: Web-based Kubernetes user interface
namespace: kubernetes-dashboard
category: infrastructure
dependencies:
- traefik
- cert-manager
configReferences:
- cloud.internalDomain

View File

@@ -0,0 +1,20 @@
# Longhorn Storage
See: [Longhorn Docs v 1.8.1](https://longhorn.io/docs/1.8.1/deploy/install/install-with-kubectl/)
## Installation Notes
- Manifest copied from https://raw.githubusercontent.com/longhorn/longhorn/v1.8.1/deploy/longhorn.yaml
- Using kustomize to apply custom configuration (see `kustomization.yaml`)
## Important Settings
- **Number of Replicas**: Set to 1 (default is 3) to accommodate smaller clusters
- This avoids "degraded" volumes when fewer than 3 nodes are available
- For production with 3+ nodes, consider changing back to 3 for better availability
## Common Operations
- View volumes: `kubectl get volumes.longhorn.io -n longhorn-system`
- Check volume status: `kubectl describe volumes.longhorn.io <volume-name> -n longhorn-system`
- Access Longhorn UI: Set up port-forwarding with `kubectl -n longhorn-system port-forward service/longhorn-frontend 8080:80`
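## Example PVC
Applications request Longhorn-backed storage with a standard PVC using `storageClassName: longhorn`. A minimal sketch (name and size are illustrative):
```yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: example-data
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: longhorn
  resources:
    requests:
      storage: 10Gi
```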

View File

@@ -0,0 +1,52 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
LONGHORN_DIR="${CLUSTER_SETUP_DIR}/longhorn"
echo "🔧 === Setting up Longhorn ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled Longhorn templates..."
if [ ! -d "${LONGHORN_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${LONGHORN_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
echo "🚀 Deploying Longhorn..."
kubectl apply -k ${LONGHORN_DIR}/kustomize/
echo "⏳ Waiting for Longhorn to be ready..."
kubectl wait --for=condition=available --timeout=300s deployment/longhorn-driver-deployer -n longhorn-system || true
echo ""
echo "✅ Longhorn installed successfully"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get pods -n longhorn-system"
echo " kubectl get storageclass"
echo ""
echo "🌐 To access the Longhorn UI:"
echo " kubectl port-forward -n longhorn-system svc/longhorn-frontend 8080:80"

View File

@@ -0,0 +1,21 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: longhorn-ingress
  namespace: longhorn-system
spec:
  rules:
    - host: "longhorn.{{ .cloud.internalDomain }}"
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: longhorn-frontend
                port:
                  number: 80
  tls:
    - secretName: wildcard-internal-wild-cloud-tls
      hosts:
        - "longhorn.{{ .cloud.internalDomain }}"

View File

@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- longhorn.yaml
- ingress.yaml

File diff suppressed because it is too large

View File

@@ -0,0 +1,7 @@
name: longhorn
description: Cloud-native distributed block storage for Kubernetes
namespace: longhorn-system
category: infrastructure
dependencies:
- traefik

View File

@@ -0,0 +1,56 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
METALLB_DIR="${CLUSTER_SETUP_DIR}/metallb"
echo "🔧 === Setting up MetalLB ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled MetalLB templates..."
if [ ! -d "${METALLB_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${METALLB_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
echo "🚀 Deploying MetalLB installation..."
kubectl apply -k ${METALLB_DIR}/kustomize/installation
echo "⏳ Waiting for MetalLB controller to be ready..."
kubectl wait --for=condition=Available deployment/controller -n metallb-system --timeout=60s
echo "⏳ Extra buffer for webhook initialization..."
sleep 10
echo "⚙️ Applying MetalLB configuration..."
kubectl apply -k ${METALLB_DIR}/kustomize/configuration
echo ""
echo "✅ MetalLB installed and configured successfully"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get pods -n metallb-system"
echo " kubectl get ipaddresspools.metallb.io -n metallb-system"
echo ""
echo "🌐 MetalLB will now provide LoadBalancer IPs for your services"

View File

@@ -0,0 +1,3 @@
namespace: metallb-system
resources:
- pool.yaml

View File

@@ -0,0 +1,19 @@
---
apiVersion: metallb.io/v1beta1
kind: IPAddressPool
metadata:
  name: first-pool
  namespace: metallb-system
spec:
  addresses:
    - {{ .cluster.ipAddressPool }}
---
apiVersion: metallb.io/v1beta1
kind: L2Advertisement
metadata:
  name: l2-advertisement
  namespace: metallb-system
spec:
  ipAddressPools:
    - first-pool

View File

@@ -0,0 +1,3 @@
namespace: metallb-system
resources:
- github.com/metallb/metallb/config/native?ref=v0.15.0

View File

@@ -0,0 +1,19 @@
name: metallb
description: Bare metal load-balancer for Kubernetes
namespace: metallb-system
category: infrastructure
configReferences:
  - cluster.name
serviceConfig:
  ipRange:
    path: cluster.ipAddressPool
    prompt: "Enter IP range for MetalLB (e.g., 192.168.1.240-192.168.1.250)"
    default: "192.168.1.240-192.168.1.250"
    type: string
  loadBalancerIp:
    path: cluster.loadBalancerIp
    prompt: "Enter primary load balancer IP"
    default: "192.168.1.240"
    type: string

View File

@@ -0,0 +1,60 @@
# NFS Setup (Optional)
The infrastructure supports optional NFS (Network File System) for shared media storage across the cluster. If your config.yaml contains the `cloud.nfs` section, the NFS server will be set up automatically.
## Host Setup
First, set up the NFS server on your chosen host.
```bash
./setup-nfs-host.sh <host> <media-path>
```
Example:
```bash
./setup-nfs-host.sh box-01 /srv/nfs
```
## Cluster Integration
Add to your `config.yaml`:
```yaml
cloud:
  nfs:
    host: box-01
    mediaPath: /srv/nfs
    storageCapacity: 250Gi # Max size for PersistentVolume
```
Now you can run the NFS cluster setup:
```bash
setup/setup-nfs-host.sh
```
## Features
- Automatic IP detection - Uses network IP even when hostname resolves to localhost
- Cluster-wide access - Any pod can mount the NFS share regardless of node placement
- Configurable capacity - Set PersistentVolume size via `NFS_STORAGE_CAPACITY`
- ReadWriteMany - Multiple pods can simultaneously access the same storage
## Usage
Applications can use NFS storage by setting `storageClassName: nfs` in their PVCs:
```yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: media-pvc
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: nfs
  resources:
    requests:
      storage: 100Gi
```
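A pod then mounts the claim like any other volume; a minimal sketch (names are illustrative):
```yaml
apiVersion: v1
kind: Pod
metadata:
  name: media-reader
spec:
  containers:
    - name: app
      image: busybox:1.36
      command: ["sh", "-c", "sleep 3600"]
      volumeMounts:
        - name: media
          mountPath: /media
  volumes:
    - name: media
      persistentVolumeClaim:
        claimName: media-pvc
```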

View File

@@ -0,0 +1,255 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CONFIG_FILE="${INSTANCE_DIR}/config.yaml"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
NFS_DIR="${CLUSTER_SETUP_DIR}/nfs"
echo "💾 === Registering NFS Server with Kubernetes Cluster ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled NFS templates..."
if [ ! -d "${NFS_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${NFS_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
NFS_HOST="$(yq '.cloud.nfs.host' "${CONFIG_FILE}" 2>/dev/null | tr -d '"')"
NFS_MEDIA_PATH="$(yq '.cloud.nfs.mediaPath' "${CONFIG_FILE}" 2>/dev/null | tr -d '"')"
NFS_STORAGE_CAPACITY="$(yq '.cloud.nfs.storageCapacity' "${CONFIG_FILE}" 2>/dev/null | tr -d '"')"
echo "📋 NFS Configuration:"
echo " Host: ${NFS_HOST}"
echo " Media path: ${NFS_MEDIA_PATH}"
echo " Storage capacity: ${NFS_STORAGE_CAPACITY}"
echo ""
# Validate required config values
if [ -z "${NFS_HOST}" ] || [ "${NFS_HOST}" = "null" ]; then
echo "❌ ERROR: cloud.nfs.host not set in config"
exit 1
fi
if [ -z "${NFS_MEDIA_PATH}" ] || [ "${NFS_MEDIA_PATH}" = "null" ]; then
echo "❌ ERROR: cloud.nfs.mediaPath not set in config"
exit 1
fi
if [ -z "${NFS_STORAGE_CAPACITY}" ] || [ "${NFS_STORAGE_CAPACITY}" = "null" ]; then
echo "❌ ERROR: cloud.nfs.storageCapacity not set in config"
exit 1
fi
# Function to resolve NFS host to IP
resolve_nfs_host() {
echo "🌐 Resolving NFS host: ${NFS_HOST}"
if [[ "${NFS_HOST}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
# NFS_HOST is already an IP address
NFS_HOST_IP="${NFS_HOST}"
echo " Host is already an IP address"
else
# Resolve hostname to IP
echo " 🔍 Looking up hostname..."
NFS_HOST_IP=$(getent hosts "${NFS_HOST}" 2>/dev/null | awk '{print $1}' | head -n1 || true)
echo " 📍 Resolved to: ${NFS_HOST_IP}"
if [[ -z "${NFS_HOST_IP}" ]]; then
echo "❌ ERROR: Unable to resolve hostname ${NFS_HOST} to IP address"
echo "💡 Make sure ${NFS_HOST} is resolvable from this cluster"
exit 1
fi
# Check if resolved IP is localhost - auto-detect network IP instead
if [[ "${NFS_HOST_IP}" =~ ^127\. ]]; then
echo "⚠️ Warning: ${NFS_HOST} resolves to localhost (${NFS_HOST_IP})"
echo "🔍 Auto-detecting network IP for cluster access..."
# Try to find the primary network interface IP (exclude docker/k8s networks)
local network_ip=$(ip route get 8.8.8.8 | grep -oP 'src \K\S+' 2>/dev/null)
if [[ -n "${network_ip}" && ! "${network_ip}" =~ ^127\. ]]; then
echo "✅ Using detected network IP: ${network_ip}"
NFS_HOST_IP="${network_ip}"
else
echo "❌ Could not auto-detect network IP. Available IPs:"
ip addr show | grep "inet " | grep -v "127.0.0.1" | grep -v "10.42" | grep -v "172." | awk '{print " " $2}' | cut -d/ -f1
echo "💡 Please set NFS_HOST to the correct IP address manually."
exit 1
fi
fi
fi
echo "🌐 NFS server IP: ${NFS_HOST_IP}"
export NFS_HOST_IP
}
# Function to test NFS accessibility
test_nfs_accessibility() {
echo ""
echo "🔍 Testing NFS accessibility from cluster..."
# Check if showmount is available
if ! command -v showmount >/dev/null 2>&1; then
echo "📦 Installing NFS client tools..."
if command -v apt-get >/dev/null 2>&1; then
sudo apt-get update && sudo apt-get install -y nfs-common
elif command -v yum >/dev/null 2>&1; then
sudo yum install -y nfs-utils
elif command -v dnf >/dev/null 2>&1; then
sudo dnf install -y nfs-utils
else
echo "⚠️ Warning: Unable to install NFS client tools. Skipping accessibility test."
return 0
fi
fi
# Test if we can reach the NFS server
echo "🌐 Testing connection to NFS server..."
if timeout 10 showmount -e "${NFS_HOST_IP}" >/dev/null 2>&1; then
echo "✅ NFS server is accessible"
echo "📋 Available exports:"
showmount -e "${NFS_HOST_IP}"
else
echo "❌ Cannot connect to NFS server at ${NFS_HOST_IP}"
echo "💡 Make sure:"
echo " 1. NFS server is running on ${NFS_HOST}"
echo " 2. Network connectivity exists between cluster and NFS host"
echo " 3. Firewall allows NFS traffic (port 2049)"
exit 1
fi
# Test specific export
if showmount -e "${NFS_HOST_IP}" | grep -q "${NFS_MEDIA_PATH}"; then
echo "✅ Media path ${NFS_MEDIA_PATH} is exported"
else
echo "❌ Media path ${NFS_MEDIA_PATH} is not found in exports"
echo "📋 Available exports:"
showmount -e "${NFS_HOST_IP}"
echo ""
echo "💡 Run setup-nfs-host.sh on ${NFS_HOST} to configure the export"
exit 1
fi
}
# Function to create test mount
test_nfs_mount() {
echo ""
echo "🔧 Testing NFS mount functionality..."
local test_mount="/tmp/nfs-test-$$"
mkdir -p "${test_mount}"
# Try to mount the NFS export
if timeout 30 sudo mount -t nfs4 "${NFS_HOST_IP}:${NFS_MEDIA_PATH}" "${test_mount}"; then
echo "✅ NFS mount successful"
# Test read access
if ls "${test_mount}" >/dev/null 2>&1; then
echo "✅ NFS read access working"
else
echo "❌ NFS read access failed"
fi
# Unmount
sudo umount "${test_mount}" || echo "⚠️ Warning: Failed to unmount test directory"
else
echo "❌ NFS mount failed"
echo "💡 Check NFS server configuration and network connectivity"
exit 1
fi
# Clean up
rmdir "${test_mount}" 2>/dev/null || true
}
# Function to create Kubernetes resources
create_k8s_resources() {
echo ""
echo "🚀 Creating Kubernetes NFS resources..."
# Apply the NFS Kubernetes manifests using kustomize (templates already processed)
echo "📦 Applying NFS manifests..."
kubectl apply -k "${NFS_DIR}/kustomize"
echo "✅ NFS PersistentVolume and StorageClass created"
# Verify resources were created
echo "🔍 Verifying Kubernetes resources..."
if kubectl get storageclass nfs >/dev/null 2>&1; then
echo "✅ StorageClass 'nfs' created"
else
echo "❌ StorageClass 'nfs' not found"
exit 1
fi
if kubectl get pv nfs-media-pv >/dev/null 2>&1; then
echo "✅ PersistentVolume 'nfs-media-pv' created"
kubectl get pv nfs-media-pv
else
echo "❌ PersistentVolume 'nfs-media-pv' not found"
exit 1
fi
}
# Function to show usage instructions
show_usage_instructions() {
echo ""
echo "✅ === NFS Kubernetes Setup Complete ==="
echo ""
echo "💾 NFS server ${NFS_HOST} (${NFS_HOST_IP}) has been registered with the cluster"
echo ""
echo "📋 Kubernetes resources created:"
echo " - StorageClass: nfs"
echo " - PersistentVolume: nfs-media-pv (${NFS_STORAGE_CAPACITY}, ReadWriteMany)"
echo ""
echo "💡 To use NFS storage in your applications:"
echo " 1. Set storageClassName: nfs in your PVC"
echo " 2. Use accessMode: ReadWriteMany for shared access"
echo ""
echo "📝 Example PVC:"
echo "---"
echo "apiVersion: v1"
echo "kind: PersistentVolumeClaim"
echo "metadata:"
echo " name: my-nfs-pvc"
echo "spec:"
echo " accessModes:"
echo " - ReadWriteMany"
echo " storageClassName: nfs"
echo " resources:"
echo " requests:"
echo " storage: 10Gi"
echo ""
}
# Main execution
main() {
resolve_nfs_host
test_nfs_accessibility
test_nfs_mount
create_k8s_resources
show_usage_instructions
}
# Run main function
echo "🔧 Starting NFS setup process..."
main "$@"

View File

@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- persistent-volume.yaml
- storage-class.yaml

View File

@@ -0,0 +1,23 @@
apiVersion: v1
kind: PersistentVolume
metadata:
name: nfs-media-pv
labels:
storage: nfs-media
spec:
capacity:
storage: {{ .cloud.nfs.storageCapacity }}
accessModes:
- ReadWriteMany
persistentVolumeReclaimPolicy: Retain
storageClassName: nfs
nfs:
server: {{ .cloud.nfs.host }}
path: {{ .cloud.nfs.mediaPath }}
mountOptions:
- nfsvers=4.1
- rsize=1048576
- wsize=1048576
- hard
- intr
- timeo=600

View File

@@ -0,0 +1,10 @@
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: nfs
provisioner: nfs
parameters:
server: {{ .cloud.nfs.host }}
path: {{ .cloud.nfs.mediaPath }}
reclaimPolicy: Retain
allowVolumeExpansion: true

View File

@@ -0,0 +1,306 @@
#!/bin/bash
set -e
set -o pipefail
# Navigate to script directory
SCRIPT_PATH="$(realpath "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(dirname "$SCRIPT_PATH")"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
usage() {
echo "Usage: setup-nfs-host.sh [server] [media-path] [options]"
echo ""
echo "Set up NFS server on the specified host."
echo ""
echo "Examples:"
echo " setup-nfs-host.sh box-01 /data/media"
echo ""
echo "Options:"
echo " -h, --help Show this help message"
echo " -e, --export-options Set the NFS export options"
}
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
-h|--help)
usage
exit 0
;;
-e|--export-options)
if [[ -z "$2" ]]; then
echo "Error: --export-options requires a value"
exit 1
else
NFS_EXPORT_OPTIONS="$2"
fi
shift 2
;;
-*)
echo "Unknown option $1"
usage
exit 1
;;
*)
# First non-option argument is server
if [[ -z "$NFS_HOST" ]]; then
export NFS_HOST="$1"
# Second non-option argument is media path
elif [[ -z "$NFS_MEDIA_PATH" ]]; then
export NFS_MEDIA_PATH="$1"
else
echo "Too many arguments"
usage
exit 1
fi
shift
;;
esac
done
echo "Setting up NFS server on this host..."
# Check if required NFS variables are configured
if [[ -z "${NFS_HOST}" ]]; then
echo "NFS_HOST not set. Please set NFS_HOST=<hostname> in your environment"
echo "Example: export NFS_HOST=box-01"
exit 1
fi
# Ensure NFS_MEDIA_PATH is explicitly set
if [[ -z "${NFS_MEDIA_PATH}" ]]; then
echo "Error: NFS_MEDIA_PATH not set. Please set it in your environment"
echo "Example: export NFS_MEDIA_PATH=/data/media"
exit 1
fi
# Set default for NFS_EXPORT_OPTIONS if not already set
if [[ -z "${NFS_EXPORT_OPTIONS}" ]]; then
export NFS_EXPORT_OPTIONS="*(rw,sync,no_subtree_check,no_root_squash)"
echo "Using default NFS_EXPORT_OPTIONS: ${NFS_EXPORT_OPTIONS}"
fi
echo "Target NFS host: ${NFS_HOST}"
echo "Media path: ${NFS_MEDIA_PATH}"
echo "Export options: ${NFS_EXPORT_OPTIONS}"
# Function to check if we're running on the correct host
check_host() {
local current_hostname=$(hostname)
if [[ "${current_hostname}" != "${NFS_HOST}" ]]; then
echo "Warning: Current host (${current_hostname}) differs from NFS_HOST (${NFS_HOST})"
echo "This script should be run on ${NFS_HOST}"
read -p "Continue anyway? (y/N): " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
exit 1
fi
fi
}
# Function to install NFS server and SMB/CIFS
install_nfs_server() {
echo "Installing NFS server and SMB/CIFS packages..."
# Detect package manager and install NFS server + Samba
if command -v apt-get >/dev/null 2>&1; then
# Debian/Ubuntu
sudo apt-get update
sudo apt-get install -y nfs-kernel-server nfs-common samba samba-common-bin
elif command -v yum >/dev/null 2>&1; then
# RHEL/CentOS
sudo yum install -y nfs-utils samba samba-client
elif command -v dnf >/dev/null 2>&1; then
# Fedora
sudo dnf install -y nfs-utils samba samba-client
else
echo "Error: Unable to detect package manager. Please install NFS server and Samba manually."
exit 1
fi
}
# Function to create media directory
create_media_directory() {
echo "Creating media directory: ${NFS_MEDIA_PATH}"
# Create directory if it doesn't exist
sudo mkdir -p "${NFS_MEDIA_PATH}"
# Set appropriate permissions
# Using 755 for directory, allowing read/execute for all, write for owner
sudo chmod 755 "${NFS_MEDIA_PATH}"
echo "Media directory created with appropriate permissions"
echo "Directory info:"
ls -la "${NFS_MEDIA_PATH}/"
}
# Function to configure NFS exports
configure_nfs_exports() {
echo "Configuring NFS exports..."
local export_line="${NFS_MEDIA_PATH} ${NFS_EXPORT_OPTIONS}"
local exports_file="/etc/exports"
# Backup existing exports file
sudo cp "${exports_file}" "${exports_file}.backup.$(date +%Y%m%d-%H%M%S)" 2>/dev/null || true
# Check if export already exists
if sudo grep -q "^${NFS_MEDIA_PATH}" "${exports_file}" 2>/dev/null; then
echo "Export for ${NFS_MEDIA_PATH} already exists, updating..."
sudo sed -i "s|^${NFS_MEDIA_PATH}.*|${export_line}|" "${exports_file}"
else
echo "Adding new export for ${NFS_MEDIA_PATH}..."
echo "${export_line}" | sudo tee -a "${exports_file}"
fi
# Export the filesystems
sudo exportfs -rav
echo "NFS exports configured:"
sudo exportfs -v
}
# Function to start and enable NFS services
start_nfs_services() {
echo "Starting NFS services..."
# Start and enable NFS server
sudo systemctl enable nfs-server
sudo systemctl start nfs-server
# Also enable related services
sudo systemctl enable rpcbind
sudo systemctl start rpcbind
echo "NFS services started and enabled"
# Show service status
sudo systemctl status nfs-server --no-pager --lines=5
}
# Function to configure SMB/CIFS sharing
configure_smb_sharing() {
echo "Configuring SMB/CIFS sharing..."
local smb_config="/etc/samba/smb.conf"
local share_name="media"
# Backup existing config
sudo cp "${smb_config}" "${smb_config}.backup.$(date +%Y%m%d-%H%M%S)" 2>/dev/null || true
# Check if share already exists
if sudo grep -q "^\[${share_name}\]" "${smb_config}" 2>/dev/null; then
echo "SMB share '${share_name}' already exists, updating..."
# Remove existing share section
sudo sed -i "/^\[${share_name}\]/,/^\[/{ /^\[${share_name}\]/d; /^\[/!d; }" "${smb_config}"
fi
# Add media share configuration
cat << EOF | sudo tee -a "${smb_config}"
[${share_name}]
comment = Media files for Wild Cloud
path = ${NFS_MEDIA_PATH}
browseable = yes
read only = no
guest ok = yes
create mask = 0664
directory mask = 0775
force user = $(whoami)
    force group = $(id -gn)
EOF
echo "SMB share configuration added"
# Test configuration
if sudo testparm -s >/dev/null 2>&1; then
echo "✓ SMB configuration is valid"
else
echo "✗ SMB configuration has errors"
sudo testparm
exit 1
fi
}
# Function to start SMB services
start_smb_services() {
echo "Starting SMB services..."
# Enable and start Samba services
sudo systemctl enable smbd
sudo systemctl start smbd
sudo systemctl enable nmbd
sudo systemctl start nmbd
echo "SMB services started and enabled"
# Show service status
sudo systemctl status smbd --no-pager --lines=3
}
# Function to test NFS setup
test_nfs_setup() {
echo "Testing NFS setup..."
# Test if NFS is responding
if command -v showmount >/dev/null 2>&1; then
echo "Available NFS exports:"
showmount -e localhost || echo "Warning: showmount failed, but NFS may still be working"
fi
# Check if the export directory is accessible
if [[ -d "${NFS_MEDIA_PATH}" ]]; then
echo "✓ Media directory exists and is accessible"
else
echo "✗ Media directory not accessible"
exit 1
fi
}
# Function to show usage instructions
show_usage_instructions() {
echo
echo "=== NFS/SMB Host Setup Complete ==="
echo
echo "NFS and SMB servers are now running on this host with media directory: ${NFS_MEDIA_PATH}"
echo
echo "Access methods:"
echo "1. NFS (for Kubernetes): Use setup-nfs-k8s.sh to register with cluster"
echo "2. SMB/CIFS (for Windows): \\\\${NFS_HOST}\\media"
echo
echo "To add media files:"
echo "- Copy directly to: ${NFS_MEDIA_PATH}"
echo "- Or mount SMB share from Windows and copy there"
echo
echo "Windows SMB mount:"
echo "- Open File Explorer"
echo "- Map network drive to: \\\\${NFS_HOST}\\media"
echo "- Or use: \\\\$(hostname -I | awk '{print $1}')\\media"
echo
echo "To verify services:"
echo "- NFS: showmount -e ${NFS_HOST}"
echo "- SMB: smbclient -L ${NFS_HOST} -N"
echo "- Status: systemctl status nfs-server smbd"
echo
echo "Current NFS exports:"
sudo exportfs -v
echo
}
# Main execution
main() {
check_host
install_nfs_server
create_media_directory
configure_nfs_exports
start_nfs_services
configure_smb_sharing
start_smb_services
test_nfs_setup
show_usage_instructions
}
# Run main function
main "$@"

View File

@@ -0,0 +1,21 @@
name: nfs
description: NFS client provisioner for external NFS storage
namespace: nfs-system
category: infrastructure
serviceConfig:
nfsHost:
path: cloud.nfs.host
prompt: "Enter NFS server hostname or IP address"
default: "192.168.1.100"
type: string
mediaPath:
path: cloud.nfs.mediaPath
prompt: "Enter NFS export path for media storage"
default: "/mnt/storage/media"
type: string
storageCapacity:
path: cloud.nfs.storageCapacity
prompt: "Enter NFS storage capacity (e.g., 1Ti, 500Gi)"
default: "1Ti"
type: string

View File

@@ -0,0 +1,52 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
NFD_DIR="${CLUSTER_SETUP_DIR}/node-feature-discovery"
echo "🔧 === Setting up Node Feature Discovery ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled Node Feature Discovery templates..."
if [ ! -d "${NFD_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${NFD_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
echo "🚀 Deploying Node Feature Discovery..."
kubectl apply -k "${NFD_DIR}/kustomize"
echo "⏳ Waiting for Node Feature Discovery DaemonSet to be ready..."
kubectl rollout status daemonset/node-feature-discovery-worker -n node-feature-discovery --timeout=300s
echo ""
echo "✅ Node Feature Discovery installed successfully"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get pods -n node-feature-discovery"
echo " kubectl get nodes --show-labels | grep feature.node.kubernetes.io"
echo ""
echo "🎮 GPU nodes should now be labeled with GPU device information:"
echo " kubectl get nodes --show-labels | grep pci-10de"

View File

@@ -0,0 +1,711 @@
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
name: nodefeatures.nfd.k8s-sigs.io
spec:
group: nfd.k8s-sigs.io
names:
kind: NodeFeature
listKind: NodeFeatureList
plural: nodefeatures
singular: nodefeature
scope: Namespaced
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: |-
NodeFeature resource holds the features discovered for one node in the
cluster.
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Specification of the NodeFeature, containing features discovered
for a node.
properties:
features:
description: Features is the full "raw" features data that has been
discovered.
properties:
attributes:
additionalProperties:
description: AttributeFeatureSet is a set of features having
string value.
properties:
elements:
additionalProperties:
type: string
description: Individual features of the feature set.
type: object
required:
- elements
type: object
description: Attributes contains all the attribute-type features
of the node.
type: object
flags:
additionalProperties:
description: FlagFeatureSet is a set of simple features only
containing names without values.
properties:
elements:
additionalProperties:
description: |-
Nil is a dummy empty struct for protobuf compatibility.
NOTE: protobuf definitions have been removed but this is kept for API compatibility.
type: object
description: Individual features of the feature set.
type: object
required:
- elements
type: object
description: Flags contains all the flag-type features of the
node.
type: object
instances:
additionalProperties:
description: InstanceFeatureSet is a set of features each of
which is an instance having multiple attributes.
properties:
elements:
description: Individual features of the feature set.
items:
description: InstanceFeature represents one instance of
a complex features, e.g. a device.
properties:
attributes:
additionalProperties:
type: string
description: Attributes of the instance feature.
type: object
required:
- attributes
type: object
type: array
required:
- elements
type: object
description: Instances contains all the instance-type features
of the node.
type: object
type: object
labels:
additionalProperties:
type: string
description: Labels is the set of node labels that are requested to
be created.
type: object
type: object
required:
- spec
type: object
served: true
storage: true
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
name: nodefeaturegroups.nfd.k8s-sigs.io
spec:
group: nfd.k8s-sigs.io
names:
kind: NodeFeatureGroup
listKind: NodeFeatureGroupList
plural: nodefeaturegroups
shortNames:
- nfg
singular: nodefeaturegroup
scope: Namespaced
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: NodeFeatureGroup resource holds Node pools by featureGroup
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Spec defines the rules to be evaluated.
properties:
featureGroupRules:
description: List of rules to evaluate to determine nodes that belong
in this group.
items:
description: GroupRule defines a rule for nodegroup filtering.
properties:
matchAny:
description: MatchAny specifies a list of matchers one of which
must match.
items:
description: MatchAnyElem specifies one sub-matcher of MatchAny.
properties:
matchFeatures:
description: MatchFeatures specifies a set of matcher
terms all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature
set to match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
required:
- matchFeatures
type: object
type: array
matchFeatures:
description: MatchFeatures specifies a set of matcher terms
all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature set to
match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
name:
description: Name of the rule.
type: string
required:
- name
type: object
type: array
required:
- featureGroupRules
type: object
status:
description: |-
Status of the NodeFeatureGroup after the most recent evaluation of the
specification.
properties:
nodes:
description: Nodes is a list of FeatureGroupNode in the cluster that
match the featureGroupRules
items:
properties:
name:
description: Name of the node.
type: string
required:
- name
type: object
type: array
x-kubernetes-list-map-keys:
- name
x-kubernetes-list-type: map
type: object
required:
- spec
type: object
served: true
storage: true
subresources:
status: {}
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
name: nodefeaturerules.nfd.k8s-sigs.io
spec:
group: nfd.k8s-sigs.io
names:
kind: NodeFeatureRule
listKind: NodeFeatureRuleList
plural: nodefeaturerules
shortNames:
- nfr
singular: nodefeaturerule
scope: Cluster
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: |-
NodeFeatureRule resource specifies a configuration for feature-based
customization of node objects, such as node labeling.
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Spec defines the rules to be evaluated.
properties:
rules:
description: Rules is a list of node customization rules.
items:
description: Rule defines a rule for node customization such as
labeling.
properties:
annotations:
additionalProperties:
type: string
description: Annotations to create if the rule matches.
type: object
extendedResources:
additionalProperties:
type: string
description: ExtendedResources to create if the rule matches.
type: object
labels:
additionalProperties:
type: string
description: Labels to create if the rule matches.
type: object
labelsTemplate:
description: |-
LabelsTemplate specifies a template to expand for dynamically generating
multiple labels. Data (after template expansion) must be keys with an
optional value (<key>[=<value>]) separated by newlines.
type: string
matchAny:
description: MatchAny specifies a list of matchers one of which
must match.
items:
description: MatchAnyElem specifies one sub-matcher of MatchAny.
properties:
matchFeatures:
description: MatchFeatures specifies a set of matcher
terms all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature
set to match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
required:
- matchFeatures
type: object
type: array
matchFeatures:
description: MatchFeatures specifies a set of matcher terms
all of which must match.
items:
description: |-
FeatureMatcherTerm defines requirements against one feature set. All
requirements (specified as MatchExpressions) are evaluated against each
element in the feature set.
properties:
feature:
description: Feature is the name of the feature set to
match against.
type: string
matchExpressions:
additionalProperties:
description: |-
MatchExpression specifies an expression to evaluate against a set of input
values. It contains an operator that is applied when matching the input and
an array of values that the operator evaluates the input against.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
description: |-
MatchExpressions is the set of per-element expressions evaluated. These
match against the value of the specified elements.
type: object
matchName:
description: |-
MatchName in an expression that is matched against the name of each
element in the feature set.
properties:
op:
description: Op is the operator to be applied.
enum:
- In
- NotIn
- InRegexp
- Exists
- DoesNotExist
- Gt
- Lt
- GtLt
- IsTrue
- IsFalse
type: string
value:
description: |-
Value is the list of values that the operand evaluates the input
against. Value should be empty if the operator is Exists, DoesNotExist,
IsTrue or IsFalse. Value should contain exactly one element if the
operator is Gt or Lt and exactly two elements if the operator is GtLt.
In other cases Value should contain at least one element.
items:
type: string
type: array
required:
- op
type: object
required:
- feature
type: object
type: array
name:
description: Name of the rule.
type: string
taints:
description: Taints to create if the rule matches.
items:
description: |-
The node this Taint is attached to has the "effect" on
any pod that does not tolerate the Taint.
properties:
effect:
description: |-
Required. The effect of the taint on pods
that do not tolerate the taint.
Valid effects are NoSchedule, PreferNoSchedule and NoExecute.
type: string
key:
description: Required. The taint key to be applied to
a node.
type: string
timeAdded:
description: |-
TimeAdded represents the time at which the taint was added.
It is only written for NoExecute taints.
format: date-time
type: string
value:
description: The taint value corresponding to the taint
key.
type: string
required:
- effect
- key
type: object
type: array
vars:
additionalProperties:
type: string
description: |-
Vars is the variables to store if the rule matches. Variables do not
directly inflict any changes in the node object. However, they can be
referenced from other rules enabling more complex rule hierarchies,
without exposing intermediary output values as labels.
type: object
varsTemplate:
description: |-
VarsTemplate specifies a template to expand for dynamically generating
multiple variables. Data (after template expansion) must be keys with an
optional value (<key>[=<value>]) separated by newlines.
type: string
required:
- name
type: object
type: array
required:
- rules
type: object
required:
- spec
type: object
served: true
storage: true

View File

@@ -0,0 +1,86 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-feature-discovery-worker
namespace: node-feature-discovery
spec:
selector:
matchLabels:
name: node-feature-discovery-worker
template:
metadata:
labels:
name: node-feature-discovery-worker
spec:
serviceAccountName: node-feature-discovery
securityContext:
seccompProfile:
type: RuntimeDefault
containers:
- name: worker
image: registry.k8s.io/nfd/node-feature-discovery:v0.17.3
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
readOnlyRootFilesystem: true
runAsNonRoot: true
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
resources:
limits:
memory: 512Mi
requests:
cpu: 5m
memory: 64Mi
command:
- "nfd-worker"
args:
- "-metrics=8081"
- "-grpc-health=8082"
ports:
- containerPort: 8081
name: metrics
- containerPort: 8082
name: health
volumeMounts:
- name: host-boot
mountPath: "/host-boot"
readOnly: true
- name: host-os-release
mountPath: "/host-etc/os-release"
readOnly: true
- name: host-sys
mountPath: "/host-sys"
readOnly: true
- name: host-usr-lib
mountPath: "/host-usr/lib"
readOnly: true
- name: host-lib
mountPath: "/host-lib"
readOnly: true
- name: host-proc-swaps
mountPath: "/host-proc/swaps"
readOnly: true
volumes:
- name: host-boot
hostPath:
path: "/boot"
- name: host-os-release
hostPath:
path: "/etc/os-release"
- name: host-sys
hostPath:
path: "/sys"
- name: host-usr-lib
hostPath:
path: "/usr/lib"
- name: host-lib
hostPath:
path: "/lib"
- name: host-proc-swaps
hostPath:
path: "/proc/swaps"

View File

@@ -0,0 +1,14 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: node-feature-discovery
labels:
- pairs:
app.kubernetes.io/name: node-feature-discovery
managedBy: kustomize
partOf: wild-cloud
resources:
- namespace.yaml
- crds.yaml
- rbac.yaml
- daemonset.yaml
- master.yaml

View File

@@ -0,0 +1,49 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: node-feature-discovery-master
namespace: node-feature-discovery
spec:
replicas: 1
selector:
matchLabels:
name: node-feature-discovery-master
template:
metadata:
labels:
name: node-feature-discovery-master
spec:
serviceAccountName: node-feature-discovery
securityContext:
seccompProfile:
type: RuntimeDefault
containers:
- name: master
image: registry.k8s.io/nfd/node-feature-discovery:v0.17.3
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
readOnlyRootFilesystem: true
runAsNonRoot: true
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
command:
- "nfd-master"
args:
- "-metrics=8081"
- "-grpc-health=8082"
ports:
- containerPort: 8081
name: metrics
- containerPort: 8082
name: health
resources:
requests:
cpu: 10m
memory: 64Mi
limits:
memory: 128Mi

View File

@@ -0,0 +1,8 @@
apiVersion: v1
kind: Namespace
metadata:
name: node-feature-discovery
labels:
pod-security.kubernetes.io/enforce: privileged
pod-security.kubernetes.io/audit: privileged
pod-security.kubernetes.io/warn: privileged

View File

@@ -0,0 +1,55 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: node-feature-discovery
namespace: node-feature-discovery
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: node-feature-discovery
rules:
- apiGroups:
- ""
resources:
- nodes
- nodes/status
verbs:
- get
- patch
- update
- list
- apiGroups:
- ""
resources:
- namespaces
verbs:
- get
- list
- watch
- apiGroups:
- nfd.k8s-sigs.io
resources:
- nodefeatures
- nodefeaturerules
- nodefeaturegroups
verbs:
- get
- list
- watch
- create
- update
- patch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: node-feature-discovery
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: node-feature-discovery
subjects:
- kind: ServiceAccount
name: node-feature-discovery
namespace: node-feature-discovery

View File

@@ -0,0 +1,4 @@
name: node-feature-discovery
description: Detects hardware features available on each node
namespace: node-feature-discovery
category: infrastructure

View File

@@ -0,0 +1,98 @@
# NVIDIA Device Plugin
The NVIDIA Device Plugin for Kubernetes enables GPU scheduling and resource management on nodes with NVIDIA GPUs.
## Overview
This service deploys the official NVIDIA Device Plugin as a DaemonSet that:
- Discovers NVIDIA GPUs on worker nodes
- Labels nodes with GPU product information (e.g., `nvidia.com/gpu.product=GeForce-RTX-4090`)
- Advertises GPU resources (`nvidia.com/gpu`) to the Kubernetes scheduler
- Enables pods to request GPU resources
## Prerequisites
Before installing the NVIDIA Device Plugin, ensure that:
1. **NVIDIA Drivers** are installed (>= 384.81)
2. **nvidia-container-toolkit** is installed (>= 1.7.0)
3. **nvidia-container-runtime** is configured as the default container runtime
4. Worker nodes have NVIDIA GPUs
### Talos Linux Requirements
For Talos Linux nodes, you need:
- NVIDIA drivers extension in the Talos schematic
- nvidia-container-toolkit extension
- Proper container runtime configuration
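As a rough illustration (not part of this repository), a Talos Image Factory schematic that bundles the NVIDIA extensions might look like the sketch below; the extension names are assumptions and vary by Talos version, so check the Image Factory for the exact ones.
```bash
# Hypothetical schematic request against the Talos Image Factory.
# The extension names below are assumptions; verify them for your Talos version.
cat > nvidia-schematic.yaml <<'EOF'
customization:
  systemExtensions:
    officialExtensions:
      - siderolabs/nonfree-kmod-nvidia
      - siderolabs/nvidia-container-toolkit
EOF
# Returns JSON containing the schematic ID to use in the installer image URL.
curl -s -X POST --data-binary @nvidia-schematic.yaml https://factory.talos.dev/schematics
```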
## Installation
```bash
# Configure and install the service
wild-cluster-services-configure nvidia-device-plugin
wild-cluster-install nvidia-device-plugin
```
## Verification
After installation, verify the plugin is working:
```bash
# Check plugin pods are running
kubectl get pods -n kube-system | grep nvidia
# Verify GPU resources are advertised
kubectl get nodes -o json | jq '.items[].status.capacity | select(has("nvidia.com/gpu"))'
# Check GPU node labels
kubectl get nodes --show-labels | grep nvidia
```
## Usage in Applications
Once installed, applications can request GPU resources:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: gpu-app
spec:
template:
spec:
containers:
- name: app
image: nvidia/cuda:latest
resources:
requests:
nvidia.com/gpu: 1
limits:
nvidia.com/gpu: 1
```
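To confirm scheduling works end to end, a throwaway pod can request one GPU and run `nvidia-smi`. This is a minimal sketch; the CUDA image tag is an assumption, and any image that ships `nvidia-smi` will do:
```bash
cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
spec:
  restartPolicy: Never
  runtimeClassName: nvidia
  containers:
    - name: cuda
      image: nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1
EOF
# Wait for completion, print the nvidia-smi output, then clean up.
kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/gpu-smoke-test --timeout=180s
kubectl logs pod/gpu-smoke-test
kubectl delete pod gpu-smoke-test
```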
## Troubleshooting
### Plugin Not Starting
- Verify NVIDIA drivers are installed on worker nodes
- Check that nvidia-container-toolkit is properly configured
- Ensure worker nodes are not tainted in a way that prevents scheduling
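On Talos nodes, one way to confirm the driver extension actually loaded is to inspect the node directly (a sketch; requires `talosctl` access to the node):
```bash
# The driver version file only exists when the NVIDIA kernel module is loaded.
talosctl -n <node-ip> read /proc/driver/nvidia/version
# The extension list should include the NVIDIA extensions baked into the schematic.
talosctl -n <node-ip> get extensions
```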
### No GPU Resources Advertised
- Check plugin logs: `kubectl logs -n kube-system -l name=nvidia-device-plugin-ds`
- Verify NVIDIA runtime is the default container runtime
- Ensure GPUs are detected by the driver: check node logs for GPU detection messages
## Configuration
The plugin uses the following configuration:
- **Image**: `nvcr.io/nvidia/k8s-device-plugin:v0.17.1`
- **Namespace**: `kube-system`
- **Priority Class**: `system-node-critical`
- **Tolerations**: Schedules on nodes with `nvidia.com/gpu` taint
## References
- [Official NVIDIA Device Plugin Repository](https://github.com/NVIDIA/k8s-device-plugin)
- [Kubernetes GPU Scheduling Documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/)
- [NVIDIA Container Toolkit Documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/)

View File

@@ -0,0 +1,66 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
NVIDIA_PLUGIN_DIR="${CLUSTER_SETUP_DIR}/nvidia-device-plugin"
echo "🎮 === Setting up NVIDIA Device Plugin ==="
echo ""
# The device plugin only schedules on worker nodes, so make sure the cluster has some
echo "🔍 Checking for worker nodes in the cluster..."
WORKER_NODES=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o name | wc -l)
if [ "$WORKER_NODES" -eq 0 ]; then
echo "❌ ERROR: No worker nodes found in cluster. NVIDIA Device Plugin requires worker nodes."
exit 1
fi
echo "✅ Found $WORKER_NODES worker node(s)"
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled NVIDIA Device Plugin templates..."
if [ ! -d "${NVIDIA_PLUGIN_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${NVIDIA_PLUGIN_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
echo "🚀 Deploying NVIDIA Device Plugin..."
kubectl apply -k "${NVIDIA_PLUGIN_DIR}/kustomize"
echo "⏳ Waiting for NVIDIA Device Plugin DaemonSet to be ready..."
kubectl rollout status daemonset/nvidia-device-plugin-daemonset -n kube-system --timeout=120s
echo ""
echo "✅ NVIDIA Device Plugin installed successfully"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get pods -n kube-system | grep nvidia"
echo " kubectl get nodes -o json | jq '.items[].status.capacity | select(has(\"nvidia.com/gpu\"))'"
echo ""
echo "🎮 GPU nodes should now be labeled with GPU product information:"
echo " kubectl get nodes --show-labels | grep nvidia"
echo ""

View File

@@ -0,0 +1,91 @@
# NVIDIA Device Plugin DaemonSet
# Based on official manifest from: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.1/deployments/static/nvidia-device-plugin.yml
# Licensed under the Apache License, Version 2.0
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-device-plugin-daemonset
namespace: kube-system
labels:
app.kubernetes.io/name: nvidia-device-plugin
app.kubernetes.io/component: device-plugin
managedBy: kustomize
partOf: wild-cloud
spec:
selector:
matchLabels:
name: nvidia-device-plugin-ds
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: nvidia-device-plugin-ds
app.kubernetes.io/name: nvidia-device-plugin
app.kubernetes.io/component: device-plugin
spec:
runtimeClassName: nvidia
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
- key: CriticalAddonsOnly
operator: Exists
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: feature.node.kubernetes.io/pci-0300_10de.present
operator: In
values:
- "true"
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
securityContext:
seccompProfile:
type: RuntimeDefault
containers:
- image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1
name: nvidia-device-plugin-ctr
env:
- name: MPS_ROOT
value: /run/nvidia/mps
- name: NVIDIA_VISIBLE_DEVICES
value: all
- name: NVIDIA_DRIVER_CAPABILITIES
value: compute,utility
- name: FAIL_ON_INIT_ERROR
value: "false"
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: mps-shm
mountPath: /dev/shm
- name: mps-root
mountPath: /mps
- name: cdi-root
mountPath: /var/run/cdi
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: mps-root
hostPath:
path: /run/nvidia/mps
type: DirectoryOrCreate
- name: mps-shm
hostPath:
path: /run/nvidia/mps/shm
- name: cdi-root
hostPath:
path: /var/run/cdi
type: DirectoryOrCreate

View File

@@ -0,0 +1,12 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kube-system
resources:
- daemonset.yaml
- runtimeclass.yaml
labels:
- pairs:
app.kubernetes.io/name: nvidia-device-plugin
app.kubernetes.io/component: device-plugin
managedBy: kustomize
partOf: wild-cloud

View File

@@ -0,0 +1,5 @@
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
name: nvidia
handler: nvidia

View File

@@ -0,0 +1,7 @@
name: nvidia-device-plugin
description: NVIDIA device plugin for Kubernetes
namespace: nvidia-device-plugin
category: infrastructure
dependencies:
- node-feature-discovery

View File

@@ -0,0 +1,51 @@
# SMTP Configuration Service
This service configures SMTP settings for Wild Cloud applications to send transactional emails.
## Overview
The SMTP service doesn't deploy any Kubernetes resources. Instead, it helps configure global SMTP settings that can be used by Wild Cloud applications like Ghost, Gitea, and others for sending:
- Password reset emails
- User invitation emails
- Notification emails
- Other transactional emails
## Installation
```bash
./setup/cluster-services/smtp/install.sh
```
## Configuration
The setup script will prompt for:
- **SMTP Host**: Your email provider's SMTP server (e.g., `email-smtp.us-east-2.amazonaws.com` for AWS SES)
- **SMTP Port**: Usually `465` for SSL or `587` for STARTTLS
- **SMTP User**: Username or access key for authentication
- **From Address**: Default sender email address
- **SMTP Password**: Your password, secret key, or API key (entered securely)
## Supported Providers
- **AWS SES**: Use your Access Key ID as user and Secret Access Key as password
- **Gmail/Google Workspace**: Use your email as user and an App Password as password
- **SendGrid**: Use `apikey` as user and your API key as password
- **Mailgun**: Use your Mailgun username and password
- **Other SMTP providers**: Use your standard SMTP credentials
## Applications That Use SMTP
- **Ghost**: User management, password resets, notifications
- **Gitea**: User registration, password resets, notifications
- **OpenProject**: User invitations, notifications
- **Future applications**: Any app that needs to send emails
## Testing
After configuration, test SMTP by:
1. Deploying an application that uses email (like Ghost)
2. Using password reset or user invitation features
3. Checking application logs for SMTP connection issues
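Before involving an application, a quick in-cluster check can confirm the SMTP host and port are reachable. A minimal sketch, assuming port 465 (implicit TLS); substitute your own `cloud.smtp.host` value:
```bash
# One-off pod that opens a TLS connection to the SMTP server and prints a handshake summary.
# For port 587 (STARTTLS), add `-starttls smtp` to the openssl command.
kubectl run smtp-test --rm -it --restart=Never --image=alpine:3.20 -- \
  sh -c 'apk add --no-cache openssl >/dev/null && \
         openssl s_client -connect email-smtp.us-east-2.amazonaws.com:465 -brief </dev/null'
```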

View File

@@ -0,0 +1,36 @@
name: smtp
description: SMTP relay service for cluster applications
namespace: smtp-system
category: infrastructure
serviceConfig:
smtpHost:
path: cloud.smtp.host
prompt: "Enter SMTP host (e.g., email-smtp.us-east-2.amazonaws.com for AWS SES)"
default: ""
type: string
smtpPort:
path: cloud.smtp.port
prompt: "Enter SMTP port (usually 465 for SSL, 587 for STARTTLS)"
default: "465"
type: string
smtpUser:
path: cloud.smtp.user
prompt: "Enter SMTP username/access key"
default: ""
type: string
smtpFrom:
path: cloud.smtp.from
prompt: "Enter default 'from' email address"
default: "no-reply@{{ .cloud.domain }}"
type: string
smtpTls:
path: cloud.smtp.tls
prompt: "Enable TLS? (true/false)"
default: "true"
type: string
smtpStartTls:
path: cloud.smtp.startTls
prompt: "Enable STARTTLS? (true/false)"
default: "true"
type: string

View File

@@ -0,0 +1,31 @@
# Traefik
- https://doc.traefik.io/traefik/providers/kubernetes-ingress/
Ingress resources can be created for any service. The routes specified in the Ingress are added automatically to the Traefik proxy.
Traefik serves all incoming network traffic on ports 80 and 443 to their appropriate services based on the route.
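For example, a plain Ingress pointing at an in-cluster Service is picked up without any Traefik-specific configuration. A minimal sketch, assuming a Service named `whoami` on port 80 and a hostname you control:
```bash
cat <<'EOF' | kubectl apply -f -
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: whoami
  namespace: default
spec:
  ingressClassName: traefik
  rules:
    - host: whoami.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: whoami
                port:
                  number: 80
EOF
```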
## Notes
These kustomize templates were created with:
```bash
helm-chart-to-kustomize traefik/traefik traefik traefik values.yaml
```
With values.yaml being:
```yaml
ingressRoute:
dashboard:
enabled: true
matchRule: Host(`dashboard.localhost`)
entryPoints:
- web
providers:
kubernetesGateway:
enabled: true
gateway:
namespacePolicy: All
```
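Since the dashboard is only routed on the `web` entrypoint for `dashboard.localhost`, one way to reach it without exposing it externally is a port-forward (a sketch; most systems resolve `*.localhost` to loopback, otherwise add an /etc/hosts entry):
```bash
# Forward the web entrypoint (container port 8000) to your machine,
# then browse http://dashboard.localhost:8000/dashboard/ .
kubectl -n traefik port-forward deploy/traefik 8000:8000
```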

View File

@@ -0,0 +1,72 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
TRAEFIK_DIR="${CLUSTER_SETUP_DIR}/traefik"
echo "🌐 === Setting up Traefik Ingress Controller ==="
echo ""
# Check MetalLB dependency
echo "🔍 Verifying MetalLB is ready (required for Traefik LoadBalancer service)..."
kubectl wait --for=condition=Ready pod -l component=controller -n metallb-system --timeout=60s 2>/dev/null || {
echo "⚠️ MetalLB controller not ready, but continuing with Traefik installation"
echo "💡 Note: Traefik LoadBalancer service may not get external IP without MetalLB"
}
# Install required CRDs first
echo "📦 Installing Gateway API CRDs..."
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.0.0/standard-install.yaml
echo "📦 Installing Traefik CRDs..."
kubectl apply -f https://raw.githubusercontent.com/traefik/traefik/v3.4/docs/content/reference/dynamic-configuration/kubernetes-crd-definition-v1.yml
echo "⏳ Waiting for CRDs to be established..."
kubectl wait --for condition=established crd/gateways.gateway.networking.k8s.io --timeout=60s
kubectl wait --for condition=established crd/gatewayclasses.gateway.networking.k8s.io --timeout=60s
kubectl wait --for condition=established crd/ingressroutes.traefik.io --timeout=60s
kubectl wait --for condition=established crd/middlewares.traefik.io --timeout=60s
# Templates should already be compiled
echo "📦 Using pre-compiled Traefik templates..."
if [ ! -d "${TRAEFIK_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${TRAEFIK_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
# Apply Traefik using kustomize
echo "🚀 Deploying Traefik..."
kubectl apply -k "${TRAEFIK_DIR}/kustomize"
# Wait for Traefik to be ready
echo "⏳ Waiting for Traefik to be ready..."
kubectl wait --for=condition=Available deployment/traefik -n traefik --timeout=120s
echo ""
echo "✅ Traefik installed successfully"
echo ""
echo "💡 To verify the installation:"
echo " kubectl get pods -n traefik"
echo " kubectl get svc -n traefik"
echo ""

View File

@@ -0,0 +1,13 @@
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
name: internal-only
namespace: kube-system
spec:
  ipAllowList:
# Restrict to local private network ranges - adjust these to match your network
sourceRange:
- 127.0.0.1/32 # localhost
- 10.0.0.0/8 # Private network
- 172.16.0.0/12 # Private network
- 192.168.0.0/16 # Private network

View File

@@ -0,0 +1,13 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- templates/deployment.yaml
- templates/gatewayclass.yaml
- templates/gateway.yaml
- templates/ingressclass.yaml
- templates/ingressroute.yaml
- templates/rbac/clusterrolebinding.yaml
- templates/rbac/clusterrole.yaml
- templates/rbac/serviceaccount.yaml
- templates/service.yaml

View File

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: traefik

View File

@@ -0,0 +1,130 @@
---
# Source: traefik/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: traefik
namespace: traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
annotations:
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
strategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 0
maxSurge: 1
minReadySeconds: 0
template:
metadata:
annotations:
prometheus.io/scrape: "true"
prometheus.io/path: "/metrics"
prometheus.io/port: "9100"
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
spec:
serviceAccountName: traefik
automountServiceAccountToken: true
terminationGracePeriodSeconds: 60
hostNetwork: false
containers:
- image: docker.io/traefik:v3.4.1
imagePullPolicy: IfNotPresent
name: traefik
resources:
readinessProbe:
httpGet:
path: /ping
port: 8080
scheme: HTTP
failureThreshold: 1
initialDelaySeconds: 2
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 2
livenessProbe:
httpGet:
path: /ping
port: 8080
scheme: HTTP
failureThreshold: 3
initialDelaySeconds: 2
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 2
lifecycle:
ports:
- name: metrics
containerPort: 9100
protocol: TCP
- name: traefik
containerPort: 8080
protocol: TCP
- name: web
containerPort: 8000
protocol: TCP
- name: websecure
containerPort: 8443
protocol: TCP
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
volumeMounts:
- name: data
mountPath: /data
- name: tmp
mountPath: /tmp
args:
- "--global.checkNewVersion"
- "--entryPoints.metrics.address=:9100/tcp"
- "--entryPoints.traefik.address=:8080/tcp"
- "--entryPoints.web.address=:8000/tcp"
- "--entryPoints.websecure.address=:8443/tcp"
- "--api.dashboard=true"
- "--ping=true"
- "--metrics.prometheus=true"
- "--metrics.prometheus.entrypoint=metrics"
- "--providers.kubernetescrd"
- "--providers.kubernetescrd.allowEmptyServices=true"
- "--providers.kubernetesingress"
- "--providers.kubernetesingress.allowEmptyServices=true"
- "--providers.kubernetesingress.ingressendpoint.publishedservice=traefik/traefik"
- "--providers.kubernetesgateway"
- "--providers.kubernetesgateway.statusaddress.service.name=traefik"
- "--providers.kubernetesgateway.statusaddress.service.namespace=traefik"
- "--entryPoints.websecure.http.tls=true"
- "--log.level=INFO"
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
volumes:
- name: data
emptyDir: {}
- name: tmp
emptyDir: {}
securityContext:
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532

View File

@@ -0,0 +1,18 @@
---
# Source: traefik/templates/gateway.yaml
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
name: traefik-gateway
namespace: traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
spec:
gatewayClassName: traefik
listeners:
- name: web
port: 8000
protocol: HTTP

View File

@@ -0,0 +1,13 @@
---
# Source: traefik/templates/gatewayclass.yaml
apiVersion: gateway.networking.k8s.io/v1
kind: GatewayClass
metadata:
name: traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
spec:
controllerName: traefik.io/gateway-controller

View File

@@ -0,0 +1,15 @@
---
# Source: traefik/templates/ingressclass.yaml
apiVersion: networking.k8s.io/v1
kind: IngressClass
metadata:
annotations:
ingressclass.kubernetes.io/is-default-class: "true"
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
name: traefik
spec:
controller: traefik.io/ingress-controller

View File

@@ -0,0 +1,21 @@
---
# Source: traefik/templates/ingressroute.yaml
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
name: traefik-dashboard
namespace: traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
spec:
entryPoints:
- web
routes:
- match: Host(`dashboard.localhost`)
kind: Rule
services:
- kind: TraefikService
name: api@internal

View File

@@ -0,0 +1,108 @@
---
# Source: traefik/templates/rbac/clusterrole.yaml
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: traefik-traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
rules:
- apiGroups:
- ""
resources:
- configmaps
- nodes
- services
verbs:
- get
- list
- watch
- apiGroups:
- discovery.k8s.io
resources:
- endpointslices
verbs:
- list
- watch
- apiGroups:
- ""
resources:
- secrets
verbs:
- get
- list
- watch
- apiGroups:
- extensions
- networking.k8s.io
resources:
- ingressclasses
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- extensions
- networking.k8s.io
resources:
- ingresses/status
verbs:
- update
- apiGroups:
- traefik.io
resources:
- ingressroutes
- ingressroutetcps
- ingressrouteudps
- middlewares
- middlewaretcps
- serverstransports
- serverstransporttcps
- tlsoptions
- tlsstores
- traefikservices
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- namespaces
- secrets
- configmaps
verbs:
- get
- list
- watch
- apiGroups:
- gateway.networking.k8s.io
resources:
- backendtlspolicies
- gatewayclasses
- gateways
- grpcroutes
- httproutes
- referencegrants
- tcproutes
- tlsroutes
verbs:
- get
- list
- watch
- apiGroups:
- gateway.networking.k8s.io
resources:
- backendtlspolicies/status
- gatewayclasses/status
- gateways/status
- grpcroutes/status
- httproutes/status
- tcproutes/status
- tlsroutes/status
verbs:
- update

View File

@@ -0,0 +1,19 @@
---
# Source: traefik/templates/rbac/clusterrolebinding.yaml
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: traefik-traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: traefik-traefik
subjects:
- kind: ServiceAccount
name: traefik
namespace: traefik

View File

@@ -0,0 +1,14 @@
---
# Source: traefik/templates/rbac/serviceaccount.yaml
kind: ServiceAccount
apiVersion: v1
metadata:
name: traefik
namespace: traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
annotations:
automountServiceAccountToken: false

View File

@@ -0,0 +1,27 @@
---
# Source: traefik/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
name: traefik
namespace: traefik
labels:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
helm.sh/chart: traefik-36.1.0
app.kubernetes.io/managed-by: Helm
annotations:
spec:
type: LoadBalancer
selector:
app.kubernetes.io/name: traefik
app.kubernetes.io/instance: traefik-traefik
ports:
- port: 80
name: web
targetPort: web
protocol: TCP
- port: 443
name: websecure
targetPort: websecure
protocol: TCP

View File

@@ -0,0 +1,28 @@
---
# Traefik service configuration with static LoadBalancer IP
apiVersion: v1
kind: Service
metadata:
name: traefik
namespace: kube-system
annotations:
# Get a stable IP from MetalLB
metallb.universe.tf/address-pool: production
metallb.universe.tf/allow-shared-ip: traefik-lb
labels:
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik
spec:
type: LoadBalancer
loadBalancerIP: {{ .cluster.loadBalancerIp }}
selector:
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik
ports:
- name: web
port: 80
targetPort: web
- name: websecure
port: 443
targetPort: websecure
externalTrafficPolicy: Local

View File

@@ -0,0 +1,10 @@
name: traefik
description: Cloud-native reverse proxy and ingress controller
namespace: traefik
category: infrastructure
dependencies:
- metallb
configReferences:
- cluster.loadBalancerIp

View File

@@ -0,0 +1,44 @@
#!/bin/bash
set -e
set -o pipefail
# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
echo "❌ ERROR: WILD_INSTANCE is not set"
exit 1
fi
# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
exit 1
fi
# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
echo "❌ ERROR: KUBECONFIG is not set"
exit 1
fi
INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
UTILS_DIR="${CLUSTER_SETUP_DIR}/utils"
echo "🔧 === Setting up Cluster Utilities ==="
echo ""
# Templates should already be compiled
echo "📦 Using pre-compiled utils templates..."
if [ ! -d "${UTILS_DIR}/kustomize" ]; then
echo "❌ ERROR: Compiled templates not found at ${UTILS_DIR}/kustomize"
echo "Templates should be compiled before deployment."
exit 1
fi
echo "🚀 Applying utility manifests..."
kubectl apply -f "${UTILS_DIR}/kustomize/"
echo ""
echo "✅ Cluster utilities installed successfully"
echo ""
echo "💡 Utility resources have been deployed to the cluster"

View File

@@ -0,0 +1,71 @@
---
apiVersion: v1
kind: Namespace
metadata:
name: debug
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: netdebug
namespace: debug
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: netdebug
subjects:
- kind: ServiceAccount
name: netdebug
namespace: debug
roleRef:
kind: ClusterRole
name: cluster-admin
apiGroup: rbac.authorization.k8s.io
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: netdebug
namespace: debug
labels:
app: netdebug
spec:
replicas: 1
selector:
matchLabels:
app: netdebug
template:
metadata:
labels:
app: netdebug
spec:
serviceAccountName: netdebug
containers:
- name: netdebug
image: nicolaka/netshoot:latest
command: ["/bin/bash"]
args: ["-c", "while true; do sleep 3600; done"]
resources:
limits:
cpu: 200m
memory: 256Mi
requests:
cpu: 100m
memory: 128Mi
securityContext:
privileged: true
---
apiVersion: v1
kind: Service
metadata:
name: netdebug
namespace: debug
spec:
selector:
app: netdebug
ports:
- port: 22
targetPort: 22
name: ssh
type: ClusterIP

View File

@@ -0,0 +1,4 @@
name: utils
description: Utility tools and scripts for cluster administration
namespace: utils-system
category: infrastructure

1
internal/setup/dnsmasq/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
setup-bundle/

Some files were not shown because too many files have changed in this diff