Adds setup files.

setup/README.md (Normal file, 15 lines)
@@ -0,0 +1,15 @@
# Setup instructions

Install dependencies:

Follow the instructions to [set up a dnsmasq machine](./dnsmasq/README.md).

Follow the instructions to [set up cluster nodes](./cluster-nodes/README.md).

Follow the instructions to set up [cluster services](./cluster-services/README.md).

Now make sure everything works:

```bash
wild-health
```
setup/cluster-nodes/init-cluster.sh (Executable file, 80 lines)
@@ -0,0 +1,80 @@
#!/bin/bash

# Talos cluster initialization script
# This script performs one-time cluster setup: generates secrets, base configs, and sets up talosctl

set -euo pipefail

# Check if WC_HOME is set
if [ -z "${WC_HOME:-}" ]; then
    echo "Error: WC_HOME environment variable not set. Run \`source ./env.sh\`."
    exit 1
fi

NODE_SETUP_DIR="${WC_HOME}/setup/cluster-nodes"

# Get cluster configuration from config.yaml
CLUSTER_NAME=$(wild-config cluster.name)
VIP=$(wild-config cluster.nodes.control.vip)
TALOS_VERSION=$(wild-config cluster.nodes.talos.version)

echo "Initializing Talos cluster: $CLUSTER_NAME"
echo "VIP: $VIP"
echo "Talos version: $TALOS_VERSION"

# Create directories
mkdir -p generated final patch

# Check if cluster secrets already exist
if [ -f "generated/secrets.yaml" ]; then
    echo ""
    echo "⚠️ Cluster secrets already exist!"
    echo "This will regenerate ALL cluster certificates and invalidate existing nodes."
    echo ""
    read -p "Do you want to continue? (y/N): " -r
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
        echo "Cancelled."
        exit 0
    fi
    echo ""
fi

# Generate fresh cluster secrets
echo "Generating cluster secrets..."
cd generated
talosctl gen secrets -o secrets.yaml --force

echo "Generating base machine configs..."
talosctl gen config --with-secrets secrets.yaml "$CLUSTER_NAME" "https://$VIP:6443" --force
cd ..

# Set up the talosctl context
echo "Setting up talosctl context..."

# Remove the existing context if it exists
talosctl config context "$CLUSTER_NAME" --remove 2>/dev/null || true

# Merge the new configuration
talosctl config merge ./generated/talosconfig
talosctl config endpoint "$VIP"

echo ""
echo "✅ Cluster initialization complete!"
echo ""
echo "Cluster details:"
echo "  - Name: $CLUSTER_NAME"
echo "  - VIP: $VIP"
echo "  - Secrets: generated/secrets.yaml"
echo "  - Base configs: generated/controlplane.yaml, generated/worker.yaml"
echo ""
echo "Talosctl context configured:"
talosctl config info
echo ""
echo "Next steps:"
echo "1. Register nodes with hardware detection:"
echo "   ./detect-node-hardware.sh <maintenance-ip> <node-number>"
echo ""
echo "2. Generate machine configurations:"
echo "   ./generate-machine-configs.sh"
echo ""
echo "3. Apply configurations to nodes"
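For context, a typical invocation is sketched below, assuming `env.sh` exports `WC_HOME` and that the script is run from its own directory (it creates `generated/`, `final/`, and `patch/` relative to the current working directory):

```bash
# Sketch: one-time cluster initialization (paths per the script above).
source ./env.sh                      # assumption: env.sh exports WC_HOME
cd "$WC_HOME/setup/cluster-nodes"    # generated/ etc. are created relative to CWD
./init-cluster.sh
talosctl config info                 # confirm the new context
```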
setup/cluster-nodes/patch.templates/controlplane.yaml (Normal file, 23 lines)
@@ -0,0 +1,23 @@
machine:
  install:
    disk: {{ index .cluster.nodes.active "{{NODE_NAME}}" "disk" }}
    image: factory.talos.dev/metal-installer/{{SCHEMATIC_ID}}:{{VERSION}}
  network:
    hostname: "{{NODE_NAME}}"
    interfaces:
      - interface: {{ index .cluster.nodes.active "{{NODE_NAME}}" "interface" }}
        dhcp: false
        addresses:
          - "{{NODE_IP}}/24"
        routes:
          - network: 0.0.0.0/0
            gateway: {{ .cloud.router.ip }}
        vip:
          ip: {{ .cluster.nodes.control.vip }}
# cluster:
#   discovery:
#     enabled: true
#   registries:
#     service:
#       disabled: true
#   allowSchedulingOnControlPlanes: true
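The `{{NODE_NAME}}`-style placeholders are presumably filled in by `generate-machine-configs.sh` (step 2 in the init script above). Purely as an illustration, a rendered patch could be merged into the base config with `talosctl machineconfig patch`; the file names under `patch/` and `final/` are assumptions:

```bash
# Hypothetical sketch: merge a rendered control-plane patch into the base config.
talosctl machineconfig patch generated/controlplane.yaml \
  --patch @patch/node1.yaml \
  --output final/node1.yaml
```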
setup/cluster-nodes/patch.templates/worker.yaml (Normal file, 23 lines)
@@ -0,0 +1,23 @@
machine:
  install:
    disk: {{ index .cluster.nodes.active "{{NODE_NAME}}" "disk" }}
    image: factory.talos.dev/metal-installer/{{ .cluster.nodes.talos.schematicId }}:{{ .cluster.nodes.talos.version }}
  network:
    hostname: "{{NODE_NAME}}"
    interfaces:
      - interface: {{ index .cluster.nodes.active "{{NODE_NAME}}" "interface" }}
        dhcp: true
        addresses:
          - "{{NODE_IP}}/24"
        routes:
          - network: 0.0.0.0/0
            gateway: {{ .cloud.router.ip }}
  kubelet:
    extraMounts:
      - destination: /var/lib/longhorn
        type: bind
        source: /var/lib/longhorn
        options:
          - bind
          - rshared
          - rw
setup/cluster-nodes/talos-schemas.yaml (Normal file, 63 lines)
@@ -0,0 +1,63 @@
# Talos Version to Schematic ID Mappings
#
# This file contains mappings of Talos versions to their corresponding
# default schematic IDs for wild-cloud deployments.
#
# Schematic IDs are generated from factory.talos.dev and include
# common system extensions needed for typical hardware.
#
# To add new versions:
# 1. Go to https://factory.talos.dev/
# 2. Select the system extensions you need
# 3. Generate the schematic
# 4. Add the version and schematic ID below

# Format: Each schematic ID is the primary key with version and definition nested
"434a0300db532066f1098e05ac068159371d00f0aba0a3103a0e826e83825c82":
  schematic:
    customization:
      systemExtensions:
        officialExtensions:
          - siderolabs/gvisor
          - siderolabs/intel-ucode
          - siderolabs/iscsi-tools
          - siderolabs/util-linux-tools
"f309e674d9ad94655e2cf8a43ea1432475c717cd1885f596bd7ec852b900bc5b":
  schematic:
    customization:
      systemExtensions:
        officialExtensions:
          - siderolabs/gvisor
          - siderolabs/intel-ucode
          - siderolabs/iscsi-tools
          - siderolabs/nvidia-container-toolkit-lts
          - siderolabs/nvidia-container-toolkit-production
          - siderolabs/nvidia-fabricmanager-lts
          - siderolabs/nvidia-fabricmanager-production
          - siderolabs/nvidia-open-gpu-kernel-modules-lts
          - siderolabs/nvidia-open-gpu-kernel-modules-production
          - siderolabs/util-linux-tools
"56774e0894c8a3a3a9834a2aea65f24163cacf9506abbcbdc3ba135eaca4953f":
  schematic:
    customization:
      systemExtensions:
        officialExtensions:
          - siderolabs/gvisor
          - siderolabs/intel-ucode
          - siderolabs/iscsi-tools
          - siderolabs/nvidia-container-toolkit-production
          - siderolabs/nvidia-fabricmanager-production
          - siderolabs/nvidia-open-gpu-kernel-modules-production
          - siderolabs/util-linux-tools
"9ac1424dbdf4b964154a36780dbf2215bf17d2752cd0847fa3b81d7da761457f":
  schematic:
    customization:
      systemExtensions:
        officialExtensions:
          - siderolabs/gvisor
          - siderolabs/intel-ucode
          - siderolabs/iscsi-tools
          - siderolabs/nonfree-kmod-nvidia-production
          - siderolabs/nvidia-container-toolkit-production
          - siderolabs/nvidia-fabricmanager-production
          - siderolabs/util-linux-tools
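The IDs above can be reproduced against the Image Factory: POSTing a schematic to `https://factory.talos.dev/schematics` returns its ID. A sketch using the first customization in this file:

```bash
# Sketch: derive a schematic ID from a customization via the Image Factory API.
cat > /tmp/schematic.yaml <<'EOF'
customization:
  systemExtensions:
    officialExtensions:
      - siderolabs/gvisor
      - siderolabs/intel-ucode
      - siderolabs/iscsi-tools
      - siderolabs/util-linux-tools
EOF
curl -s -X POST --data-binary @/tmp/schematic.yaml https://factory.talos.dev/schematics
# Expected output: {"id":"434a0300db53..."} matching the first key above.
```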
setup/cluster-services/README.md (Normal file, 102 lines)
@@ -0,0 +1,102 @@
# Wild Cloud Cluster Services

Creates a fully functional personal cloud infrastructure on a bare-metal Kubernetes cluster that provides:

1. **External access** to services via configured domain names (using ${DOMAIN})
2. **Internal-only access** to admin interfaces (via internal.${DOMAIN} subdomains)
3. **Secure traffic routing** with automatic TLS
4. **Reliable networking** with proper load balancing

## Service Management

Wild Cloud uses a streamlined per-service setup approach:

**Primary Command**: `wild-service-setup <service> [options]`
- **Default**: Configure and deploy a service using existing templates
- **`--fetch`**: Fetch fresh templates before setup (for updates)
- **`--no-deploy`**: Configure only, skip deployment (for planning)

**Master Orchestrator**: `wild-setup-services`
- Sets up all services in proper dependency order
- Each service validates its prerequisites before deployment
- Fail-fast approach with clear recovery instructions

## Architecture

```
Internet → External DNS → MetalLB LoadBalancer → Traefik → Kubernetes Services
                                                    ↑
                                              Internal DNS
                                                    ↑
                                            Internal Network
```

## Key Components

- **[MetalLB](metallb/README.md)** - Provides load balancing for bare-metal clusters
- **[Traefik](traefik/README.md)** - Handles ingress traffic, TLS termination, and routing
- **[cert-manager](cert-manager/README.md)** - Manages TLS certificates
- **[CoreDNS](coredns/README.md)** - Provides DNS resolution for services
- **[ExternalDNS](externaldns/README.md)** - Automatic DNS record management
- **[Longhorn](longhorn/README.md)** - Distributed storage system for persistent volumes
- **[NFS](nfs/README.md)** - Network file system for shared media storage (optional)
- **[Kubernetes Dashboard](kubernetes-dashboard/README.md)** - Web UI for cluster management (accessible via https://dashboard.internal.${DOMAIN})
- **[Docker Registry](docker-registry/README.md)** - Private container registry for custom images
- **[Utils](utils/README.md)** - Cluster utilities and debugging tools

## Common Usage Patterns

### Complete Infrastructure Setup
```bash
# All services with fresh templates (recommended for first-time setup)
wild-setup-services --fetch

# All services using existing templates (fastest)
wild-setup-services

# Configure all services but don't deploy (for planning)
wild-setup-services --no-deploy
```

### Individual Service Management
```bash
# Most common - reconfigure and deploy an existing service
wild-service-setup cert-manager

# Get fresh templates, configure, and deploy (for updates)
wild-service-setup cert-manager --fetch

# Configure only, don't deploy (for planning)
wild-service-setup cert-manager --no-deploy
```

### Service Dependencies
Services are automatically deployed in dependency order:
1. **metallb** → Load balancing foundation
2. **traefik** → Ingress (requires metallb)
3. **cert-manager** → TLS certificates (requires traefik)
4. **externaldns** → DNS automation (requires cert-manager)
5. **kubernetes-dashboard** → Admin UI (requires cert-manager)

Each service validates its dependencies before deployment.

## Idempotent Design

All setup is designed to be idempotent and reliable:

- **Atomic Operations**: Each service handles its complete lifecycle
- **Dependency Validation**: Services check prerequisites before deployment
- **Error Recovery**: Failed services can be individually fixed and re-run
- **Safe Retries**: Operations can be repeated without harm
- **Incremental Updates**: Configuration changes are applied cleanly

Example recovery from a cert-manager failure:
```bash
# Fix the issue, then resume
wild-service-setup cert-manager --fetch
# Continue with remaining services
wild-service-setup externaldns --fetch
```
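A rough post-setup health pass over the components listed above might look like this sketch; the namespace names are assumptions based on the per-service manifests in this commit (the MetalLB and Traefik manifests are not shown here):

```bash
# Sketch: check that each service's deployments report Available.
for ns in metallb-system traefik cert-manager externaldns \
          kubernetes-dashboard longhorn-system docker-registry; do
  echo "== $ns =="
  kubectl get deploy -n "$ns" 2>/dev/null || echo "(namespace not found)"
done
```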
setup/cluster-services/cert-manager/README.md (Normal file, empty)
setup/cluster-services/cert-manager/install.sh (Executable file, 260 lines)
@@ -0,0 +1,260 @@
#!/bin/bash
set -e
set -o pipefail

# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
    echo "❌ ERROR: WILD_INSTANCE is not set"
    exit 1
fi

# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
    echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
    exit 1
fi

# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
    echo "❌ ERROR: KUBECONFIG is not set"
    exit 1
fi

INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
CERT_MANAGER_DIR="${CLUSTER_SETUP_DIR}/cert-manager"

echo "🔧 === Setting up cert-manager ==="
echo ""

#######################
# Dependencies
#######################

# Check the Traefik dependency
echo "🔍 Verifying Traefik is ready (required for cert-manager)..."
kubectl wait --for=condition=Available deployment/traefik -n traefik --timeout=60s 2>/dev/null || {
    echo "⚠️ Traefik not ready, but continuing with cert-manager installation"
    echo "💡 Note: cert-manager may not work properly without Traefik"
}

if [ ! -d "${CERT_MANAGER_DIR}/kustomize" ]; then
    echo "❌ ERROR: Compiled templates not found at ${CERT_MANAGER_DIR}/kustomize"
    echo "Templates should be compiled before deployment."
    exit 1
fi

# Note: DNS validation and Cloudflare token setup moved to the configuration phase.
# The configuration should be set via: wild config set cluster.certManager.cloudflare.*

########################
# Kubernetes components
########################

echo "📦 Installing cert-manager components..."
# Using a stable URL for the cert-manager installation
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml || \
    kubectl apply -f https://github.com/jetstack/cert-manager/releases/download/v1.13.1/cert-manager.yaml

# Wait for cert-manager to be ready
echo "⏳ Waiting for cert-manager to be ready..."
kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-cainjector -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s

# Create the Cloudflare API token secret.
# Read the token from the Wild Central secrets file.
echo "🔐 Creating Cloudflare API token secret..."
SECRETS_FILE="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}/secrets.yaml"
CLOUDFLARE_API_TOKEN=$(yq '.cloudflare.token' "$SECRETS_FILE" 2>/dev/null)

# Strip any surrounding quotes from the yq output (matches the externaldns installer).
CLOUDFLARE_API_TOKEN=$(echo "$CLOUDFLARE_API_TOKEN" | tr -d '"')
if [ -z "$CLOUDFLARE_API_TOKEN" ] || [ "$CLOUDFLARE_API_TOKEN" = "null" ]; then
    echo "❌ ERROR: Cloudflare API token not found"
    echo "💡 Please set: wild secret set cloudflare.token YOUR_TOKEN"
    exit 1
fi

kubectl create secret generic cloudflare-api-token \
    --namespace cert-manager \
    --from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \
    --dry-run=client -o yaml | kubectl apply -f -

# Ensure the webhook is fully operational
echo "🔍 Verifying cert-manager webhook is fully operational..."
until kubectl get validatingwebhookconfigurations cert-manager-webhook &>/dev/null; do
    echo "⏳ Waiting for cert-manager webhook to register..."
    sleep 5
done

# Configure cert-manager to use external DNS for challenge verification
echo "🌐 Configuring cert-manager to use external DNS servers..."
kubectl patch deployment cert-manager -n cert-manager --patch '
spec:
  template:
    spec:
      dnsPolicy: None
      dnsConfig:
        nameservers:
          - "1.1.1.1"
          - "8.8.8.8"
        searches:
          - cert-manager.svc.cluster.local
          - svc.cluster.local
          - cluster.local
        options:
          - name: ndots
            value: "5"'

# Wait for cert-manager to restart with the new DNS config
echo "⏳ Waiting for cert-manager to restart with new DNS configuration..."
kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s

########################
# Create issuers and certificates
########################

# Apply Let's Encrypt issuers and certificates using kustomize
echo "🚀 Creating Let's Encrypt issuers and certificates..."
kubectl apply -k "${CERT_MANAGER_DIR}/kustomize"

# Wait for the issuers to be ready
echo "⏳ Waiting for Let's Encrypt issuers to be ready..."
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-prod --timeout=60s || echo "⚠️ Production issuer not ready, proceeding anyway..."
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-staging --timeout=60s || echo "⚠️ Staging issuer not ready, proceeding anyway..."

# Give cert-manager a moment to process the certificates
sleep 5

######################################
# Fix stuck certificates and clean up
######################################

needs_restart=false

# STEP 1: Fix certificates stuck with 404 errors
echo "🔍 Checking for certificates with failed issuance attempts..."
stuck_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | \
    jq -r '.items[] | select(.status.conditions[]? | select(.type=="Issuing" and .status=="False" and (.message | contains("404")))) | "\(.metadata.namespace) \(.metadata.name)"')

if [ -n "$stuck_certs" ]; then
    echo "⚠️ Found certificates stuck with non-existent orders, recreating them..."
    echo "$stuck_certs" | while read ns name; do
        echo "🔄 Recreating certificate $ns/$name..."
        cert_spec=$(kubectl get certificate "$name" -n "$ns" -o json | jq '.spec')
        kubectl delete certificate "$name" -n "$ns"
        echo "{\"apiVersion\":\"cert-manager.io/v1\",\"kind\":\"Certificate\",\"metadata\":{\"name\":\"$name\",\"namespace\":\"$ns\"},\"spec\":$cert_spec}" | kubectl apply -f -
    done
    needs_restart=true
    sleep 5
else
    echo "✅ No certificates stuck with failed orders"
fi

# STEP 2: Clean up orphaned orders
echo "🔍 Checking for orphaned ACME orders..."
orphaned_orders=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null | \
    grep -E "failed to retrieve the ACME order.*404" 2>/dev/null | \
    sed -n 's/.*resource_name="\([^"]*\)".*/\1/p' | \
    sort -u || true)

if [ -n "$orphaned_orders" ]; then
    echo "⚠️ Found orphaned ACME orders from logs"
    for order in $orphaned_orders; do
        echo "🗑️ Deleting orphaned order: $order"
        orders_found=$(kubectl get orders --all-namespaces 2>/dev/null | grep "$order" 2>/dev/null || true)
        if [ -n "$orders_found" ]; then
            echo "$orders_found" | while read ns name rest; do
                kubectl delete order "$name" -n "$ns" 2>/dev/null || true
            done
        fi
    done
    needs_restart=true
else
    echo "✅ No orphaned orders found in logs"
fi

# STEP 2.5: Check for Cloudflare DNS cleanup errors
echo "🔍 Checking for Cloudflare DNS cleanup errors..."
cloudflare_errors=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null | \
    grep -c "Error: 7003.*Could not route" 2>/dev/null || echo "0")

if [ "$cloudflare_errors" -gt "0" ]; then
    echo "⚠️ Found $cloudflare_errors Cloudflare DNS cleanup errors (stale DNS record references)"
    echo "💡 Deleting stuck challenges and orders to allow a fresh start"

    # Delete all challenges and orders in the cert-manager namespace
    kubectl delete challenges --all -n cert-manager 2>/dev/null || true
    kubectl delete orders --all -n cert-manager 2>/dev/null || true

    needs_restart=true
else
    echo "✅ No Cloudflare DNS cleanup errors"
fi

# STEP 3: Single restart if anything needed cleaning
if [ "$needs_restart" = true ]; then
    echo "🔄 Restarting cert-manager to clear internal state..."
    kubectl rollout restart deployment cert-manager -n cert-manager
    kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
    echo "⏳ Waiting for cert-manager to recreate fresh challenges..."
    sleep 15
else
    echo "✅ No restart needed - cert-manager state is clean"
fi

#########################
# Final checks
#########################

# Wait for the certificates to be issued, with progress feedback
echo "⏳ Waiting for wildcard certificates to be ready (this may take several minutes)..."

# Function to wait for a certificate with progress output
wait_for_cert() {
    local cert_name="$1"
    local timeout=300
    local elapsed=0

    echo "  📜 Checking $cert_name..."

    while [ $elapsed -lt $timeout ]; do
        if kubectl get certificate "$cert_name" -n cert-manager -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null | grep -q "True"; then
            echo "  ✅ $cert_name is ready"
            return 0
        fi

        # Show progress every 30 seconds
        if [ $((elapsed % 30)) -eq 0 ] && [ $elapsed -gt 0 ]; then
            local status=$(kubectl get certificate "$cert_name" -n cert-manager -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "Waiting...")
            echo "  ⏳ Still waiting for $cert_name... ($elapsed/${timeout}s) - $status"
        fi

        sleep 5
        elapsed=$((elapsed + 5))
    done

    echo "  ⚠️ Timeout waiting for $cert_name (will continue anyway)"
    return 1
}

wait_for_cert "wildcard-internal-wild-cloud"
wait_for_cert "wildcard-wild-cloud"

# Final health check
echo "🔍 Performing final cert-manager health check..."
failed_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Ready" and .status!="True")) | "\(.metadata.namespace)/\(.metadata.name)"' | wc -l)
if [ "$failed_certs" -gt 0 ]; then
    echo "⚠️ Found $failed_certs certificates not in Ready state"
    echo "💡 Check certificate status with: kubectl get certificates --all-namespaces"
    echo "💡 Check cert-manager logs with: kubectl logs -n cert-manager deployment/cert-manager"
else
    echo "✅ All certificates are in Ready state"
fi

echo ""
echo "✅ cert-manager setup complete!"
echo ""
echo "💡 To verify the installation:"
echo "   kubectl get certificates --all-namespaces"
echo "   kubectl get clusterissuers"
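After this installer runs, the wiring it creates can be spot-checked with the objects it names; a minimal sketch:

```bash
# Sketch: verify the token secret, issuers, and wildcard certificates created above.
kubectl -n cert-manager get secret cloudflare-api-token
kubectl get clusterissuer letsencrypt-prod letsencrypt-staging
kubectl -n cert-manager get certificate wildcard-wild-cloud wildcard-internal-wild-cloud
```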
File diff suppressed because it is too large
@@ -0,0 +1,19 @@
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: wildcard-internal-wild-cloud
  namespace: cert-manager
spec:
  secretName: wildcard-internal-wild-cloud-tls
  dnsNames:
    - "*.{{ .cloud.internalDomain }}"
    - "{{ .cloud.internalDomain }}"
  issuerRef:
    name: letsencrypt-prod
    kind: ClusterIssuer
  duration: 2160h # 90 days
  renewBefore: 360h # 15 days
  privateKey:
    algorithm: RSA
    size: 2048
@@ -0,0 +1,12 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - namespace.yaml
  - letsencrypt-staging-dns01.yaml
  - letsencrypt-prod-dns01.yaml
  - internal-wildcard-certificate.yaml
  - wildcard-certificate.yaml

# Note: cert-manager.yaml contains the main installation manifests
# but is applied separately via URL in the install script
@@ -0,0 +1,25 @@
---
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt-prod
spec:
  acme:
    email: {{ .operator.email }}
    privateKeySecretRef:
      name: letsencrypt-prod
    server: https://acme-v02.api.letsencrypt.org/directory
    solvers:
      # DNS-01 solver for wildcard certificates
      - dns01:
          cloudflare:
            apiTokenSecretRef:
              name: cloudflare-api-token
              key: api-token
        selector:
          dnsZones:
            - "{{ .cluster.certManager.cloudflare.domain }}"
      # Keep the HTTP-01 solver for non-wildcard certificates
      - http01:
          ingress:
            class: traefik
@@ -0,0 +1,25 @@
---
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt-staging
spec:
  acme:
    email: {{ .operator.email }}
    privateKeySecretRef:
      name: letsencrypt-staging
    server: https://acme-staging-v02.api.letsencrypt.org/directory
    solvers:
      # DNS-01 solver for wildcard certificates
      - dns01:
          cloudflare:
            apiTokenSecretRef:
              name: cloudflare-api-token
              key: api-token
        selector:
          dnsZones:
            - "{{ .cluster.certManager.cloudflare.domain }}"
      # Keep the HTTP-01 solver for non-wildcard certificates
      - http01:
          ingress:
            class: traefik
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: cert-manager
@@ -0,0 +1,19 @@
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: wildcard-wild-cloud
  namespace: cert-manager
spec:
  secretName: wildcard-wild-cloud-tls
  dnsNames:
    - "*.{{ .cloud.domain }}"
    - "{{ .cloud.domain }}"
  issuerRef:
    name: letsencrypt-prod
    kind: ClusterIssuer
  duration: 2160h # 90 days
  renewBefore: 360h # 15 days
  privateKey:
    algorithm: RSA
    size: 2048
setup/cluster-services/cert-manager/wild-manifest.yaml (Normal file, 25 lines)
@@ -0,0 +1,25 @@
name: cert-manager
description: X.509 certificate management for Kubernetes
namespace: cert-manager
category: infrastructure

dependencies:
  - traefik

configReferences:
  - cloud.domain
  - cloud.baseDomain
  - cloud.internalDomain
  - operator.email

serviceConfig:
  cloudflareDomain:
    path: cluster.certManager.cloudflare.domain
    prompt: "Enter Cloudflare domain"
    default: "{{ .cloud.baseDomain }}"
    type: string
  cloudflareZoneID:
    path: cluster.certManager.cloudflare.zoneID
    prompt: "Enter Cloudflare zone ID"
    default: ""
    type: string
setup/cluster-services/common.sh (Normal file, 112 lines)
@@ -0,0 +1,112 @@
#!/bin/bash
# Common functions for Wild Central service installation scripts

# TODO: We should use this. :P

# Ensure required environment variables are set
if [ -z "${WILD_INSTANCE}" ]; then
    echo "❌ ERROR: WILD_INSTANCE environment variable is not set"
    exit 1
fi

if [ -z "${WILD_CENTRAL_DATA}" ]; then
    echo "❌ ERROR: WILD_CENTRAL_DATA environment variable is not set"
    exit 1
fi

# Get the instance directory path
get_instance_dir() {
    echo "${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
}

# Get the secrets file path
get_secrets_file() {
    echo "$(get_instance_dir)/secrets.yaml"
}

# Get the config file path
get_config_file() {
    echo "$(get_instance_dir)/config.yaml"
}

# Get a secret value from the secrets file
# Usage: get_secret "path.to.secret"
get_secret() {
    local path="$1"
    local secrets_file="$(get_secrets_file)"

    if [ ! -f "$secrets_file" ]; then
        echo ""
        return 1
    fi

    local value=$(yq ".$path" "$secrets_file" 2>/dev/null)

    # Remove quotes and return an empty string if null
    value=$(echo "$value" | tr -d '"')
    if [ "$value" = "null" ]; then
        echo ""
        return 1
    fi

    echo "$value"
}

# Get a config value from the config file
# Usage: get_config "path.to.config"
get_config() {
    local path="$1"
    local config_file="$(get_config_file)"

    if [ ! -f "$config_file" ]; then
        echo ""
        return 1
    fi

    local value=$(yq ".$path" "$config_file" 2>/dev/null)

    # Remove quotes and return an empty string if null
    value=$(echo "$value" | tr -d '"')
    if [ "$value" = "null" ]; then
        echo ""
        return 1
    fi

    echo "$value"
}

# Check that a secret exists and is not empty
# Usage: require_secret "path.to.secret" "Friendly Name" "wild secret set command"
require_secret() {
    local path="$1"
    local name="$2"
    local set_command="$3"

    local value=$(get_secret "$path")

    if [ -z "$value" ]; then
        echo "❌ ERROR: $name not found"
        echo "💡 Please set: $set_command"
        exit 1
    fi

    echo "$value"
}

# Check that a config value exists and is not empty
# Usage: require_config "path.to.config" "Friendly Name" "wild config set command"
require_config() {
    local path="$1"
    local name="$2"
    local set_command="$3"

    local value=$(get_config "$path")

    if [ -z "$value" ]; then
        echo "❌ ERROR: $name not found"
        echo "💡 Please set: $set_command"
        exit 1
    fi

    echo "$value"
}
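Given the TODO above, no installer sources this file yet; this is a hypothetical sketch of how one could use these helpers instead of inlining its own `yq` calls:

```bash
#!/bin/bash
# Hypothetical caller of common.sh (path is relative to a service's install.sh).
source "$(dirname "$0")/../common.sh"

CLOUDFLARE_API_TOKEN=$(require_secret "cloudflare.token" \
    "Cloudflare API token" "wild secret set cloudflare.token YOUR_TOKEN")
DOMAIN=$(require_config "cloud.domain" \
    "Cloud domain" "wild config set cloud.domain example.com")
echo "Deploying with domain ${DOMAIN}"
```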
setup/cluster-services/coredns/README.md (Normal file, 45 lines)
@@ -0,0 +1,45 @@
# CoreDNS

- https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/
- https://github.com/kubernetes/dns/blob/master/docs/specification.md
- https://coredns.io/

CoreDNS has the `kubernetes` plugin, so it returns all k8s service endpoints in the well-known format.

All services and pods are registered in CoreDNS:

- <service-name>.<namespace>.svc.cluster.local
- <service-name>.<namespace>
- <service-name> (if in the same namespace)

- <pod-ipv4-address>.<namespace>.pod.cluster.local
- <pod-ipv4-address>.<service-name>.<namespace>.svc.cluster.local

Any query for a resource in the `internal.$DOMAIN` domain will be answered with the IP of the Traefik proxy. We expose the CoreDNS server on the LAN via MetalLB just for this capability.

## Default CoreDNS Configuration

This is the default CoreDNS configuration, for reference:

```txt
.:53 {
    errors
    health { lameduck 5s }
    ready
    log . { class error }
    prometheus :9153
    kubernetes cluster.local in-addr.arpa ip6.arpa {
        pods insecure
        fallthrough in-addr.arpa ip6.arpa
        ttl 30
    }
    forward . /etc/resolv.conf { max_concurrent 1000 }
    cache 30 {
        disable success cluster.local
        disable denial cluster.local
    }
    loop
    reload
    loadbalance
}
```
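To see the internal-domain behavior from the LAN, query the exposed CoreDNS service directly; the IP and domain below are placeholders for your CoreDNS LoadBalancer address and `internal.$DOMAIN`:

```bash
# Sketch: any name under the internal domain should answer with the Traefik LB IP.
kubectl -n kube-system get svc coredns -o wide          # find the LoadBalancer IP
dig +short anything.internal.example.com @192.168.8.50  # placeholder IP and domain
```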
setup/cluster-services/coredns/install.sh (Executable file, 57 lines)
@@ -0,0 +1,57 @@
#!/bin/bash
set -e
set -o pipefail

# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
    echo "❌ ERROR: WILD_INSTANCE is not set"
    exit 1
fi

# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
    echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
    exit 1
fi

# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
    echo "❌ ERROR: KUBECONFIG is not set"
    exit 1
fi

INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
COREDNS_DIR="${CLUSTER_SETUP_DIR}/coredns"

echo "🔧 === Setting up CoreDNS ==="
echo ""

# Templates should already be compiled
echo "📦 Using pre-compiled CoreDNS templates..."
if [ ! -d "${COREDNS_DIR}/kustomize" ]; then
    echo "❌ ERROR: Compiled templates not found at ${COREDNS_DIR}/kustomize"
    echo "Templates should be compiled before deployment."
    exit 1
fi

# Apply the custom DNS override
# TODO: Is this needed now that we are no longer on k3s?
echo "🚀 Applying CoreDNS custom override configuration..."
kubectl apply -f "${COREDNS_DIR}/kustomize/coredns-custom-config.yaml"

echo "🔄 Restarting CoreDNS pods to apply changes..."
kubectl rollout restart deployment/coredns -n kube-system
echo "⏳ Waiting for CoreDNS rollout to complete..."
kubectl rollout status deployment/coredns -n kube-system

echo ""
echo "✅ CoreDNS configured successfully"
echo ""
echo "💡 To verify the installation:"
echo "   kubectl get pods -n kube-system -l k8s-app=kube-dns"
echo "   kubectl get svc -n kube-system coredns"
echo "   kubectl describe svc -n kube-system coredns"
echo ""
echo "📋 To view CoreDNS logs:"
echo "   kubectl logs -n kube-system -l k8s-app=kube-dns -f"
@@ -0,0 +1,28 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: coredns-custom
  namespace: kube-system
data:
  # Custom server block for internal domains. All internal domains should
  # resolve to the cluster proxy.
  internal.server: |
    {{ .cloud.internalDomain }} {
        errors
        cache 30
        reload
        template IN A {
            match (.*)\.{{ .cloud.internalDomain | strings.ReplaceAll "." "\\." }}\.
            answer "{{`{{ .Name }}`}} 60 IN A {{ .cluster.loadBalancerIp }}"
        }
        template IN AAAA {
            match (.*)\.{{ .cloud.internalDomain | strings.ReplaceAll "." "\\." }}\.
            rcode NXDOMAIN
        }
    }
  # Custom override to set external resolvers.
  external.override: |
    forward . {{ .cloud.dns.externalResolver }} {
        max_concurrent 1000
    }
setup/cluster-services/coredns/wild-manifest.yaml (Normal file, 15 lines)
@@ -0,0 +1,15 @@
name: coredns
description: DNS server for internal cluster DNS resolution
namespace: kube-system
category: infrastructure

configReferences:
  - cloud.internalDomain
  - cluster.loadBalancerIp

serviceConfig:
  externalResolver:
    path: cloud.dns.externalResolver
    prompt: "Enter external DNS resolver"
    default: "8.8.8.8"
    type: string
setup/cluster-services/docker-registry/README.md (Normal file, empty)
setup/cluster-services/docker-registry/install.sh (Executable file, 53 lines)
@@ -0,0 +1,53 @@
#!/bin/bash
set -e
set -o pipefail

# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
    echo "❌ ERROR: WILD_INSTANCE is not set"
    exit 1
fi

# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
    echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
    exit 1
fi

# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
    echo "❌ ERROR: KUBECONFIG is not set"
    exit 1
fi

INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
DOCKER_REGISTRY_DIR="${CLUSTER_SETUP_DIR}/docker-registry"

echo "🔧 === Setting up Docker Registry ==="
echo ""

# Templates should already be compiled
echo "📦 Using pre-compiled Docker Registry templates..."
if [ ! -d "${DOCKER_REGISTRY_DIR}/kustomize" ]; then
    echo "❌ ERROR: Compiled templates not found at ${DOCKER_REGISTRY_DIR}/kustomize"
    echo "Templates should be compiled before deployment."
    exit 1
fi

echo "🚀 Deploying Docker Registry..."
kubectl apply -k "${DOCKER_REGISTRY_DIR}/kustomize"

echo "⏳ Waiting for Docker Registry to be ready..."
kubectl wait --for=condition=available --timeout=300s deployment/docker-registry -n docker-registry

echo ""
echo "✅ Docker Registry installed successfully"
echo ""
echo "📊 Deployment status:"
kubectl get pods -n docker-registry
kubectl get services -n docker-registry
echo ""
echo "💡 To use the registry:"
echo "   docker tag myimage registry.local/myimage"
echo "   docker push registry.local/myimage"
@@ -0,0 +1,36 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docker-registry
  labels:
    app: docker-registry
spec:
  replicas: 1
  selector:
    matchLabels:
      app: docker-registry
  strategy:
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: docker-registry
    spec:
      containers:
        - image: registry:3.0.0
          name: docker-registry
          ports:
            - containerPort: 5000
              protocol: TCP
          volumeMounts:
            - mountPath: /var/lib/registry
              name: docker-registry-storage
              readOnly: false
      volumes:
        - name: docker-registry-storage
          persistentVolumeClaim:
            claimName: docker-registry-pvc
@@ -0,0 +1,20 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: docker-registry
spec:
  rules:
    - host: {{ .cloud.dockerRegistryHost }}
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: docker-registry
                port:
                  number: 5000
  tls:
    - hosts:
        - {{ .cloud.dockerRegistryHost }}
      secretName: wildcard-internal-wild-cloud-tls
@@ -0,0 +1,14 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: docker-registry
labels:
  - includeSelectors: true
    pairs:
      app: docker-registry
      managedBy: wild-cloud
resources:
  - deployment.yaml
  - ingress.yaml
  - service.yaml
  - namespace.yaml
  - pvc.yaml
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: docker-registry
@@ -0,0 +1,12 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: docker-registry-pvc
spec:
  storageClassName: longhorn
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  resources:
    requests:
      storage: {{ .cluster.dockerRegistry.storage }}
@@ -0,0 +1,13 @@
---
apiVersion: v1
kind: Service
metadata:
  name: docker-registry
  labels:
    app: docker-registry
spec:
  ports:
    - port: 5000
      targetPort: 5000
  selector:
    app: docker-registry
setup/cluster-services/docker-registry/wild-manifest.yaml (Normal file, 20 lines)
@@ -0,0 +1,20 @@
name: docker-registry
description: Private Docker image registry for the cluster
namespace: docker-registry
category: infrastructure

dependencies:
  - traefik
  - cert-manager

serviceConfig:
  registryHost:
    path: cloud.dockerRegistryHost
    prompt: "Enter Docker Registry hostname"
    default: "registry.{{ .cloud.internalDomain }}"
    type: string
  storage:
    path: cluster.dockerRegistry.storage
    prompt: "Enter Docker Registry storage size"
    default: "100Gi"
    type: string
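With the defaults above, the registry is served at `registry.<internalDomain>` through the TLS ingress; a usage sketch with a placeholder hostname (the install script's `registry.local` example would need the same substitution):

```bash
# Sketch: push through the ingress and list repositories via the registry v2 API.
# registry.internal.example.com stands in for your cloud.dockerRegistryHost value.
docker tag myimage:latest registry.internal.example.com/myimage:latest
docker push registry.internal.example.com/myimage:latest
curl -s https://registry.internal.example.com/v2/_catalog
```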
setup/cluster-services/externaldns/README.md (Normal file, 14 lines)
@@ -0,0 +1,14 @@
# External DNS

See: https://github.com/kubernetes-sigs/external-dns

ExternalDNS allows you to keep selected zones (via --domain-filter) synchronized with Ingresses, Services of type=LoadBalancer, and nodes in various DNS providers.

Currently, we are only configured to use Cloudflare.

Docs: https://github.com/kubernetes-sigs/external-dns/blob/master/docs/tutorials/cloudflare.md

Any Ingress that has metadata.annotations with
external-dns.alpha.kubernetes.io/hostname: `<something>.${DOMAIN}`

will have Cloudflare records created by ExternalDNS.
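For example, the annotation can be attached to an existing Ingress from the command line; the Ingress name and hostname here are placeholders:

```bash
# Sketch: annotate an Ingress so ExternalDNS creates its Cloudflare record.
kubectl annotate ingress myapp \
  "external-dns.alpha.kubernetes.io/hostname=app.example.com" --overwrite
kubectl logs -n externaldns -l app=external-dns --tail=20   # watch the record sync
```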
setup/cluster-services/externaldns/install.sh (Executable file, 79 lines)
@@ -0,0 +1,79 @@
#!/bin/bash
set -e
set -o pipefail

# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
    echo "❌ ERROR: WILD_INSTANCE is not set"
    exit 1
fi

# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
    echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
    exit 1
fi

# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
    echo "❌ ERROR: KUBECONFIG is not set"
    exit 1
fi

INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
EXTERNALDNS_DIR="${CLUSTER_SETUP_DIR}/externaldns"

echo "🌐 === Setting up ExternalDNS ==="
echo ""

# Check the cert-manager dependency
echo "🔍 Verifying cert-manager is ready (required for ExternalDNS)..."
kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=60s 2>/dev/null && \
    kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=60s 2>/dev/null || {
    echo "⚠️ cert-manager not ready, but continuing with ExternalDNS installation"
    echo "💡 Note: ExternalDNS may not work properly without cert-manager"
}

# Templates should already be compiled
echo "📦 Using pre-compiled ExternalDNS templates..."
if [ ! -d "${EXTERNALDNS_DIR}/kustomize" ]; then
    echo "❌ ERROR: Compiled templates not found at ${EXTERNALDNS_DIR}/kustomize"
    echo "Templates should be compiled before deployment."
    exit 1
fi

# Apply the ExternalDNS manifests using kustomize
echo "🚀 Deploying ExternalDNS..."
kubectl apply -k "${EXTERNALDNS_DIR}/kustomize"

# Set up the Cloudflare API token secret
echo "🔐 Creating Cloudflare API token secret..."
SECRETS_FILE="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}/secrets.yaml"
CLOUDFLARE_API_TOKEN=$(yq '.cloudflare.token' "$SECRETS_FILE" 2>/dev/null | tr -d '"')

if [ -z "$CLOUDFLARE_API_TOKEN" ] || [ "$CLOUDFLARE_API_TOKEN" = "null" ]; then
    echo "❌ ERROR: Cloudflare API token not found."
    echo "💡 Please set: wild secret set cloudflare.token YOUR_TOKEN"
    exit 1
fi
kubectl create secret generic cloudflare-api-token \
    --namespace externaldns \
    --from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \
    --dry-run=client -o yaml | kubectl apply -f -

# Wait for ExternalDNS to be ready
echo "⏳ Waiting for Cloudflare ExternalDNS to be ready..."
kubectl rollout status deployment/external-dns -n externaldns --timeout=60s

# echo "⏳ Waiting for CoreDNS ExternalDNS to be ready..."
# kubectl rollout status deployment/external-dns-coredns -n externaldns --timeout=60s

echo ""
echo "✅ ExternalDNS installed successfully"
echo ""
echo "💡 To verify the installation:"
echo "   kubectl get pods -n externaldns"
echo "   kubectl logs -n externaldns -l app=external-dns -f"
echo "   kubectl logs -n externaldns -l app=external-dns-coredns -f"
echo ""
@@ -0,0 +1,39 @@
---
# Cloudflare provider for ExternalDNS
apiVersion: apps/v1
kind: Deployment
metadata:
  name: external-dns
  namespace: externaldns
spec:
  selector:
    matchLabels:
      app: external-dns
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: external-dns
    spec:
      serviceAccountName: external-dns
      containers:
        - name: external-dns
          image: registry.k8s.io/external-dns/external-dns:v0.13.4
          args:
            - --source=service
            - --source=ingress
            - --txt-owner-id={{ .cluster.externalDns.ownerId }}
            - --provider=cloudflare
            - --domain-filter=payne.io
            #- --exclude-domains=internal.${DOMAIN}
            - --cloudflare-dns-records-per-page=5000
            - --publish-internal-services
            - --no-cloudflare-proxied
            - --log-level=debug
          env:
            - name: CF_API_TOKEN
              valueFrom:
                secretKeyRef:
                  name: cloudflare-api-token
                  key: api-token
@@ -0,0 +1,35 @@
---
# Common RBAC resources for all ExternalDNS deployments
apiVersion: v1
kind: ServiceAccount
metadata:
  name: external-dns
  namespace: externaldns
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: external-dns
rules:
  - apiGroups: [""]
    resources: ["services", "endpoints", "pods"]
    verbs: ["get", "watch", "list"]
  - apiGroups: ["extensions", "networking.k8s.io"]
    resources: ["ingresses"]
    verbs: ["get", "watch", "list"]
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: external-dns-viewer
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: external-dns
subjects:
  - kind: ServiceAccount
    name: external-dns
    namespace: externaldns
@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - namespace.yaml
  - externaldns-rbac.yaml
  - externaldns-cloudflare.yaml
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: externaldns
setup/cluster-services/externaldns/wild-manifest.yaml (Normal file, 15 lines)
@@ -0,0 +1,15 @@
name: externaldns
description: Automatically configures DNS records for services
namespace: externaldns
category: infrastructure

configReferences:
  - cloud.internalDomain
  - cluster.name

serviceConfig:
  ownerId:
    path: cluster.externalDns.ownerId
    prompt: "Enter ExternalDNS owner ID (unique identifier for this cluster)"
    default: "wild-cloud-{{ .cluster.name }}"
    type: string
setup/cluster-services/kubernetes-dashboard/install.sh (Executable file, 91 lines)
@@ -0,0 +1,91 @@
#!/bin/bash
set -e
set -o pipefail

# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
    echo "❌ ERROR: WILD_INSTANCE is not set"
    exit 1
fi

# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
    echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
    exit 1
fi

# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
    echo "❌ ERROR: KUBECONFIG is not set"
    exit 1
fi

INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
KUBERNETES_DASHBOARD_DIR="${CLUSTER_SETUP_DIR}/kubernetes-dashboard"

echo "🎮 === Setting up Kubernetes Dashboard ==="
echo ""

# Templates should already be compiled
echo "📦 Using pre-compiled Dashboard templates..."
if [ ! -d "${KUBERNETES_DASHBOARD_DIR}/kustomize" ]; then
    echo "❌ ERROR: Compiled templates not found at ${KUBERNETES_DASHBOARD_DIR}/kustomize"
    echo "Templates should be compiled before deployment."
    exit 1
fi

NAMESPACE="kubernetes-dashboard"

# Apply the official dashboard installation
echo "🚀 Installing Kubernetes Dashboard core components..."
kubectl apply -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.7.0/aio/deploy/recommended.yaml

# Wait for the cert-manager certificates to be ready
echo "🔐 Waiting for cert-manager certificates to be ready..."
kubectl wait --for=condition=Ready certificate wildcard-internal-wild-cloud -n cert-manager --timeout=300s || echo "⚠️ Warning: Internal wildcard certificate not ready yet"
kubectl wait --for=condition=Ready certificate wildcard-wild-cloud -n cert-manager --timeout=300s || echo "⚠️ Warning: Wildcard certificate not ready yet"

# Copy the cert-manager secrets to the dashboard namespace (if available)
echo "📋 Copying cert-manager secrets to dashboard namespace..."
if kubectl get secret wildcard-internal-wild-cloud-tls -n cert-manager >/dev/null 2>&1; then
    kubectl get secret wildcard-internal-wild-cloud-tls -n cert-manager -o yaml | \
        sed "s/namespace: cert-manager/namespace: ${NAMESPACE}/" | \
        kubectl apply -f -
else
    echo "⚠️ Warning: wildcard-internal-wild-cloud-tls secret not yet available"
fi

if kubectl get secret wildcard-wild-cloud-tls -n cert-manager >/dev/null 2>&1; then
    kubectl get secret wildcard-wild-cloud-tls -n cert-manager -o yaml | \
        sed "s/namespace: cert-manager/namespace: ${NAMESPACE}/" | \
        kubectl apply -f -
else
    echo "⚠️ Warning: wildcard-wild-cloud-tls secret not yet available"
fi

# Apply the dashboard customizations using kustomize
echo "🔧 Applying dashboard customizations..."
kubectl apply -k "${KUBERNETES_DASHBOARD_DIR}/kustomize"

# Restart CoreDNS to pick up the changes
# echo "🔄 Restarting CoreDNS to pick up DNS changes..."
# kubectl delete pods -n kube-system -l k8s-app=kube-dns

# Wait for the dashboard to be ready
echo "⏳ Waiting for Kubernetes Dashboard to be ready..."
kubectl rollout status deployment/kubernetes-dashboard -n "$NAMESPACE" --timeout=60s

echo ""
echo "✅ Kubernetes Dashboard installed successfully"
echo ""
# INTERNAL_DOMAIN should be available in the environment (set from config before deployment)
if [ -n "${INTERNAL_DOMAIN}" ]; then
    echo "🌐 Access the dashboard at: https://dashboard.${INTERNAL_DOMAIN}"
else
    echo "💡 Access the dashboard via the configured internal domain"
fi
echo ""
echo "💡 To get the authentication token:"
echo "   kubectl create token dashboard-admin -n kubernetes-dashboard"
echo ""
@@ -0,0 +1,32 @@
---
# Service Account and RBAC for Dashboard admin access
apiVersion: v1
kind: ServiceAccount
metadata:
  name: dashboard-admin
  namespace: kubernetes-dashboard

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: dashboard-admin
subjects:
  - kind: ServiceAccount
    name: dashboard-admin
    namespace: kubernetes-dashboard
roleRef:
  kind: ClusterRole
  name: cluster-admin
  apiGroup: rbac.authorization.k8s.io

---
# Token for dashboard-admin
apiVersion: v1
kind: Secret
metadata:
  name: dashboard-admin-token
  namespace: kubernetes-dashboard
  annotations:
    kubernetes.io/service-account.name: dashboard-admin
type: kubernetes.io/service-account-token
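Because the Secret above is a long-lived service-account token, the dashboard login token can be read back from it directly, or a short-lived one can be minted; a sketch:

```bash
# Sketch: retrieve the persistent dashboard-admin token, or mint a temporary one.
kubectl -n kubernetes-dashboard get secret dashboard-admin-token \
  -o jsonpath='{.data.token}' | base64 -d; echo
kubectl -n kubernetes-dashboard create token dashboard-admin   # short-lived alternative
```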
@@ -0,0 +1,84 @@
---
# Internal-only middleware
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  name: internal-only
  namespace: kubernetes-dashboard
spec:
  ipWhiteList:
    # Restrict to local private network ranges
    sourceRange:
      - 127.0.0.1/32    # localhost
      - 10.0.0.0/8      # Private network
      - 172.16.0.0/12   # Private network
      - 192.168.0.0/16  # Private network

---
# HTTPS redirect middleware
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  name: dashboard-redirect-scheme
  namespace: kubernetes-dashboard
spec:
  redirectScheme:
    scheme: https
    permanent: true

---
# IngressRoute for the Dashboard
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: kubernetes-dashboard-https
  namespace: kubernetes-dashboard
spec:
  entryPoints:
    - websecure
  routes:
    - match: Host(`dashboard.{{ .cloud.internalDomain }}`)
      kind: Rule
      middlewares:
        - name: internal-only
          namespace: kubernetes-dashboard
      services:
        - name: kubernetes-dashboard
          port: 443
          serversTransport: dashboard-transport
  tls:
    secretName: wildcard-internal-wild-cloud-tls

---
# HTTP to HTTPS redirect.
# FIXME: Is this needed?
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: kubernetes-dashboard-http
  namespace: kubernetes-dashboard
spec:
  entryPoints:
    - web
  routes:
    - match: Host(`dashboard.{{ .cloud.internalDomain }}`)
      kind: Rule
      middlewares:
        - name: dashboard-redirect-scheme
          namespace: kubernetes-dashboard
      services:
        - name: kubernetes-dashboard
          port: 443
          serversTransport: dashboard-transport

---
# ServersTransport for an HTTPS backend with skip-verify.
# FIXME: Is this needed?
apiVersion: traefik.io/v1alpha1
kind: ServersTransport
metadata:
  name: dashboard-transport
  namespace: kubernetes-dashboard
spec:
  insecureSkipVerify: true
  serverName: dashboard.{{ .cloud.internalDomain }}
@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - dashboard-admin-rbac.yaml
  - dashboard-kube-system.yaml
@@ -0,0 +1,11 @@
name: kubernetes-dashboard
description: Web-based Kubernetes user interface
namespace: kubernetes-dashboard
category: infrastructure

dependencies:
  - traefik
  - cert-manager

configReferences:
  - cloud.internalDomain
setup/cluster-services/longhorn/README.md (Normal file, 20 lines)
@@ -0,0 +1,20 @@
|
||||
# Longhorn Storage
|
||||
|
||||
See: [Longhorn Docs v 1.8.1](https://longhorn.io/docs/1.8.1/deploy/install/install-with-kubectl/)
|
||||
|
||||
## Installation Notes
|
||||
|
||||
- Manifest copied from https://raw.githubusercontent.com/longhorn/longhorn/v1.8.1/deploy/longhorn.yaml
|
||||
- Using kustomize to apply custom configuration (see `kustomization.yaml`)
|
||||
|
||||
## Important Settings
|
||||
|
||||
- **Number of Replicas**: Set to 1 (default is 3) to accommodate smaller clusters
|
||||
- This avoids "degraded" volumes when fewer than 3 nodes are available
|
||||
- For production with 3+ nodes, consider changing back to 3 for better availability
|
||||
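
For reference, this is roughly what the replica override amounts to. A minimal sketch of a Longhorn StorageClass pinned to a single replica, assuming the stock `driver.longhorn.io` provisioner and the `numberOfReplicas` StorageClass parameter (the actual override in this repo lives in the kustomization, not in this exact form):

```yaml
# Sketch only: a Longhorn StorageClass provisioning one replica per volume.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: longhorn
provisioner: driver.longhorn.io
allowVolumeExpansion: true
parameters:
  numberOfReplicas: "1"      # Longhorn's default is "3"
  staleReplicaTimeout: "30"  # minutes before a failed replica is cleaned up
```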

## Common Operations

- View volumes: `kubectl get volumes.longhorn.io -n longhorn-system`
- Check volume status: `kubectl describe volumes.longhorn.io <volume-name> -n longhorn-system`
- Access Longhorn UI: Set up port-forwarding with `kubectl -n longhorn-system port-forward service/longhorn-frontend 8080:80`
52
setup/cluster-services/longhorn/install.sh
Executable file
@@ -0,0 +1,52 @@
#!/bin/bash
set -e
set -o pipefail

# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
  echo "❌ ERROR: WILD_INSTANCE is not set"
  exit 1
fi

# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
  echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
  exit 1
fi

# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
  echo "❌ ERROR: KUBECONFIG is not set"
  exit 1
fi

INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
LONGHORN_DIR="${CLUSTER_SETUP_DIR}/longhorn"

echo "🔧 === Setting up Longhorn ==="
echo ""

# Templates should already be compiled
echo "📦 Using pre-compiled Longhorn templates..."
if [ ! -d "${LONGHORN_DIR}/kustomize" ]; then
  echo "❌ ERROR: Compiled templates not found at ${LONGHORN_DIR}/kustomize"
  echo "Templates should be compiled before deployment."
  exit 1
fi

echo "🚀 Deploying Longhorn..."
kubectl apply -k "${LONGHORN_DIR}/kustomize/"

echo "⏳ Waiting for Longhorn to be ready..."
kubectl wait --for=condition=available --timeout=300s deployment/longhorn-driver-deployer -n longhorn-system || true

echo ""
echo "✅ Longhorn installed successfully"
echo ""
echo "💡 To verify the installation:"
echo "  kubectl get pods -n longhorn-system"
echo "  kubectl get storageclass"
echo ""
echo "🌐 To access the Longhorn UI:"
echo "  kubectl port-forward -n longhorn-system svc/longhorn-frontend 8080:80"
@@ -0,0 +1,21 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: longhorn-ingress
  namespace: longhorn-system
spec:
  rules:
    - host: "longhorn.{{ .cloud.internalDomain }}"
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: longhorn-frontend
                port:
                  number: 80
  tls:
    - secretName: wildcard-internal-wild-cloud-tls
      hosts:
        - "longhorn.{{ .cloud.internalDomain }}"
@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - longhorn.yaml
  - ingress.yaml
5189
setup/cluster-services/longhorn/kustomize.template/longhorn.yaml
Normal file
File diff suppressed because it is too large
7
setup/cluster-services/longhorn/wild-manifest.yaml
Normal file
@@ -0,0 +1,7 @@
name: longhorn
description: Cloud-native distributed block storage for Kubernetes
namespace: longhorn-system
category: infrastructure

dependencies:
  - traefik
0
setup/cluster-services/metallb/README.md
Normal file
56
setup/cluster-services/metallb/install.sh
Executable file
@@ -0,0 +1,56 @@
#!/bin/bash
set -e
set -o pipefail

# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
  echo "❌ ERROR: WILD_INSTANCE is not set"
  exit 1
fi

# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
  echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
  exit 1
fi

# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
  echo "❌ ERROR: KUBECONFIG is not set"
  exit 1
fi

INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
METALLB_DIR="${CLUSTER_SETUP_DIR}/metallb"

echo "🔧 === Setting up MetalLB ==="
echo ""

# Templates should already be compiled
echo "📦 Using pre-compiled MetalLB templates..."
if [ ! -d "${METALLB_DIR}/kustomize" ]; then
  echo "❌ ERROR: Compiled templates not found at ${METALLB_DIR}/kustomize"
  echo "Templates should be compiled before deployment."
  exit 1
fi

echo "🚀 Deploying MetalLB installation..."
kubectl apply -k "${METALLB_DIR}/kustomize/installation"

echo "⏳ Waiting for MetalLB controller to be ready..."
kubectl wait --for=condition=Available deployment/controller -n metallb-system --timeout=60s
echo "⏳ Extra buffer for webhook initialization..."
sleep 10

echo "⚙️ Applying MetalLB configuration..."
kubectl apply -k "${METALLB_DIR}/kustomize/configuration"

echo ""
echo "✅ MetalLB installed and configured successfully"
echo ""
echo "💡 To verify the installation:"
echo "  kubectl get pods -n metallb-system"
echo "  kubectl get ipaddresspools.metallb.io -n metallb-system"
echo ""
echo "🌐 MetalLB will now provide LoadBalancer IPs for your services"
@@ -0,0 +1,3 @@
namespace: metallb-system
resources:
  - pool.yaml
@@ -0,0 +1,19 @@
---
apiVersion: metallb.io/v1beta1
kind: IPAddressPool
metadata:
  name: first-pool
  namespace: metallb-system
spec:
  addresses:
    - {{ .cluster.ipAddressPool }}

---
apiVersion: metallb.io/v1beta1
kind: L2Advertisement
metadata:
  name: l2-advertisement
  namespace: metallb-system
spec:
  ipAddressPools:
    - first-pool
@@ -0,0 +1,3 @@
namespace: metallb-system
resources:
  - github.com/metallb/metallb/config/native?ref=v0.15.0
19
setup/cluster-services/metallb/wild-manifest.yaml
Normal file
@@ -0,0 +1,19 @@
name: metallb
description: Bare-metal load balancer for Kubernetes
namespace: metallb-system
category: infrastructure

configReferences:
  - cluster.name

serviceConfig:
  ipRange:
    path: cluster.ipAddressPool
    prompt: "Enter IP range for MetalLB (e.g., 192.168.1.240-192.168.1.250)"
    default: "192.168.1.240-192.168.1.250"
    type: string
  loadBalancerIp:
    path: cluster.loadBalancerIp
    prompt: "Enter primary load balancer IP"
    default: "192.168.1.240"
    type: string
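
For illustration, answering the two prompts above would land values like these in the instance `config.yaml` (hypothetical values; the keys come from the `path` fields of this manifest):

```yaml
# Hypothetical resulting config.yaml fragment
cluster:
  ipAddressPool: 192.168.1.240-192.168.1.250  # written via cluster.ipAddressPool
  loadBalancerIp: 192.168.1.240               # written via cluster.loadBalancerIp
```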
60
setup/cluster-services/nfs/README.md
Normal file
@@ -0,0 +1,60 @@
# NFS Setup (Optional)

The infrastructure supports optional NFS (Network File System) for shared media storage across the cluster. If your config.yaml contains the `cloud.nfs` section, the NFS server will be set up automatically.

## Host Setup

First, set up the NFS server on your chosen host:

```bash
./setup-nfs-host.sh <host> <media-path>
```

Example:

```bash
./setup-nfs-host.sh box-01 /srv/nfs
```

## Cluster Integration

Add to your `config.yaml`:

```yaml
cloud:
  nfs:
    host: box-01
    mediaPath: /srv/nfs
    storageCapacity: 250Gi  # Max size for PersistentVolume
```

Now you can run the NFS cluster setup:

```bash
setup/cluster-services/nfs/install.sh
```

## Features

- Automatic IP detection - uses the network IP even when the hostname resolves to localhost
- Cluster-wide access - any pod can mount the NFS share regardless of node placement
- Configurable capacity - set the PersistentVolume size via `cloud.nfs.storageCapacity`
- ReadWriteMany - multiple pods can simultaneously access the same storage

## Usage

Applications can use NFS storage by setting `storageClassName: nfs` in their PVCs:

```yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: media-pvc
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: nfs
  resources:
    requests:
      storage: 100Gi
```
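
For illustration, a pod could then mount that claim like this (a minimal sketch; the image and mount path are arbitrary):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: media-reader
spec:
  containers:
    - name: app
      image: busybox
      command: ["sh", "-c", "ls /media && sleep 3600"]
      volumeMounts:
        - name: media
          mountPath: /media  # shared NFS-backed media directory
  volumes:
    - name: media
      persistentVolumeClaim:
        claimName: media-pvc  # the PVC defined above
```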
255
setup/cluster-services/nfs/install.sh
Executable file
@@ -0,0 +1,255 @@
#!/bin/bash
set -e
set -o pipefail

# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
  echo "❌ ERROR: WILD_INSTANCE is not set"
  exit 1
fi

# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
  echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
  exit 1
fi

# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
  echo "❌ ERROR: KUBECONFIG is not set"
  exit 1
fi

INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CONFIG_FILE="${INSTANCE_DIR}/config.yaml"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
NFS_DIR="${CLUSTER_SETUP_DIR}/nfs"

echo "💾 === Registering NFS Server with Kubernetes Cluster ==="
echo ""

# Templates should already be compiled
echo "📦 Using pre-compiled NFS templates..."
if [ ! -d "${NFS_DIR}/kustomize" ]; then
  echo "❌ ERROR: Compiled templates not found at ${NFS_DIR}/kustomize"
  echo "Templates should be compiled before deployment."
  exit 1
fi

NFS_HOST="$(yq '.cloud.nfs.host' "${CONFIG_FILE}" 2>/dev/null | tr -d '"')"
NFS_MEDIA_PATH="$(yq '.cloud.nfs.mediaPath' "${CONFIG_FILE}" 2>/dev/null | tr -d '"')"
NFS_STORAGE_CAPACITY="$(yq '.cloud.nfs.storageCapacity' "${CONFIG_FILE}" 2>/dev/null | tr -d '"')"

echo "📋 NFS Configuration:"
echo "  Host: ${NFS_HOST}"
echo "  Media path: ${NFS_MEDIA_PATH}"
echo "  Storage capacity: ${NFS_STORAGE_CAPACITY}"
echo ""

# Validate required config values
if [ -z "${NFS_HOST}" ] || [ "${NFS_HOST}" = "null" ]; then
  echo "❌ ERROR: cloud.nfs.host not set in config"
  exit 1
fi
if [ -z "${NFS_MEDIA_PATH}" ] || [ "${NFS_MEDIA_PATH}" = "null" ]; then
  echo "❌ ERROR: cloud.nfs.mediaPath not set in config"
  exit 1
fi
if [ -z "${NFS_STORAGE_CAPACITY}" ] || [ "${NFS_STORAGE_CAPACITY}" = "null" ]; then
  echo "❌ ERROR: cloud.nfs.storageCapacity not set in config"
  exit 1
fi

# Function to resolve NFS host to IP
resolve_nfs_host() {
  echo "🌐 Resolving NFS host: ${NFS_HOST}"
  if [[ "${NFS_HOST}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
    # NFS_HOST is already an IP address
    NFS_HOST_IP="${NFS_HOST}"
    echo "  ℹ️ Host is already an IP address"
  else
    # Resolve hostname to IP
    echo "  🔍 Looking up hostname..."
    NFS_HOST_IP=$(getent hosts "${NFS_HOST}" 2>/dev/null | awk '{print $1}' | head -n1 || true)
    echo "  📍 Resolved to: ${NFS_HOST_IP}"
    if [[ -z "${NFS_HOST_IP}" ]]; then
      echo "❌ ERROR: Unable to resolve hostname ${NFS_HOST} to IP address"
      echo "💡 Make sure ${NFS_HOST} is resolvable from this cluster"
      exit 1
    fi

    # Check if resolved IP is localhost - auto-detect network IP instead
    if [[ "${NFS_HOST_IP}" =~ ^127\. ]]; then
      echo "⚠️ Warning: ${NFS_HOST} resolves to localhost (${NFS_HOST_IP})"
      echo "🔍 Auto-detecting network IP for cluster access..."

      # Try to find the primary network interface IP (exclude docker/k8s networks)
      local network_ip=$(ip route get 8.8.8.8 | grep -oP 'src \K\S+' 2>/dev/null)

      if [[ -n "${network_ip}" && ! "${network_ip}" =~ ^127\. ]]; then
        echo "✅ Using detected network IP: ${network_ip}"
        NFS_HOST_IP="${network_ip}"
      else
        echo "❌ Could not auto-detect network IP. Available IPs:"
        ip addr show | grep "inet " | grep -v "127.0.0.1" | grep -v "10.42" | grep -v "172." | awk '{print "  " $2}' | cut -d/ -f1
        echo "💡 Please set NFS_HOST to the correct IP address manually."
        exit 1
      fi
    fi
  fi

  echo "🌐 NFS server IP: ${NFS_HOST_IP}"
  export NFS_HOST_IP
}

# Function to test NFS accessibility
test_nfs_accessibility() {
  echo ""
  echo "🔍 Testing NFS accessibility from cluster..."

  # Check if showmount is available
  if ! command -v showmount >/dev/null 2>&1; then
    echo "📦 Installing NFS client tools..."
    if command -v apt-get >/dev/null 2>&1; then
      sudo apt-get update && sudo apt-get install -y nfs-common
    elif command -v yum >/dev/null 2>&1; then
      sudo yum install -y nfs-utils
    elif command -v dnf >/dev/null 2>&1; then
      sudo dnf install -y nfs-utils
    else
      echo "⚠️ Warning: Unable to install NFS client tools. Skipping accessibility test."
      return 0
    fi
  fi

  # Test if we can reach the NFS server
  echo "🌐 Testing connection to NFS server..."
  if timeout 10 showmount -e "${NFS_HOST_IP}" >/dev/null 2>&1; then
    echo "✅ NFS server is accessible"
    echo "📋 Available exports:"
    showmount -e "${NFS_HOST_IP}"
  else
    echo "❌ Cannot connect to NFS server at ${NFS_HOST_IP}"
    echo "💡 Make sure:"
    echo "  1. NFS server is running on ${NFS_HOST}"
    echo "  2. Network connectivity exists between cluster and NFS host"
    echo "  3. Firewall allows NFS traffic (port 2049)"
    exit 1
  fi

  # Test specific export
  if showmount -e "${NFS_HOST_IP}" | grep -q "${NFS_MEDIA_PATH}"; then
    echo "✅ Media path ${NFS_MEDIA_PATH} is exported"
  else
    echo "❌ Media path ${NFS_MEDIA_PATH} is not found in exports"
    echo "📋 Available exports:"
    showmount -e "${NFS_HOST_IP}"
    echo ""
    echo "💡 Run setup-nfs-host.sh on ${NFS_HOST} to configure the export"
    exit 1
  fi
}

# Function to create test mount
test_nfs_mount() {
  echo ""
  echo "🔧 Testing NFS mount functionality..."

  local test_mount="/tmp/nfs-test-$$"
  mkdir -p "${test_mount}"

  # Try to mount the NFS export
  if timeout 30 sudo mount -t nfs4 "${NFS_HOST_IP}:${NFS_MEDIA_PATH}" "${test_mount}"; then
    echo "✅ NFS mount successful"

    # Test read access
    if ls "${test_mount}" >/dev/null 2>&1; then
      echo "✅ NFS read access working"
    else
      echo "❌ NFS read access failed"
    fi

    # Unmount
    sudo umount "${test_mount}" || echo "⚠️ Warning: Failed to unmount test directory"
  else
    echo "❌ NFS mount failed"
    echo "💡 Check NFS server configuration and network connectivity"
    exit 1
  fi

  # Clean up
  rmdir "${test_mount}" 2>/dev/null || true
}

# Function to create Kubernetes resources
create_k8s_resources() {
  echo ""
  echo "🚀 Creating Kubernetes NFS resources..."

  # Apply the NFS Kubernetes manifests using kustomize (templates already processed)
  echo "📦 Applying NFS manifests..."
  kubectl apply -k "${NFS_DIR}/kustomize"

  echo "✅ NFS PersistentVolume and StorageClass created"

  # Verify resources were created
  echo "🔍 Verifying Kubernetes resources..."
  if kubectl get storageclass nfs >/dev/null 2>&1; then
    echo "✅ StorageClass 'nfs' created"
  else
    echo "❌ StorageClass 'nfs' not found"
    exit 1
  fi

  if kubectl get pv nfs-media-pv >/dev/null 2>&1; then
    echo "✅ PersistentVolume 'nfs-media-pv' created"
    kubectl get pv nfs-media-pv
  else
    echo "❌ PersistentVolume 'nfs-media-pv' not found"
    exit 1
  fi
}

# Function to show usage instructions
show_usage_instructions() {
  echo ""
  echo "✅ === NFS Kubernetes Setup Complete ==="
  echo ""
  echo "💾 NFS server ${NFS_HOST} (${NFS_HOST_IP}) has been registered with the cluster"
  echo ""
  echo "📋 Kubernetes resources created:"
  echo "  - StorageClass: nfs"
  echo "  - PersistentVolume: nfs-media-pv (${NFS_STORAGE_CAPACITY}, ReadWriteMany)"
  echo ""
  echo "💡 To use NFS storage in your applications:"
  echo "  1. Set storageClassName: nfs in your PVC"
  echo "  2. Use accessMode: ReadWriteMany for shared access"
  echo ""
  echo "📝 Example PVC:"
  echo "---"
  echo "apiVersion: v1"
  echo "kind: PersistentVolumeClaim"
  echo "metadata:"
  echo "  name: my-nfs-pvc"
  echo "spec:"
  echo "  accessModes:"
  echo "    - ReadWriteMany"
  echo "  storageClassName: nfs"
  echo "  resources:"
  echo "    requests:"
  echo "      storage: 10Gi"
  echo ""
}

# Main execution
main() {
  resolve_nfs_host
  test_nfs_accessibility
  test_nfs_mount
  create_k8s_resources
  show_usage_instructions
}

# Run main function
echo "🔧 Starting NFS setup process..."
main "$@"
@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - persistent-volume.yaml
  - storage-class.yaml
@@ -0,0 +1,23 @@
apiVersion: v1
kind: PersistentVolume
metadata:
  name: nfs-media-pv
  labels:
    storage: nfs-media
spec:
  capacity:
    storage: {{ .cloud.nfs.storageCapacity }}
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  storageClassName: nfs
  nfs:
    server: {{ .cloud.nfs.host }}
    path: {{ .cloud.nfs.mediaPath }}
  mountOptions:
    - nfsvers=4.1
    - rsize=1048576
    - wsize=1048576
    - hard
    - intr
    - timeo=600
@@ -0,0 +1,10 @@
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: nfs
provisioner: nfs
parameters:
  server: {{ .cloud.nfs.host }}
  path: {{ .cloud.nfs.mediaPath }}
reclaimPolicy: Retain
allowVolumeExpansion: true
306
setup/cluster-services/nfs/setup-nfs-host.sh
Executable file
@@ -0,0 +1,306 @@
#!/bin/bash
set -e
set -o pipefail

# Navigate to script directory
SCRIPT_PATH="$(realpath "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(dirname "$SCRIPT_PATH")"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

usage() {
  echo "Usage: setup-nfs-host.sh [server] [media-path] [options]"
  echo ""
  echo "Set up NFS server on the specified host."
  echo ""
  echo "Examples:"
  echo "  setup-nfs-host.sh box-01 /data/media"
  echo ""
  echo "Options:"
  echo "  -h, --help            Show this help message"
  echo "  -e, --export-options  Set the NFS export options"
}

# Parse arguments
while [[ $# -gt 0 ]]; do
  case $1 in
    -h|--help)
      usage
      exit 0
      ;;
    -e|--export-options)
      if [[ -z "$2" ]]; then
        echo "Error: --export-options requires a value"
        exit 1
      else
        NFS_EXPORT_OPTIONS="$2"
      fi
      shift 2
      ;;
    -*)
      echo "Unknown option $1"
      usage
      exit 1
      ;;
    *)
      # First non-option argument is server
      if [[ -z "$NFS_HOST" ]]; then
        export NFS_HOST="$1"
      # Second non-option argument is media path
      elif [[ -z "$NFS_MEDIA_PATH" ]]; then
        export NFS_MEDIA_PATH="$1"
      else
        echo "Too many arguments"
        usage
        exit 1
      fi
      shift
      ;;
  esac
done

echo "Setting up NFS server on this host..."

# Check if required NFS variables are configured
if [[ -z "${NFS_HOST}" ]]; then
  echo "NFS_HOST not set. Please set NFS_HOST=<hostname> in your environment"
  echo "Example: export NFS_HOST=box-01"
  exit 1
fi

# Ensure NFS_MEDIA_PATH is explicitly set
if [[ -z "${NFS_MEDIA_PATH}" ]]; then
  echo "Error: NFS_MEDIA_PATH not set. Please set it in your environment"
  echo "Example: export NFS_MEDIA_PATH=/data/media"
  exit 1
fi

# Set default for NFS_EXPORT_OPTIONS if not already set
if [[ -z "${NFS_EXPORT_OPTIONS}" ]]; then
  export NFS_EXPORT_OPTIONS="*(rw,sync,no_subtree_check,no_root_squash)"
  echo "Using default NFS_EXPORT_OPTIONS: ${NFS_EXPORT_OPTIONS}"
fi

echo "Target NFS host: ${NFS_HOST}"
echo "Media path: ${NFS_MEDIA_PATH}"
echo "Export options: ${NFS_EXPORT_OPTIONS}"

# Function to check if we're running on the correct host
check_host() {
  local current_hostname=$(hostname)
  if [[ "${current_hostname}" != "${NFS_HOST}" ]]; then
    echo "Warning: Current host (${current_hostname}) differs from NFS_HOST (${NFS_HOST})"
    echo "This script should be run on ${NFS_HOST}"
    read -p "Continue anyway? (y/N): " -n 1 -r
    echo
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
      exit 1
    fi
  fi
}

# Function to install NFS server and SMB/CIFS
install_nfs_server() {
  echo "Installing NFS server and SMB/CIFS packages..."

  # Detect package manager and install NFS server + Samba
  if command -v apt-get >/dev/null 2>&1; then
    # Debian/Ubuntu
    sudo apt-get update
    sudo apt-get install -y nfs-kernel-server nfs-common samba samba-common-bin
  elif command -v yum >/dev/null 2>&1; then
    # RHEL/CentOS
    sudo yum install -y nfs-utils samba samba-client
  elif command -v dnf >/dev/null 2>&1; then
    # Fedora
    sudo dnf install -y nfs-utils samba samba-client
  else
    echo "Error: Unable to detect package manager. Please install NFS server and Samba manually."
    exit 1
  fi
}

# Function to create media directory
create_media_directory() {
  echo "Creating media directory: ${NFS_MEDIA_PATH}"

  # Create directory if it doesn't exist
  sudo mkdir -p "${NFS_MEDIA_PATH}"

  # Set appropriate permissions
  # Using 755 for directory, allowing read/execute for all, write for owner
  sudo chmod 755 "${NFS_MEDIA_PATH}"

  echo "Media directory created with appropriate permissions"
  echo "Directory info:"
  ls -la "${NFS_MEDIA_PATH}/"
}

# Function to configure NFS exports
configure_nfs_exports() {
  echo "Configuring NFS exports..."

  local export_line="${NFS_MEDIA_PATH} ${NFS_EXPORT_OPTIONS}"
  local exports_file="/etc/exports"

  # Backup existing exports file
  sudo cp "${exports_file}" "${exports_file}.backup.$(date +%Y%m%d-%H%M%S)" 2>/dev/null || true

  # Check if export already exists
  if sudo grep -q "^${NFS_MEDIA_PATH}" "${exports_file}" 2>/dev/null; then
    echo "Export for ${NFS_MEDIA_PATH} already exists, updating..."
    sudo sed -i "s|^${NFS_MEDIA_PATH}.*|${export_line}|" "${exports_file}"
  else
    echo "Adding new export for ${NFS_MEDIA_PATH}..."
    echo "${export_line}" | sudo tee -a "${exports_file}"
  fi

  # Export the filesystems
  sudo exportfs -rav

  echo "NFS exports configured:"
  sudo exportfs -v
}

# Function to start and enable NFS services
start_nfs_services() {
  echo "Starting NFS services..."

  # Start and enable NFS server
  sudo systemctl enable nfs-server
  sudo systemctl start nfs-server

  # Also enable related services
  sudo systemctl enable rpcbind
  sudo systemctl start rpcbind

  echo "NFS services started and enabled"

  # Show service status
  sudo systemctl status nfs-server --no-pager --lines=5
}

# Function to configure SMB/CIFS sharing
configure_smb_sharing() {
  echo "Configuring SMB/CIFS sharing..."

  local smb_config="/etc/samba/smb.conf"
  local share_name="media"

  # Backup existing config
  sudo cp "${smb_config}" "${smb_config}.backup.$(date +%Y%m%d-%H%M%S)" 2>/dev/null || true

  # Check if share already exists
  if sudo grep -q "^\[${share_name}\]" "${smb_config}" 2>/dev/null; then
    echo "SMB share '${share_name}' already exists, updating..."
    # Remove existing share section
    sudo sed -i "/^\[${share_name}\]/,/^\[/{ /^\[${share_name}\]/d; /^\[/!d; }" "${smb_config}"
  fi

  # Add media share configuration
  cat << EOF | sudo tee -a "${smb_config}"

[${share_name}]
    comment = Media files for Wild Cloud
    path = ${NFS_MEDIA_PATH}
    browseable = yes
    read only = no
    guest ok = yes
    create mask = 0664
    directory mask = 0775
    force user = $(whoami)
    force group = $(whoami)
EOF

  echo "SMB share configuration added"

  # Test configuration
  if sudo testparm -s >/dev/null 2>&1; then
    echo "✓ SMB configuration is valid"
  else
    echo "✗ SMB configuration has errors"
    sudo testparm
    exit 1
  fi
}

# Function to start SMB services
start_smb_services() {
  echo "Starting SMB services..."

  # Enable and start Samba services
  sudo systemctl enable smbd
  sudo systemctl start smbd
  sudo systemctl enable nmbd
  sudo systemctl start nmbd

  echo "SMB services started and enabled"

  # Show service status
  sudo systemctl status smbd --no-pager --lines=3
}

# Function to test NFS setup
test_nfs_setup() {
  echo "Testing NFS setup..."

  # Test if NFS is responding
  if command -v showmount >/dev/null 2>&1; then
    echo "Available NFS exports:"
    showmount -e localhost || echo "Warning: showmount failed, but NFS may still be working"
  fi

  # Check if the export directory is accessible
  if [[ -d "${NFS_MEDIA_PATH}" ]]; then
    echo "✓ Media directory exists and is accessible"
  else
    echo "✗ Media directory not accessible"
    exit 1
  fi
}

# Function to show usage instructions
show_usage_instructions() {
  echo
  echo "=== NFS/SMB Host Setup Complete ==="
  echo
  echo "NFS and SMB servers are now running on this host with media directory: ${NFS_MEDIA_PATH}"
  echo
  echo "Access methods:"
  echo "1. NFS (for Kubernetes): run setup/cluster-services/nfs/install.sh to register with the cluster"
  echo "2. SMB/CIFS (for Windows): \\\\${NFS_HOST}\\media"
  echo
  echo "To add media files:"
  echo "- Copy directly to: ${NFS_MEDIA_PATH}"
  echo "- Or mount the SMB share from Windows and copy there"
  echo
  echo "Windows SMB mount:"
  echo "- Open File Explorer"
  echo "- Map network drive to: \\\\${NFS_HOST}\\media"
  echo "- Or use: \\\\$(hostname -I | awk '{print $1}')\\media"
  echo
  echo "To verify services:"
  echo "- NFS: showmount -e ${NFS_HOST}"
  echo "- SMB: smbclient -L ${NFS_HOST} -N"
  echo "- Status: systemctl status nfs-server smbd"
  echo
  echo "Current NFS exports:"
  sudo exportfs -v
  echo
}

# Main execution
main() {
  check_host
  install_nfs_server
  create_media_directory
  configure_nfs_exports
  start_nfs_services
  configure_smb_sharing
  start_smb_services
  test_nfs_setup
  show_usage_instructions
}

# Run main function
main "$@"
21
setup/cluster-services/nfs/wild-manifest.yaml
Normal file
@@ -0,0 +1,21 @@
name: nfs
description: NFS client provisioner for external NFS storage
namespace: nfs-system
category: infrastructure

serviceConfig:
  nfsHost:
    path: cloud.nfs.host
    prompt: "Enter NFS server hostname or IP address"
    default: "192.168.1.100"
    type: string
  mediaPath:
    path: cloud.nfs.mediaPath
    prompt: "Enter NFS export path for media storage"
    default: "/mnt/storage/media"
    type: string
  storageCapacity:
    path: cloud.nfs.storageCapacity
    prompt: "Enter NFS storage capacity (e.g., 1Ti, 500Gi)"
    default: "1Ti"
    type: string
52
setup/cluster-services/node-feature-discovery/install.sh
Executable file
@@ -0,0 +1,52 @@
#!/bin/bash
set -e
set -o pipefail

# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
  echo "ERROR: WILD_INSTANCE is not set"
  exit 1
fi

# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
  echo "ERROR: WILD_CENTRAL_DATA is not set"
  exit 1
fi

# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
  echo "ERROR: KUBECONFIG is not set"
  exit 1
fi

INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
NFD_DIR="${CLUSTER_SETUP_DIR}/node-feature-discovery"

echo "🔧 === Setting up Node Feature Discovery ==="
echo ""

# Templates should already be compiled
echo "📦 Using pre-compiled Node Feature Discovery templates..."
if [ ! -d "${NFD_DIR}/kustomize" ]; then
  echo "❌ ERROR: Compiled templates not found at ${NFD_DIR}/kustomize"
  echo "Templates should be compiled before deployment."
  exit 1
fi

echo "🚀 Deploying Node Feature Discovery..."
kubectl apply -k "${NFD_DIR}/kustomize"

echo "⏳ Waiting for Node Feature Discovery DaemonSet to be ready..."
kubectl rollout status daemonset/node-feature-discovery-worker -n node-feature-discovery --timeout=300s

echo ""
echo "✅ Node Feature Discovery installed successfully"
echo ""
echo "💡 To verify the installation:"
echo "  kubectl get pods -n node-feature-discovery"
echo "  kubectl get nodes --show-labels | grep feature.node.kubernetes.io"
echo ""
echo "🎮 GPU nodes should now be labeled with GPU device information:"
echo "  kubectl get nodes --show-labels | grep pci-10de"
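
For illustration, a workload could target such a labeled node like this (a sketch, assuming NFD's standard `feature.node.kubernetes.io/pci-<vendor>.present` labels; 10de is NVIDIA's PCI vendor ID):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-workload
spec:
  nodeSelector:
    # NFD sets this label on nodes that expose an NVIDIA PCI device
    feature.node.kubernetes.io/pci-10de.present: "true"
  containers:
    - name: app
      image: busybox
      command: ["sleep", "3600"]
```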
@@ -0,0 +1,711 @@
|
||||
---
|
||||
apiVersion: apiextensions.k8s.io/v1
|
||||
kind: CustomResourceDefinition
|
||||
metadata:
|
||||
annotations:
|
||||
controller-gen.kubebuilder.io/version: v0.16.3
|
||||
name: nodefeatures.nfd.k8s-sigs.io
|
||||
spec:
|
||||
group: nfd.k8s-sigs.io
|
||||
names:
|
||||
kind: NodeFeature
|
||||
listKind: NodeFeatureList
|
||||
plural: nodefeatures
|
||||
singular: nodefeature
|
||||
scope: Namespaced
|
||||
versions:
|
||||
- name: v1alpha1
|
||||
schema:
|
||||
openAPIV3Schema:
|
||||
description: |-
|
||||
NodeFeature resource holds the features discovered for one node in the
|
||||
cluster.
|
||||
properties:
|
||||
apiVersion:
|
||||
description: |-
|
||||
APIVersion defines the versioned schema of this representation of an object.
|
||||
Servers should convert recognized schemas to the latest internal value, and
|
||||
may reject unrecognized values.
|
||||
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
|
||||
type: string
|
||||
kind:
|
||||
description: |-
|
||||
Kind is a string value representing the REST resource this object represents.
|
||||
Servers may infer this from the endpoint the client submits requests to.
|
||||
Cannot be updated.
|
||||
In CamelCase.
|
||||
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
|
||||
type: string
|
||||
metadata:
|
||||
type: object
|
||||
spec:
|
||||
description: Specification of the NodeFeature, containing features discovered
|
||||
for a node.
|
||||
properties:
|
||||
features:
|
||||
description: Features is the full "raw" features data that has been
|
||||
discovered.
|
||||
properties:
|
||||
attributes:
|
||||
additionalProperties:
|
||||
description: AttributeFeatureSet is a set of features having
|
||||
string value.
|
||||
properties:
|
||||
elements:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: Individual features of the feature set.
|
||||
type: object
|
||||
required:
|
||||
- elements
|
||||
type: object
|
||||
description: Attributes contains all the attribute-type features
|
||||
of the node.
|
||||
type: object
|
||||
flags:
|
||||
additionalProperties:
|
||||
description: FlagFeatureSet is a set of simple features only
|
||||
containing names without values.
|
||||
properties:
|
||||
elements:
|
||||
additionalProperties:
|
||||
description: |-
|
||||
Nil is a dummy empty struct for protobuf compatibility.
|
||||
NOTE: protobuf definitions have been removed but this is kept for API compatibility.
|
||||
type: object
|
||||
description: Individual features of the feature set.
|
||||
type: object
|
||||
required:
|
||||
- elements
|
||||
type: object
|
||||
description: Flags contains all the flag-type features of the
|
||||
node.
|
||||
type: object
|
||||
instances:
|
||||
additionalProperties:
|
||||
description: InstanceFeatureSet is a set of features each of
|
||||
which is an instance having multiple attributes.
|
||||
properties:
|
||||
elements:
|
||||
description: Individual features of the feature set.
|
||||
items:
|
||||
description: InstanceFeature represents one instance of
|
||||
a complex features, e.g. a device.
|
||||
properties:
|
||||
attributes:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: Attributes of the instance feature.
|
||||
type: object
|
||||
required:
|
||||
- attributes
|
||||
type: object
|
||||
type: array
|
||||
required:
|
||||
- elements
|
||||
type: object
|
||||
description: Instances contains all the instance-type features
|
||||
of the node.
|
||||
type: object
|
||||
type: object
|
||||
labels:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: Labels is the set of node labels that are requested to
|
||||
be created.
|
||||
type: object
|
||||
type: object
|
||||
required:
|
||||
- spec
|
||||
type: object
|
||||
served: true
|
||||
storage: true
|
||||
---
|
||||
apiVersion: apiextensions.k8s.io/v1
|
||||
kind: CustomResourceDefinition
|
||||
metadata:
|
||||
annotations:
|
||||
controller-gen.kubebuilder.io/version: v0.16.3
|
||||
name: nodefeaturegroups.nfd.k8s-sigs.io
|
||||
spec:
|
||||
group: nfd.k8s-sigs.io
|
||||
names:
|
||||
kind: NodeFeatureGroup
|
||||
listKind: NodeFeatureGroupList
|
||||
plural: nodefeaturegroups
|
||||
shortNames:
|
||||
- nfg
|
||||
singular: nodefeaturegroup
|
||||
scope: Namespaced
|
||||
versions:
|
||||
- name: v1alpha1
|
||||
schema:
|
||||
openAPIV3Schema:
|
||||
description: NodeFeatureGroup resource holds Node pools by featureGroup
|
||||
properties:
|
||||
apiVersion:
|
||||
description: |-
|
||||
APIVersion defines the versioned schema of this representation of an object.
|
||||
Servers should convert recognized schemas to the latest internal value, and
|
||||
may reject unrecognized values.
|
||||
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
|
||||
type: string
|
||||
kind:
|
||||
description: |-
|
||||
Kind is a string value representing the REST resource this object represents.
|
||||
Servers may infer this from the endpoint the client submits requests to.
|
||||
Cannot be updated.
|
||||
In CamelCase.
|
||||
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
|
||||
type: string
|
||||
metadata:
|
||||
type: object
|
||||
spec:
|
||||
description: Spec defines the rules to be evaluated.
|
||||
properties:
|
||||
featureGroupRules:
|
||||
description: List of rules to evaluate to determine nodes that belong
|
||||
in this group.
|
||||
items:
|
||||
description: GroupRule defines a rule for nodegroup filtering.
|
||||
properties:
|
||||
matchAny:
|
||||
description: MatchAny specifies a list of matchers one of which
|
||||
must match.
|
||||
items:
|
||||
description: MatchAnyElem specifies one sub-matcher of MatchAny.
|
||||
properties:
|
||||
matchFeatures:
|
||||
description: MatchFeatures specifies a set of matcher
|
||||
terms all of which must match.
|
||||
items:
|
||||
description: |-
|
||||
FeatureMatcherTerm defines requirements against one feature set. All
|
||||
requirements (specified as MatchExpressions) are evaluated against each
|
||||
element in the feature set.
|
||||
properties:
|
||||
feature:
|
||||
description: Feature is the name of the feature
|
||||
set to match against.
|
||||
type: string
|
||||
matchExpressions:
|
||||
additionalProperties:
|
||||
description: |-
|
||||
MatchExpression specifies an expression to evaluate against a set of input
|
||||
values. It contains an operator that is applied when matching the input and
|
||||
an array of values that the operator evaluates the input against.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
description: |-
|
||||
MatchExpressions is the set of per-element expressions evaluated. These
|
||||
match against the value of the specified elements.
|
||||
type: object
|
||||
matchName:
|
||||
description: |-
|
||||
MatchName in an expression that is matched against the name of each
|
||||
element in the feature set.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
required:
|
||||
- feature
|
||||
type: object
|
||||
type: array
|
||||
required:
|
||||
- matchFeatures
|
||||
type: object
|
||||
type: array
|
||||
matchFeatures:
|
||||
description: MatchFeatures specifies a set of matcher terms
|
||||
all of which must match.
|
||||
items:
|
||||
description: |-
|
||||
FeatureMatcherTerm defines requirements against one feature set. All
|
||||
requirements (specified as MatchExpressions) are evaluated against each
|
||||
element in the feature set.
|
||||
properties:
|
||||
feature:
|
||||
description: Feature is the name of the feature set to
|
||||
match against.
|
||||
type: string
|
||||
matchExpressions:
|
||||
additionalProperties:
|
||||
description: |-
|
||||
MatchExpression specifies an expression to evaluate against a set of input
|
||||
values. It contains an operator that is applied when matching the input and
|
||||
an array of values that the operator evaluates the input against.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
description: |-
|
||||
MatchExpressions is the set of per-element expressions evaluated. These
|
||||
match against the value of the specified elements.
|
||||
type: object
|
||||
matchName:
|
||||
description: |-
|
||||
MatchName in an expression that is matched against the name of each
|
||||
element in the feature set.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
required:
|
||||
- feature
|
||||
type: object
|
||||
type: array
|
||||
name:
|
||||
description: Name of the rule.
|
||||
type: string
|
||||
required:
|
||||
- name
|
||||
type: object
|
||||
type: array
|
||||
required:
|
||||
- featureGroupRules
|
||||
type: object
|
||||
status:
|
||||
description: |-
|
||||
Status of the NodeFeatureGroup after the most recent evaluation of the
|
||||
specification.
|
||||
properties:
|
||||
nodes:
|
||||
description: Nodes is a list of FeatureGroupNode in the cluster that
|
||||
match the featureGroupRules
|
||||
items:
|
||||
properties:
|
||||
name:
|
||||
description: Name of the node.
|
||||
type: string
|
||||
required:
|
||||
- name
|
||||
type: object
|
||||
type: array
|
||||
x-kubernetes-list-map-keys:
|
||||
- name
|
||||
x-kubernetes-list-type: map
|
||||
type: object
|
||||
required:
|
||||
- spec
|
||||
type: object
|
||||
served: true
|
||||
storage: true
|
||||
subresources:
|
||||
status: {}
|
||||
---
|
||||
apiVersion: apiextensions.k8s.io/v1
|
||||
kind: CustomResourceDefinition
|
||||
metadata:
|
||||
annotations:
|
||||
controller-gen.kubebuilder.io/version: v0.16.3
|
||||
name: nodefeaturerules.nfd.k8s-sigs.io
|
||||
spec:
|
||||
group: nfd.k8s-sigs.io
|
||||
names:
|
||||
kind: NodeFeatureRule
|
||||
listKind: NodeFeatureRuleList
|
||||
plural: nodefeaturerules
|
||||
shortNames:
|
||||
- nfr
|
||||
singular: nodefeaturerule
|
||||
scope: Cluster
|
||||
versions:
|
||||
- name: v1alpha1
|
||||
schema:
|
||||
openAPIV3Schema:
|
||||
description: |-
|
||||
NodeFeatureRule resource specifies a configuration for feature-based
|
||||
customization of node objects, such as node labeling.
|
||||
properties:
|
||||
apiVersion:
|
||||
description: |-
|
||||
APIVersion defines the versioned schema of this representation of an object.
|
||||
Servers should convert recognized schemas to the latest internal value, and
|
||||
may reject unrecognized values.
|
||||
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
|
||||
type: string
|
||||
kind:
|
||||
description: |-
|
||||
Kind is a string value representing the REST resource this object represents.
|
||||
Servers may infer this from the endpoint the client submits requests to.
|
||||
Cannot be updated.
|
||||
In CamelCase.
|
||||
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
|
||||
type: string
|
||||
metadata:
|
||||
type: object
|
||||
spec:
|
||||
description: Spec defines the rules to be evaluated.
|
||||
properties:
|
||||
rules:
|
||||
description: Rules is a list of node customization rules.
|
||||
items:
|
||||
description: Rule defines a rule for node customization such as
|
||||
labeling.
|
||||
properties:
|
||||
annotations:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: Annotations to create if the rule matches.
|
||||
type: object
|
||||
extendedResources:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: ExtendedResources to create if the rule matches.
|
||||
type: object
|
||||
labels:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: Labels to create if the rule matches.
|
||||
type: object
|
||||
labelsTemplate:
|
||||
description: |-
|
||||
LabelsTemplate specifies a template to expand for dynamically generating
|
||||
multiple labels. Data (after template expansion) must be keys with an
|
||||
optional value (<key>[=<value>]) separated by newlines.
|
||||
type: string
|
||||
matchAny:
|
||||
description: MatchAny specifies a list of matchers one of which
|
||||
must match.
|
||||
items:
|
||||
description: MatchAnyElem specifies one sub-matcher of MatchAny.
|
||||
properties:
|
||||
matchFeatures:
|
||||
description: MatchFeatures specifies a set of matcher
|
||||
terms all of which must match.
|
||||
items:
|
||||
description: |-
|
||||
FeatureMatcherTerm defines requirements against one feature set. All
|
||||
requirements (specified as MatchExpressions) are evaluated against each
|
||||
element in the feature set.
|
||||
properties:
|
||||
feature:
|
||||
description: Feature is the name of the feature
|
||||
set to match against.
|
||||
type: string
|
||||
matchExpressions:
|
||||
additionalProperties:
|
||||
description: |-
|
||||
MatchExpression specifies an expression to evaluate against a set of input
|
||||
values. It contains an operator that is applied when matching the input and
|
||||
an array of values that the operator evaluates the input against.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
description: |-
|
||||
MatchExpressions is the set of per-element expressions evaluated. These
|
||||
match against the value of the specified elements.
|
||||
type: object
|
||||
matchName:
|
||||
description: |-
|
||||
MatchName in an expression that is matched against the name of each
|
||||
element in the feature set.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
required:
|
||||
- feature
|
||||
type: object
|
||||
type: array
|
||||
required:
|
||||
- matchFeatures
|
||||
type: object
|
||||
type: array
|
||||
matchFeatures:
|
||||
description: MatchFeatures specifies a set of matcher terms
|
||||
all of which must match.
|
||||
items:
|
||||
description: |-
|
||||
FeatureMatcherTerm defines requirements against one feature set. All
|
||||
requirements (specified as MatchExpressions) are evaluated against each
|
||||
element in the feature set.
|
||||
properties:
|
||||
feature:
|
||||
description: Feature is the name of the feature set to
|
||||
match against.
|
||||
type: string
|
||||
matchExpressions:
|
||||
additionalProperties:
|
||||
description: |-
|
||||
MatchExpression specifies an expression to evaluate against a set of input
|
||||
values. It contains an operator that is applied when matching the input and
|
||||
an array of values that the operator evaluates the input against.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
description: |-
|
||||
MatchExpressions is the set of per-element expressions evaluated. These
|
||||
match against the value of the specified elements.
|
||||
type: object
|
||||
matchName:
|
||||
description: |-
|
||||
MatchName in an expression that is matched against the name of each
|
||||
element in the feature set.
|
||||
properties:
|
||||
op:
|
||||
description: Op is the operator to be applied.
|
||||
enum:
|
||||
- In
|
||||
- NotIn
|
||||
- InRegexp
|
||||
- Exists
|
||||
- DoesNotExist
|
||||
- Gt
|
||||
- Lt
|
||||
- GtLt
|
||||
- IsTrue
|
||||
- IsFalse
|
||||
type: string
|
||||
value:
|
||||
description: |-
|
||||
Value is the list of values that the operand evaluates the input
|
||||
against. Value should be empty if the operator is Exists, DoesNotExist,
|
||||
IsTrue or IsFalse. Value should contain exactly one element if the
|
||||
operator is Gt or Lt and exactly two elements if the operator is GtLt.
|
||||
In other cases Value should contain at least one element.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- op
|
||||
type: object
|
||||
required:
|
||||
- feature
|
||||
type: object
|
||||
type: array
|
||||
name:
|
||||
description: Name of the rule.
|
||||
type: string
|
||||
taints:
|
||||
description: Taints to create if the rule matches.
|
||||
items:
|
||||
description: |-
|
||||
The node this Taint is attached to has the "effect" on
|
||||
any pod that does not tolerate the Taint.
|
||||
properties:
|
||||
effect:
|
||||
description: |-
|
||||
Required. The effect of the taint on pods
|
||||
that do not tolerate the taint.
|
||||
Valid effects are NoSchedule, PreferNoSchedule and NoExecute.
|
||||
type: string
|
||||
key:
|
||||
description: Required. The taint key to be applied to
|
||||
a node.
|
||||
type: string
|
||||
timeAdded:
|
||||
description: |-
|
||||
TimeAdded represents the time at which the taint was added.
|
||||
It is only written for NoExecute taints.
|
||||
format: date-time
|
||||
type: string
|
||||
value:
|
||||
description: The taint value corresponding to the taint
|
||||
key.
|
||||
type: string
|
||||
required:
|
||||
- effect
|
||||
- key
|
||||
type: object
|
||||
type: array
|
||||
vars:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: |-
|
||||
Vars is the variables to store if the rule matches. Variables do not
|
||||
directly inflict any changes in the node object. However, they can be
|
||||
referenced from other rules enabling more complex rule hierarchies,
|
||||
without exposing intermediary output values as labels.
|
||||
type: object
|
||||
varsTemplate:
|
||||
description: |-
|
||||
VarsTemplate specifies a template to expand for dynamically generating
|
||||
multiple variables. Data (after template expansion) must be keys with an
|
||||
optional value (<key>[=<value>]) separated by newlines.
|
||||
type: string
|
||||
required:
|
||||
- name
|
||||
type: object
|
||||
type: array
|
||||
required:
|
||||
- rules
|
||||
type: object
|
||||
required:
|
||||
- spec
|
||||
type: object
|
||||
served: true
|
||||
storage: true
|
||||
@@ -0,0 +1,86 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-feature-discovery-worker
  namespace: node-feature-discovery
spec:
  selector:
    matchLabels:
      name: node-feature-discovery-worker
  template:
    metadata:
      labels:
        name: node-feature-discovery-worker
    spec:
      serviceAccountName: node-feature-discovery
      securityContext:
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: worker
          image: registry.k8s.io/nfd/node-feature-discovery:v0.17.3
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
            readOnlyRootFilesystem: true
            runAsNonRoot: true
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          resources:
            limits:
              memory: 512Mi
            requests:
              cpu: 5m
              memory: 64Mi
          command:
            - "nfd-worker"
          args:
            - "-metrics=8081"
            - "-grpc-health=8082"
          ports:
            - containerPort: 8081
              name: metrics
            - containerPort: 8082
              name: health
          volumeMounts:
            - name: host-boot
              mountPath: "/host-boot"
              readOnly: true
            - name: host-os-release
              mountPath: "/host-etc/os-release"
              readOnly: true
            - name: host-sys
              mountPath: "/host-sys"
              readOnly: true
            - name: host-usr-lib
              mountPath: "/host-usr/lib"
              readOnly: true
            - name: host-lib
              mountPath: "/host-lib"
              readOnly: true
            - name: host-proc-swaps
              mountPath: "/host-proc/swaps"
              readOnly: true
      volumes:
        - name: host-boot
          hostPath:
            path: "/boot"
        - name: host-os-release
          hostPath:
            path: "/etc/os-release"
        - name: host-sys
          hostPath:
            path: "/sys"
        - name: host-usr-lib
          hostPath:
            path: "/usr/lib"
        - name: host-lib
          hostPath:
            path: "/lib"
        - name: host-proc-swaps
          hostPath:
            path: "/proc/swaps"
@@ -0,0 +1,14 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: node-feature-discovery
labels:
  - pairs:
      app.kubernetes.io/name: node-feature-discovery
      managedBy: kustomize
      partOf: wild-cloud
resources:
  - namespace.yaml
  - crds.yaml
  - rbac.yaml
  - daemonset.yaml
  - master.yaml
@@ -0,0 +1,49 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: node-feature-discovery-master
  namespace: node-feature-discovery
spec:
  replicas: 1
  selector:
    matchLabels:
      name: node-feature-discovery-master
  template:
    metadata:
      labels:
        name: node-feature-discovery-master
    spec:
      serviceAccountName: node-feature-discovery
      securityContext:
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: master
          image: registry.k8s.io/nfd/node-feature-discovery:v0.17.3
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
            readOnlyRootFilesystem: true
            runAsNonRoot: true
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          command:
            - "nfd-master"
          args:
            - "-metrics=8081"
            - "-grpc-health=8082"
          ports:
            - containerPort: 8081
              name: metrics
            - containerPort: 8082
              name: health
          resources:
            requests:
              cpu: 10m
              memory: 64Mi
            limits:
              memory: 128Mi
@@ -0,0 +1,8 @@
apiVersion: v1
kind: Namespace
metadata:
  name: node-feature-discovery
  labels:
    pod-security.kubernetes.io/enforce: privileged
    pod-security.kubernetes.io/audit: privileged
    pod-security.kubernetes.io/warn: privileged
@@ -0,0 +1,55 @@
apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-feature-discovery
  namespace: node-feature-discovery
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: node-feature-discovery
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/status
    verbs:
      - get
      - patch
      - update
      - list
  - apiGroups:
      - ""
    resources:
      - namespaces
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - nfd.k8s-sigs.io
    resources:
      - nodefeatures
      - nodefeaturerules
      - nodefeaturegroups
    verbs:
      - get
      - list
      - watch
      - create
      - update
      - patch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: node-feature-discovery
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: node-feature-discovery
subjects:
  - kind: ServiceAccount
    name: node-feature-discovery
    namespace: node-feature-discovery
@@ -0,0 +1,4 @@
name: node-feature-discovery
description: Detects hardware features available on each node
namespace: node-feature-discovery
category: infrastructure
98
setup/cluster-services/nvidia-device-plugin/README.md
Normal file
@@ -0,0 +1,98 @@
# NVIDIA Device Plugin

The NVIDIA Device Plugin for Kubernetes enables GPU scheduling and resource management on nodes with NVIDIA GPUs.

## Overview

This service deploys the official NVIDIA Device Plugin as a DaemonSet that:

- Discovers NVIDIA GPUs on worker nodes
- Labels nodes with GPU product information (e.g., `nvidia.com/gpu.product=GeForce-RTX-4090`)
- Advertises GPU resources (`nvidia.com/gpu`) to the Kubernetes scheduler
- Enables pods to request GPU resources

## Prerequisites

Before installing the NVIDIA Device Plugin, ensure that:

1. **NVIDIA Drivers** are installed (>= 384.81)
2. **nvidia-container-toolkit** is installed (>= 1.7.0)
3. **nvidia-container-runtime** is configured as the default container runtime
4. Worker nodes have NVIDIA GPUs

### Talos Linux Requirements

For Talos Linux nodes, you need (see the sketch after this list):

- NVIDIA drivers extension in the Talos schematic
- nvidia-container-toolkit extension
- Proper container runtime configuration
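
A minimal Talos Image Factory schematic covering these extensions might look like the following; the exact extension names are assumptions and should be verified against the extensions catalog for your Talos version:

```yaml
# Illustrative schematic (not part of this repo); verify the extension
# names against the Talos Image Factory catalog for your Talos version.
customization:
  systemExtensions:
    officialExtensions:
      - siderolabs/nonfree-kmod-nvidia-production
      - siderolabs/nvidia-container-toolkit-production
```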

## Installation

```bash
# Configure and install the service
wild-cluster-services-configure nvidia-device-plugin
wild-cluster-install nvidia-device-plugin
```

## Verification

After installation, verify the plugin is working:

```bash
# Check plugin pods are running
kubectl get pods -n kube-system | grep nvidia

# Verify GPU resources are advertised
kubectl get nodes -o json | jq '.items[].status.capacity | select(has("nvidia.com/gpu"))'

# Check GPU node labels
kubectl get nodes --show-labels | grep nvidia
```

## Usage in Applications

Once installed, applications can request GPU resources:

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-app
spec:
  template:
    spec:
      containers:
        - name: app
          image: nvidia/cuda:latest
          resources:
            requests:
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1
```
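
This service also creates a `nvidia` RuntimeClass (see `runtimeclass.yaml` below). If the NVIDIA runtime is not the default on your nodes, a workload may additionally need to opt into it; a minimal sketch of the relevant pod spec fragment:

```yaml
# Fragment only; assumes the "nvidia" RuntimeClass created by this service,
# and that nvidia-container-runtime is not the node's default runtime.
spec:
  template:
    spec:
      runtimeClassName: nvidia
```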

## Troubleshooting

### Plugin Not Starting

- Verify NVIDIA drivers are installed on worker nodes
- Check that nvidia-container-toolkit is properly configured
- Ensure worker nodes are not tainted in a way that prevents scheduling

### No GPU Resources Advertised

- Check plugin logs: `kubectl logs -n kube-system -l name=nvidia-device-plugin-ds`
- Verify the NVIDIA runtime is the default container runtime
- Ensure GPUs are detected by the driver: check node logs for GPU detection messages

## Configuration

The plugin uses the following configuration:

- **Image**: `nvcr.io/nvidia/k8s-device-plugin:v0.17.1`
- **Namespace**: `kube-system`
- **Priority Class**: `system-node-critical`
- **Tolerations**: Schedules on nodes with the `nvidia.com/gpu` taint

## References

- [Official NVIDIA Device Plugin Repository](https://github.com/NVIDIA/k8s-device-plugin)
- [Kubernetes GPU Scheduling Documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/)
- [NVIDIA Container Toolkit Documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/)
66
setup/cluster-services/nvidia-device-plugin/install.sh
Executable file
@@ -0,0 +1,66 @@
#!/bin/bash
set -e
set -o pipefail

# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
    echo "❌ ERROR: WILD_INSTANCE is not set"
    exit 1
fi

# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
    echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
    exit 1
fi

# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
    echo "❌ ERROR: KUBECONFIG is not set"
    exit 1
fi

INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
NVIDIA_PLUGIN_DIR="${CLUSTER_SETUP_DIR}/nvidia-device-plugin"

echo "🎮 === Setting up NVIDIA Device Plugin ==="
echo ""

# The device plugin only runs on worker nodes, so require at least one
echo "🔍 Checking for worker nodes in the cluster..."
WORKER_NODES=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o name | wc -l)
if [ "$WORKER_NODES" -eq 0 ]; then
    echo "❌ ERROR: No worker nodes found in cluster. NVIDIA Device Plugin requires worker nodes."
    exit 1
fi

echo "✅ Found $WORKER_NODES worker node(s)"
echo ""

# Templates should already be compiled
echo "📦 Using pre-compiled NVIDIA Device Plugin templates..."
if [ ! -d "${NVIDIA_PLUGIN_DIR}/kustomize" ]; then
    echo "❌ ERROR: Compiled templates not found at ${NVIDIA_PLUGIN_DIR}/kustomize"
    echo "Templates should be compiled before deployment."
    exit 1
fi

echo "🚀 Deploying NVIDIA Device Plugin..."
kubectl apply -k "${NVIDIA_PLUGIN_DIR}/kustomize"

echo "⏳ Waiting for NVIDIA Device Plugin DaemonSet to be ready..."
kubectl rollout status daemonset/nvidia-device-plugin-daemonset -n kube-system --timeout=120s

echo ""
echo "✅ NVIDIA Device Plugin installed successfully"
echo ""
echo "💡 To verify the installation:"
echo "   kubectl get pods -n kube-system | grep nvidia"
echo "   kubectl get nodes -o json | jq '.items[].status.capacity | select(has(\"nvidia.com/gpu\"))'"
echo ""
echo "🎮 GPU nodes should now be labeled with GPU product information:"
echo "   kubectl get nodes --show-labels | grep nvidia"
echo ""
@@ -0,0 +1,91 @@
# NVIDIA Device Plugin DaemonSet
# Based on official manifest from: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.1/deployments/static/nvidia-device-plugin.yml
# Licensed under the Apache License, Version 2.0

apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
  labels:
    app.kubernetes.io/name: nvidia-device-plugin
    app.kubernetes.io/component: device-plugin
    managedBy: kustomize
    partOf: wild-cloud
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
        app.kubernetes.io/name: nvidia-device-plugin
        app.kubernetes.io/component: device-plugin
    spec:
      runtimeClassName: nvidia
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
        - key: CriticalAddonsOnly
          operator: Exists
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: feature.node.kubernetes.io/pci-0300_10de.present
                    operator: In
                    values:
                      - "true"
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      securityContext:
        seccompProfile:
          type: RuntimeDefault
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1
          name: nvidia-device-plugin-ctr
          env:
            - name: MPS_ROOT
              value: /run/nvidia/mps
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            - name: FAIL_ON_INIT_ERROR
              value: "false"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: mps-shm
              mountPath: /dev/shm
            - name: mps-root
              mountPath: /mps
            - name: cdi-root
              mountPath: /var/run/cdi
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: mps-root
          hostPath:
            path: /run/nvidia/mps
            type: DirectoryOrCreate
        - name: mps-shm
          hostPath:
            path: /run/nvidia/mps/shm
        - name: cdi-root
          hostPath:
            path: /var/run/cdi
            type: DirectoryOrCreate
@@ -0,0 +1,12 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kube-system
resources:
  - daemonset.yaml
  - runtimeclass.yaml
labels:
  - pairs:
      app.kubernetes.io/name: nvidia-device-plugin
      app.kubernetes.io/component: device-plugin
      managedBy: kustomize
      partOf: wild-cloud
@@ -0,0 +1,5 @@
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
@@ -0,0 +1,7 @@
name: nvidia-device-plugin
description: NVIDIA device plugin for Kubernetes
namespace: nvidia-device-plugin
category: infrastructure

dependencies:
  - node-feature-discovery
51
setup/cluster-services/smtp/README.md
Normal file
@@ -0,0 +1,51 @@
# SMTP Configuration Service

This service configures SMTP settings for Wild Cloud applications to send transactional emails.

## Overview

The SMTP service doesn't deploy any Kubernetes resources. Instead, it helps configure global SMTP settings that can be used by Wild Cloud applications like Ghost, Gitea, and others for sending:

- Password reset emails
- User invitation emails
- Notification emails
- Other transactional emails

## Installation

```bash
./setup/cluster-services/smtp/install.sh
```

## Configuration

The setup script will prompt for the following values (a sketch of the resulting configuration follows the list):

- **SMTP Host**: Your email provider's SMTP server (e.g., `email-smtp.us-east-2.amazonaws.com` for AWS SES)
- **SMTP Port**: Usually `465` for SSL or `587` for STARTTLS
- **SMTP User**: Username or access key for authentication
- **From Address**: Default sender email address
- **SMTP Password**: Your password, secret key, or API key (entered securely)
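
Assuming the `serviceConfig` paths declared in `wild-manifest.yaml` below, the stored configuration might end up looking like this sketch (all values are placeholders):

```yaml
# Illustrative configuration fragment; keys mirror the serviceConfig paths
# in wild-manifest.yaml, values are placeholders.
cloud:
  smtp:
    host: email-smtp.us-east-2.amazonaws.com
    port: "465"
    user: AKIAEXAMPLE
    from: no-reply@example.com
    tls: "true"
    startTls: "false"
```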

## Supported Providers

- **AWS SES**: Use your Access Key ID as the user and your Secret Access Key as the password
- **Gmail/Google Workspace**: Use your email as the user and an App Password as the password
- **SendGrid**: Use `apikey` as the user and your API key as the password
- **Mailgun**: Use your Mailgun username and password
- **Other SMTP providers**: Use your standard SMTP credentials

## Applications That Use SMTP

- **Ghost**: User management, password resets, notifications
- **Gitea**: User registration, password resets, notifications
- **OpenProject**: User invitations, notifications
- **Future applications**: Any app that needs to send emails

## Testing

After configuration, test SMTP by the following steps (a direct connectivity check is sketched after this list):

1. Deploying an application that uses email (like Ghost)
2. Using password reset or user invitation features
3. Checking application logs for SMTP connection issues
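
For a lower-level check, you can also talk to the SMTP endpoint directly from any machine; the host below is a placeholder:

```bash
# Manual connectivity check against an SMTPS endpoint (placeholder host).
# A working connection prints a "220" greeting from the server.
openssl s_client -connect email-smtp.us-east-2.amazonaws.com:465 -quiet
```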
36
setup/cluster-services/smtp/wild-manifest.yaml
Normal file
@@ -0,0 +1,36 @@
name: smtp
description: SMTP relay service for cluster applications
namespace: smtp-system
category: infrastructure

serviceConfig:
  smtpHost:
    path: cloud.smtp.host
    prompt: "Enter SMTP host (e.g., email-smtp.us-east-2.amazonaws.com for AWS SES)"
    default: ""
    type: string
  smtpPort:
    path: cloud.smtp.port
    prompt: "Enter SMTP port (usually 465 for SSL, 587 for STARTTLS)"
    default: "465"
    type: string
  smtpUser:
    path: cloud.smtp.user
    prompt: "Enter SMTP username/access key"
    default: ""
    type: string
  smtpFrom:
    path: cloud.smtp.from
    prompt: "Enter default 'from' email address"
    default: "no-reply@{{ .cloud.domain }}"
    type: string
  smtpTls:
    path: cloud.smtp.tls
    prompt: "Enable TLS? (true/false)"
    default: "true"
    type: string
  smtpStartTls:
    path: cloud.smtp.startTls
    prompt: "Enable STARTTLS? (true/false)"
    default: "true"
    type: string
31
setup/cluster-services/traefik/README.md
Normal file
@@ -0,0 +1,31 @@
# Traefik

- https://doc.traefik.io/traefik/providers/kubernetes-ingress/

Ingress resources can be created for any service. The routes specified in the Ingress are added automatically to the Traefik proxy.

Traefik serves all incoming network traffic on ports 80 and 443 to the appropriate services based on the route.
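
For example, a minimal Ingress routing a host to a service might look like this (name and host are placeholders):

```yaml
# Illustrative Ingress; Traefik picks this up via its Kubernetes Ingress provider.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: example
spec:
  ingressClassName: traefik
  rules:
    - host: app.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: example
                port:
                  number: 80
```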

## Notes

These kustomize templates were created with:

```bash
helm-chart-to-kustomize traefik/traefik traefik traefik values.yaml
```

With values.yaml being:

```yaml
ingressRoute:
  dashboard:
    enabled: true
    matchRule: Host(`dashboard.localhost`)
    entryPoints:
      - web
providers:
  kubernetesGateway:
    enabled: true
gateway:
  namespacePolicy: All
```
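
With these values the dashboard is matched on ``Host(`dashboard.localhost`)`` via the `web` entry point. One way to reach it, assuming that hostname resolves (or is forced via the Host header) to Traefik's LoadBalancer IP:

```bash
# <traefik-ip> is a placeholder for the service's external IP; Traefik
# serves its built-in dashboard under the /dashboard/ path.
curl -H "Host: dashboard.localhost" http://<traefik-ip>/dashboard/
```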
72
setup/cluster-services/traefik/install.sh
Executable file
@@ -0,0 +1,72 @@
#!/bin/bash
set -e
set -o pipefail

# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
    echo "❌ ERROR: WILD_INSTANCE is not set"
    exit 1
fi

# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
    echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
    exit 1
fi

# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
    echo "❌ ERROR: KUBECONFIG is not set"
    exit 1
fi

INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
TRAEFIK_DIR="${CLUSTER_SETUP_DIR}/traefik"

echo "🌐 === Setting up Traefik Ingress Controller ==="
echo ""

# Check MetalLB dependency
echo "🔍 Verifying MetalLB is ready (required for Traefik LoadBalancer service)..."
kubectl wait --for=condition=Ready pod -l component=controller -n metallb-system --timeout=60s 2>/dev/null || {
    echo "⚠️ MetalLB controller not ready, but continuing with Traefik installation"
    echo "💡 Note: Traefik LoadBalancer service may not get an external IP without MetalLB"
}

# Install required CRDs first
echo "📦 Installing Gateway API CRDs..."
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.0.0/standard-install.yaml

echo "📦 Installing Traefik CRDs..."
kubectl apply -f https://raw.githubusercontent.com/traefik/traefik/v3.4/docs/content/reference/dynamic-configuration/kubernetes-crd-definition-v1.yml

echo "⏳ Waiting for CRDs to be established..."
kubectl wait --for condition=established crd/gateways.gateway.networking.k8s.io --timeout=60s
kubectl wait --for condition=established crd/gatewayclasses.gateway.networking.k8s.io --timeout=60s
kubectl wait --for condition=established crd/ingressroutes.traefik.io --timeout=60s
kubectl wait --for condition=established crd/middlewares.traefik.io --timeout=60s

# Templates should already be compiled
echo "📦 Using pre-compiled Traefik templates..."
if [ ! -d "${TRAEFIK_DIR}/kustomize" ]; then
    echo "❌ ERROR: Compiled templates not found at ${TRAEFIK_DIR}/kustomize"
    echo "Templates should be compiled before deployment."
    exit 1
fi

# Apply Traefik using kustomize
echo "🚀 Deploying Traefik..."
kubectl apply -k "${TRAEFIK_DIR}/kustomize"

# Wait for Traefik to be ready
echo "⏳ Waiting for Traefik to be ready..."
kubectl wait --for=condition=Available deployment/traefik -n traefik --timeout=120s

echo ""
echo "✅ Traefik installed successfully"
echo ""
echo "💡 To verify the installation:"
echo "   kubectl get pods -n traefik"
echo "   kubectl get svc -n traefik"
echo ""
@@ -0,0 +1,13 @@
# Traefik v3 middleware (traefik.io API group; ipAllowList replaces the v2 ipWhiteList)
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  name: internal-only
  namespace: kube-system
spec:
  ipAllowList:
    # Restrict to local private network ranges - adjust these to match your network
    sourceRange:
      - 127.0.0.1/32 # localhost
      - 10.0.0.0/8 # Private network
      - 172.16.0.0/12 # Private network
      - 192.168.0.0/16 # Private network
@@ -0,0 +1,13 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - templates/deployment.yaml
  - templates/gatewayclass.yaml
  - templates/gateway.yaml
  - templates/ingressclass.yaml
  - templates/ingressroute.yaml
  - templates/rbac/clusterrolebinding.yaml
  - templates/rbac/clusterrole.yaml
  - templates/rbac/serviceaccount.yaml
  - templates/service.yaml
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: traefik
@@ -0,0 +1,130 @@
---
# Source: traefik/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: traefik
  namespace: traefik
  labels:
    app.kubernetes.io/name: traefik
    app.kubernetes.io/instance: traefik-traefik
    helm.sh/chart: traefik-36.1.0
    app.kubernetes.io/managed-by: Helm
  annotations:
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: traefik
      app.kubernetes.io/instance: traefik-traefik
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 0
      maxSurge: 1
  minReadySeconds: 0
  template:
    metadata:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/path: "/metrics"
        prometheus.io/port: "9100"
      labels:
        app.kubernetes.io/name: traefik
        app.kubernetes.io/instance: traefik-traefik
        helm.sh/chart: traefik-36.1.0
        app.kubernetes.io/managed-by: Helm
    spec:
      serviceAccountName: traefik
      automountServiceAccountToken: true
      terminationGracePeriodSeconds: 60
      hostNetwork: false
      containers:
        - image: docker.io/traefik:v3.4.1
          imagePullPolicy: IfNotPresent
          name: traefik
          resources:
          readinessProbe:
            httpGet:
              path: /ping
              port: 8080
              scheme: HTTP
            failureThreshold: 1
            initialDelaySeconds: 2
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 2
          livenessProbe:
            httpGet:
              path: /ping
              port: 8080
              scheme: HTTP
            failureThreshold: 3
            initialDelaySeconds: 2
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 2
          lifecycle:
          ports:
            - name: metrics
              containerPort: 9100
              protocol: TCP
            - name: traefik
              containerPort: 8080
              protocol: TCP
            - name: web
              containerPort: 8000
              protocol: TCP
            - name: websecure
              containerPort: 8443
              protocol: TCP
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: true
          volumeMounts:
            - name: data
              mountPath: /data
            - name: tmp
              mountPath: /tmp
          args:
            - "--global.checkNewVersion"
            - "--entryPoints.metrics.address=:9100/tcp"
            - "--entryPoints.traefik.address=:8080/tcp"
            - "--entryPoints.web.address=:8000/tcp"
            - "--entryPoints.websecure.address=:8443/tcp"
            - "--api.dashboard=true"
            - "--ping=true"
            - "--metrics.prometheus=true"
            - "--metrics.prometheus.entrypoint=metrics"
            - "--providers.kubernetescrd"
            - "--providers.kubernetescrd.allowEmptyServices=true"
            - "--providers.kubernetesingress"
            - "--providers.kubernetesingress.allowEmptyServices=true"
            - "--providers.kubernetesingress.ingressendpoint.publishedservice=traefik/traefik"
            - "--providers.kubernetesgateway"
            - "--providers.kubernetesgateway.statusaddress.service.name=traefik"
            - "--providers.kubernetesgateway.statusaddress.service.namespace=traefik"
            - "--entryPoints.websecure.http.tls=true"
            - "--log.level=INFO"

          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
      volumes:
        - name: data
          emptyDir: {}
        - name: tmp
          emptyDir: {}
      securityContext:
        runAsGroup: 65532
        runAsNonRoot: true
        runAsUser: 65532
@@ -0,0 +1,18 @@
---
# Source: traefik/templates/gateway.yaml
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
  name: traefik-gateway
  namespace: traefik
  labels:
    app.kubernetes.io/name: traefik
    app.kubernetes.io/instance: traefik-traefik
    helm.sh/chart: traefik-36.1.0
    app.kubernetes.io/managed-by: Helm
spec:
  gatewayClassName: traefik
  listeners:
    - name: web
      port: 8000
      protocol: HTTP
@@ -0,0 +1,13 @@
---
# Source: traefik/templates/gatewayclass.yaml
apiVersion: gateway.networking.k8s.io/v1
kind: GatewayClass
metadata:
  name: traefik
  labels:
    app.kubernetes.io/name: traefik
    app.kubernetes.io/instance: traefik-traefik
    helm.sh/chart: traefik-36.1.0
    app.kubernetes.io/managed-by: Helm
spec:
  controllerName: traefik.io/gateway-controller
@@ -0,0 +1,15 @@
---
# Source: traefik/templates/ingressclass.yaml
apiVersion: networking.k8s.io/v1
kind: IngressClass
metadata:
  annotations:
    ingressclass.kubernetes.io/is-default-class: "true"
  labels:
    app.kubernetes.io/name: traefik
    app.kubernetes.io/instance: traefik-traefik
    helm.sh/chart: traefik-36.1.0
    app.kubernetes.io/managed-by: Helm
  name: traefik
spec:
  controller: traefik.io/ingress-controller
@@ -0,0 +1,21 @@
---
# Source: traefik/templates/ingressroute.yaml
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: traefik-dashboard
  namespace: traefik
  labels:
    app.kubernetes.io/name: traefik
    app.kubernetes.io/instance: traefik-traefik
    helm.sh/chart: traefik-36.1.0
    app.kubernetes.io/managed-by: Helm
spec:
  entryPoints:
    - web
  routes:
    - match: Host(`dashboard.localhost`)
      kind: Rule
      services:
        - kind: TraefikService
          name: api@internal
@@ -0,0 +1,108 @@
---
# Source: traefik/templates/rbac/clusterrole.yaml
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: traefik-traefik
  labels:
    app.kubernetes.io/name: traefik
    app.kubernetes.io/instance: traefik-traefik
    helm.sh/chart: traefik-36.1.0
    app.kubernetes.io/managed-by: Helm
rules:
  - apiGroups:
      - ""
    resources:
      - configmaps
      - nodes
      - services
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - discovery.k8s.io
    resources:
      - endpointslices
    verbs:
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - secrets
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - extensions
      - networking.k8s.io
    resources:
      - ingressclasses
      - ingresses
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - extensions
      - networking.k8s.io
    resources:
      - ingresses/status
    verbs:
      - update
  - apiGroups:
      - traefik.io
    resources:
      - ingressroutes
      - ingressroutetcps
      - ingressrouteudps
      - middlewares
      - middlewaretcps
      - serverstransports
      - serverstransporttcps
      - tlsoptions
      - tlsstores
      - traefikservices
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - namespaces
      - secrets
      - configmaps
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - gateway.networking.k8s.io
    resources:
      - backendtlspolicies
      - gatewayclasses
      - gateways
      - grpcroutes
      - httproutes
      - referencegrants
      - tcproutes
      - tlsroutes
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - gateway.networking.k8s.io
    resources:
      - backendtlspolicies/status
      - gatewayclasses/status
      - gateways/status
      - grpcroutes/status
      - httproutes/status
      - tcproutes/status
      - tlsroutes/status
    verbs:
      - update
@@ -0,0 +1,19 @@
---
# Source: traefik/templates/rbac/clusterrolebinding.yaml
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: traefik-traefik
  labels:
    app.kubernetes.io/name: traefik
    app.kubernetes.io/instance: traefik-traefik
    helm.sh/chart: traefik-36.1.0
    app.kubernetes.io/managed-by: Helm
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: traefik-traefik
subjects:
  - kind: ServiceAccount
    name: traefik
    namespace: traefik
@@ -0,0 +1,14 @@
---
# Source: traefik/templates/rbac/serviceaccount.yaml
kind: ServiceAccount
apiVersion: v1
metadata:
  name: traefik
  namespace: traefik
  labels:
    app.kubernetes.io/name: traefik
    app.kubernetes.io/instance: traefik-traefik
    helm.sh/chart: traefik-36.1.0
    app.kubernetes.io/managed-by: Helm
  annotations:
automountServiceAccountToken: false
@@ -0,0 +1,27 @@
---
# Source: traefik/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: traefik
  namespace: traefik
  labels:
    app.kubernetes.io/name: traefik
    app.kubernetes.io/instance: traefik-traefik
    helm.sh/chart: traefik-36.1.0
    app.kubernetes.io/managed-by: Helm
  annotations:
spec:
  type: LoadBalancer
  selector:
    app.kubernetes.io/name: traefik
    app.kubernetes.io/instance: traefik-traefik
  ports:
    - port: 80
      name: web
      targetPort: web
      protocol: TCP
    - port: 443
      name: websecure
      targetPort: websecure
      protocol: TCP
@@ -0,0 +1,28 @@
---
# Traefik service configuration with static LoadBalancer IP
apiVersion: v1
kind: Service
metadata:
  name: traefik
  namespace: kube-system
  annotations:
    # Get a stable IP from MetalLB
    metallb.universe.tf/address-pool: production
    metallb.universe.tf/allow-shared-ip: traefik-lb
  labels:
    app.kubernetes.io/instance: traefik-kube-system
    app.kubernetes.io/name: traefik
spec:
  type: LoadBalancer
  loadBalancerIP: {{ .cluster.loadBalancerIp }}
  selector:
    app.kubernetes.io/instance: traefik-kube-system
    app.kubernetes.io/name: traefik
  ports:
    - name: web
      port: 80
      targetPort: web
    - name: websecure
      port: 443
      targetPort: websecure
  externalTrafficPolicy: Local
10
setup/cluster-services/traefik/wild-manifest.yaml
Normal file
@@ -0,0 +1,10 @@
name: traefik
description: Cloud-native reverse proxy and ingress controller
namespace: traefik
category: infrastructure

dependencies:
  - metallb

configReferences:
  - cluster.loadBalancerIp
0
setup/cluster-services/utils/README.md
Normal file
44
setup/cluster-services/utils/install.sh
Executable file
@@ -0,0 +1,44 @@
#!/bin/bash
set -e
set -o pipefail

# Ensure WILD_INSTANCE is set
if [ -z "${WILD_INSTANCE}" ]; then
    echo "❌ ERROR: WILD_INSTANCE is not set"
    exit 1
fi

# Ensure WILD_CENTRAL_DATA is set
if [ -z "${WILD_CENTRAL_DATA}" ]; then
    echo "❌ ERROR: WILD_CENTRAL_DATA is not set"
    exit 1
fi

# Ensure KUBECONFIG is set
if [ -z "${KUBECONFIG}" ]; then
    echo "❌ ERROR: KUBECONFIG is not set"
    exit 1
fi

INSTANCE_DIR="${WILD_CENTRAL_DATA}/instances/${WILD_INSTANCE}"
CLUSTER_SETUP_DIR="${INSTANCE_DIR}/setup/cluster-services"
UTILS_DIR="${CLUSTER_SETUP_DIR}/utils"

echo "🔧 === Setting up Cluster Utilities ==="
echo ""

# Templates should already be compiled
echo "📦 Using pre-compiled utils templates..."
if [ ! -d "${UTILS_DIR}/kustomize" ]; then
    echo "❌ ERROR: Compiled templates not found at ${UTILS_DIR}/kustomize"
    echo "Templates should be compiled before deployment."
    exit 1
fi

echo "🚀 Applying utility manifests..."
kubectl apply -f "${UTILS_DIR}/kustomize/"

echo ""
echo "✅ Cluster utilities installed successfully"
echo ""
echo "💡 Utility resources have been deployed to the cluster"
@@ -0,0 +1,71 @@
---
apiVersion: v1
kind: Namespace
metadata:
  name: debug
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: netdebug
  namespace: debug
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: netdebug
subjects:
  - kind: ServiceAccount
    name: netdebug
    namespace: debug
roleRef:
  kind: ClusterRole
  name: cluster-admin
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: netdebug
  namespace: debug
  labels:
    app: netdebug
spec:
  replicas: 1
  selector:
    matchLabels:
      app: netdebug
  template:
    metadata:
      labels:
        app: netdebug
    spec:
      serviceAccountName: netdebug
      containers:
        - name: netdebug
          image: nicolaka/netshoot:latest
          command: ["/bin/bash"]
          args: ["-c", "while true; do sleep 3600; done"]
          resources:
            limits:
              cpu: 200m
              memory: 256Mi
            requests:
              cpu: 100m
              memory: 128Mi
          securityContext:
            privileged: true
---
apiVersion: v1
kind: Service
metadata:
  name: netdebug
  namespace: debug
spec:
  selector:
    app: netdebug
  ports:
    - port: 22
      targetPort: 22
      name: ssh
  type: ClusterIP
4
setup/cluster-services/utils/wild-manifest.yaml
Normal file
@@ -0,0 +1,4 @@
name: utils
description: Utility tools and scripts for cluster administration
namespace: utils-system
category: infrastructure
1
setup/dnsmasq/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
setup-bundle/
Some files were not shown because too many files have changed in this diff.