Adds reliability checks to cert-manager, externaldns, and traefik setup.

This commit is contained in:
2025-09-28 15:27:57 -07:00
parent 4d04b54f64
commit fb76eb5b47
5 changed files with 98 additions and 23 deletions

View File

@@ -7,4 +7,5 @@ domain=$(wild-config "cloud.domain")
prompt_if_unset_config "cloud.internalDomain" "Enter internal domain name" "local.${domain}" prompt_if_unset_config "cloud.internalDomain" "Enter internal domain name" "local.${domain}"
prompt_if_unset_config "operator.email" "Enter operator email address (for Let's Encrypt)" "" prompt_if_unset_config "operator.email" "Enter operator email address (for Let's Encrypt)" ""
prompt_if_unset_config "cluster.certManager.cloudflare.domain" "Enter Cloudflare domain (for DNS challenges)" "${domain}" prompt_if_unset_config "cluster.certManager.cloudflare.domain" "Enter Cloudflare domain (for DNS challenges)" "${domain}"
prompt_if_unset_config "cluster.certManager.cloudflare.zoneID" "Enter Cloudflare zone ID (for DNS challenges - improves reliability)" ""
prompt_if_unset_secret "cloudflare.token" "Enter Cloudflare API token (for DNS challenges)" "" prompt_if_unset_secret "cloudflare.token" "Enter Cloudflare API token (for DNS challenges)" ""

View File

@@ -16,42 +16,88 @@ CERT_MANAGER_DIR="${CLUSTER_SETUP_DIR}/cert-manager"
print_header "Setting up cert-manager" print_header "Setting up cert-manager"
# Templates should already be compiled by wild-cluster-services-generate # Check Traefik dependency
echo "Using pre-compiled cert-manager templates..." print_info "Verifying Traefik is ready (required for cert-manager)..."
# Best-effort dependency probe: give the Traefik deployment up to 60s to
# report Available. kubectl's stderr is discarded on purpose; a failed wait
# only produces the warning below and the installation carries on.
if ! kubectl wait --for=condition=Available deployment/traefik -n traefik --timeout=60s 2>/dev/null; then
  print_warning "Traefik not ready, but continuing with cert-manager installation"
  print_info "Note: cert-manager may not work properly without Traefik"
fi
# Templates should already be compiled by wild-cluster-services-configure
print_info "Using pre-compiled cert-manager templates..."
if [ ! -d "${CERT_MANAGER_DIR}/kustomize" ]; then if [ ! -d "${CERT_MANAGER_DIR}/kustomize" ]; then
echo "Error: Compiled templates not found. Run 'wild-cluster-services-generate' first." print_error "Compiled templates not found. Run 'wild-cluster-services-configure' first."
exit 1 exit 1
fi fi
echo "Setting up cert-manager..." print_info "Setting up cert-manager..."
# Install cert-manager using the official installation method # Install cert-manager using the official installation method
# This installs CRDs, controllers, and webhook components # This installs CRDs, controllers, and webhook components
echo "Installing cert-manager components..." print_info "Installing cert-manager components..."
# Using stable URL for cert-manager installation # Using stable URL for cert-manager installation
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml || \ kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml || \
kubectl apply -f https://github.com/jetstack/cert-manager/releases/download/v1.13.1/cert-manager.yaml kubectl apply -f https://github.com/jetstack/cert-manager/releases/download/v1.13.1/cert-manager.yaml
# Wait for cert-manager to be ready # Wait for cert-manager to be ready
echo "Waiting for cert-manager to be ready..." print_info "Waiting for cert-manager to be ready..."
kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=120s kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-cainjector -n cert-manager --timeout=120s kubectl wait --for=condition=Available deployment/cert-manager-cainjector -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s
# Add delay to allow webhook to be fully ready # Ensure webhook is fully operational
echo "Waiting additional time for cert-manager webhook to be fully operational..." print_info "Verifying cert-manager webhook is fully operational..."
sleep 30 until kubectl get validatingwebhookconfigurations cert-manager-webhook &>/dev/null; do
print_info "Waiting for cert-manager webhook to register..."
sleep 5
done
# Test webhook connectivity before proceeding
print_info "Testing webhook connectivity..."
kubectl auth can-i create certificates.cert-manager.io --as=system:serviceaccount:cert-manager:cert-manager
# Setup Cloudflare API token for DNS01 challenges # Setup Cloudflare API token for DNS01 challenges
echo "Creating Cloudflare API token secret..." print_info "Creating Cloudflare API token secret..."
CLOUDFLARE_API_TOKEN=$(wild-secret cloudflare.token) || exit 1 CLOUDFLARE_API_TOKEN=$(wild-secret cloudflare.token) || exit 1
# Validate Cloudflare API token permissions
print_info "Validating Cloudflare API token permissions..."
#######################################
# Check that a Cloudflare API token is accepted by the zones listing endpoint.
# Arguments:
#   $1 - Cloudflare API token to test
# Outputs:
#   Progress and diagnostics through the print_* helpers.
# Returns:
#   0 if the API response reports success, or if curl is not installed
#     (validation is skipped in that case);
#   1 if the request fails or the response does not report success.
#######################################
validate_cloudflare_token() {
  local token="${1:-}"

  if ! command -v curl &>/dev/null; then
    print_warning "curl not available, skipping token validation"
    return 0
  fi

  print_info "Testing Cloudflare API token..."
  local response
  # --max-time bounds the whole request so a stalled connection cannot hang
  # the setup; on any curl failure $response stays empty and the check below
  # falls through to the error path.
  response=$(curl -s --max-time 30 -H "Authorization: Bearer $token" \
    "https://api.cloudflare.com/client/v4/zones")

  # Tolerate optional whitespace around the colon so that both
  # '"success":true' and '"success": true' are recognised.
  if grep -Eq '"success"[[:space:]]*:[[:space:]]*true' <<<"$response"; then
    print_success "Cloudflare API token is valid and has zone access"
    return 0
  fi

  print_error "Cloudflare token validation failed"
  print_info "Response: $response"
  print_info "Please ensure your token has Zone - Zone - Read permission"
  return 1
}
# Abort the setup when the Cloudflare token does not pass validation;
# the messages list the permissions the token is expected to carry.
if ! validate_cloudflare_token "$CLOUDFLARE_API_TOKEN"; then
  print_error "Cloudflare token validation failed. Please check token permissions."
  print_info "Required permissions: Zone - Zone - Read, Zone - DNS - Edit"
  exit 1
fi
kubectl create secret generic cloudflare-api-token \ kubectl create secret generic cloudflare-api-token \
--namespace cert-manager \ --namespace cert-manager \
--from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \ --from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \
--dry-run=client -o yaml | kubectl apply -f - --dry-run=client -o yaml | kubectl apply -f -
# Configure cert-manager to use external DNS for challenge verification # Configure cert-manager to use external DNS for challenge verification
echo "Configuring cert-manager to use external DNS servers..." print_info "Configuring cert-manager to use external DNS servers..."
kubectl patch deployment cert-manager -n cert-manager --patch ' kubectl patch deployment cert-manager -n cert-manager --patch '
spec: spec:
template: template:
@@ -70,26 +116,39 @@ spec:
value: "5"' value: "5"'
# Wait for cert-manager to restart with new DNS config # Wait for cert-manager to restart with new DNS config
echo "Waiting for cert-manager to restart with new DNS configuration..." print_info "Waiting for cert-manager to restart with new DNS configuration..."
kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
# Apply Let's Encrypt issuers and certificates using kustomize # Apply Let's Encrypt issuers and certificates using kustomize
echo "Creating Let's Encrypt issuers and certificates..." print_info "Creating Let's Encrypt issuers and certificates..."
kubectl apply -k ${CERT_MANAGER_DIR}/kustomize kubectl apply -k ${CERT_MANAGER_DIR}/kustomize
# Wait for issuers to be ready # Wait for issuers to be ready
echo "Waiting for Let's Encrypt issuers to be ready..." print_info "Waiting for Let's Encrypt issuers to be ready..."
sleep 10 kubectl wait --for=condition=Ready clusterissuer/letsencrypt-prod --timeout=60s || print_warning "Production issuer not ready, proceeding anyway..."
echo "Wildcard certificate creation initiated. This may take some time to complete depending on DNS propagation." kubectl wait --for=condition=Ready clusterissuer/letsencrypt-staging --timeout=60s || print_warning "Staging issuer not ready, proceeding anyway..."
# Validate DNS resolution using a temporary test pod
print_info "Validating DNS resolution for ACME challenges..."
domain=$(wild-config cluster.certManager.cloudflare.domain)
print_info "Testing DNS resolution for domain: $domain"
# Run a throwaway busybox pod that queries the domain's SOA record through
# 1.1.1.1; all pod output is discarded and only the exit status is used.
# An explicit if/else replaces `cmd && ok || warn`, so the warning is printed
# only when the lookup itself fails, never because print_success returned
# non-zero.
if kubectl run dns-test --image=busybox:1.35 --rm -i --restart=Never -n cert-manager -- \
  nslookup -type=SOA "$domain" 1.1.1.1 &>/dev/null; then
  print_success "DNS resolution working for ACME challenges"
else
  print_warning "DNS resolution issues may affect ACME challenges"
fi
print_info "Wildcard certificate creation initiated. This may take some time to complete depending on DNS propagation."
# Wait for the certificates to be issued (with a timeout) # Wait for the certificates to be issued (with a timeout)
echo "Waiting for wildcard certificates to be ready (this may take several minutes)..." print_info "Waiting for wildcard certificates to be ready (this may take several minutes)..."
kubectl wait --for=condition=Ready certificate wildcard-internal-wild-cloud -n cert-manager --timeout=300s || true kubectl wait --for=condition=Ready certificate wildcard-internal-wild-cloud -n cert-manager --timeout=300s || true
kubectl wait --for=condition=Ready certificate wildcard-wild-cloud -n cert-manager --timeout=300s || true kubectl wait --for=condition=Ready certificate wildcard-wild-cloud -n cert-manager --timeout=300s || true
echo "cert-manager setup complete!" print_success "cert-manager setup complete!"
echo "" echo ""
echo "To verify the installation:" print_info "To verify the installation:"
echo " kubectl get pods -n cert-manager" print_info " kubectl get pods -n cert-manager"
echo " kubectl get clusterissuers" print_info " kubectl get clusterissuers"
echo " kubectl get certificates -n cert-manager" print_info " kubectl get certificates -n cert-manager"

View File

@@ -16,6 +16,14 @@ EXTERNALDNS_DIR="${CLUSTER_SETUP_DIR}/externaldns"
print_header "Setting up ExternalDNS" print_header "Setting up ExternalDNS"
# Dependency probe for cert-manager (best effort, never fatal)
print_info "Verifying cert-manager is ready (required for ExternalDNS)..."
# Both the controller and the webhook deployment must report Available within
# 60s each; the second wait is skipped when the first one fails. kubectl's
# stderr is discarded, and any failure only produces the warning below.
if ! {
  kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=60s 2>/dev/null &&
    kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=60s 2>/dev/null
}; then
  print_warning "cert-manager not ready, but continuing with ExternalDNS installation"
  print_info "Note: ExternalDNS may not work properly without cert-manager"
fi
# Templates should already be compiled by wild-cluster-services-generate # Templates should already be compiled by wild-cluster-services-generate
echo "Using pre-compiled ExternalDNS templates..." echo "Using pre-compiled ExternalDNS templates..."
if [ ! -d "${EXTERNALDNS_DIR}/kustomize" ]; then if [ ! -d "${EXTERNALDNS_DIR}/kustomize" ]; then

View File

@@ -16,6 +16,13 @@ TRAEFIK_DIR="${CLUSTER_SETUP_DIR}/traefik"
print_header "Setting up Traefik ingress controller" print_header "Setting up Traefik ingress controller"
# Dependency probe for MetalLB (best effort, never fatal)
print_info "Verifying MetalLB is ready (required for Traefik LoadBalancer service)..."
# Wait up to 60s for pods labelled component=controller in metallb-system to
# become Ready; kubectl's stderr is discarded and a failed wait only warns.
if ! kubectl wait --for=condition=Ready pod -l component=controller -n metallb-system --timeout=60s 2>/dev/null; then
  print_warning "MetalLB controller not ready, but continuing with Traefik installation"
  print_info "Note: Traefik LoadBalancer service may not get external IP without MetalLB"
fi
# Install required CRDs first # Install required CRDs first
echo "Installing Gateway API CRDs..." echo "Installing Gateway API CRDs..."
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.0.0/standard-install.yaml kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.0.0/standard-install.yaml