Adds reliability checks to the cert-manager, ExternalDNS, and Traefik setup scripts.

This commit is contained in:
2025-09-28 15:27:57 -07:00
parent 4d04b54f64
commit fb76eb5b47
5 changed files with 98 additions and 23 deletions

View File

@@ -7,4 +7,5 @@ domain=$(wild-config "cloud.domain")
prompt_if_unset_config "cloud.internalDomain" "Enter internal domain name" "local.${domain}"
prompt_if_unset_config "operator.email" "Enter operator email address (for Let's Encrypt)" ""
prompt_if_unset_config "cluster.certManager.cloudflare.domain" "Enter Cloudflare domain (for DNS challenges)" "${domain}"
prompt_if_unset_config "cluster.certManager.cloudflare.zoneID" "Enter Cloudflare zone ID (for DNS challenges - improves reliability)" ""
prompt_if_unset_secret "cloudflare.token" "Enter Cloudflare API token (for DNS challenges)" ""

View File

@@ -16,42 +16,88 @@ CERT_MANAGER_DIR="${CLUSTER_SETUP_DIR}/cert-manager"
print_header "Setting up cert-manager"
# Templates should already be compiled by wild-cluster-services-generate
echo "Using pre-compiled cert-manager templates..."
# Traefik dependency probe (advisory only).
# Gives the traefik deployment up to 60s to report Available; when it does
# not, two notices are printed and the installation carries on.
print_info "Verifying Traefik is ready (required for cert-manager)..."
if ! kubectl wait \
  --for=condition=Available \
  deployment/traefik \
  -n traefik \
  --timeout=60s 2>/dev/null; then
  print_warning "Traefik not ready, but continuing with cert-manager installation"
  print_info "Note: cert-manager may not work properly without Traefik"
fi
# Templates should already be compiled by wild-cluster-services-configure
print_info "Using pre-compiled cert-manager templates..."
if [ ! -d "${CERT_MANAGER_DIR}/kustomize" ]; then
echo "Error: Compiled templates not found. Run 'wild-cluster-services-generate' first."
print_error "Compiled templates not found. Run 'wild-cluster-services-configure' first."
exit 1
fi
echo "Setting up cert-manager..."
print_info "Setting up cert-manager..."
# Install cert-manager using the official installation method
# Install cert-manager using the official installation method
# This installs CRDs, controllers, and webhook components
echo "Installing cert-manager components..."
print_info "Installing cert-manager components..."
# Using stable URL for cert-manager installation
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml || \
kubectl apply -f https://github.com/jetstack/cert-manager/releases/download/v1.13.1/cert-manager.yaml
# Wait for cert-manager to be ready
echo "Waiting for cert-manager to be ready..."
print_info "Waiting for cert-manager to be ready..."
kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-cainjector -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s
# Add delay to allow webhook to be fully ready
echo "Waiting additional time for cert-manager webhook to be fully operational..."
sleep 30
# Ensure webhook is fully operational.
# Poll until the cert-manager-webhook ValidatingWebhookConfiguration exists.
# The wait is bounded so that a webhook which never registers cannot hang the
# script forever; on timeout the script stops with an error instead.
print_info "Verifying cert-manager webhook is fully operational..."
webhook_wait_attempts=0
webhook_wait_max_attempts=60 # with a 5s pause between polls: roughly 5 minutes
until kubectl get validatingwebhookconfigurations cert-manager-webhook &>/dev/null; do
  webhook_wait_attempts=$((webhook_wait_attempts + 1))
  if [ "${webhook_wait_attempts}" -ge "${webhook_wait_max_attempts}" ]; then
    print_error "Timed out waiting for cert-manager webhook to register"
    exit 1
  fi
  print_info "Waiting for cert-manager webhook to register..."
  sleep 5
done
# Test webhook connectivity before proceeding
# NOTE(review): `kubectl auth can-i` asks the API server whether the
# impersonated cert-manager service account may create Certificate resources.
# Its exit status is not checked here, and it does not appear to contact the
# webhook itself -- confirm whether this step is meant to gate what follows.
print_info "Testing webhook connectivity..."
kubectl auth can-i create certificates.cert-manager.io --as=system:serviceaccount:cert-manager:cert-manager
# Setup Cloudflare API token for DNS01 challenges
echo "Creating Cloudflare API token secret..."
print_info "Creating Cloudflare API token secret..."
CLOUDFLARE_API_TOKEN=$(wild-secret cloudflare.token) || exit 1
# Validate Cloudflare API token permissions
print_info "Validating Cloudflare API token permissions..."
#######################################
# Check that a Cloudflare API token can list zones.
# Arguments:
#   $1 - Cloudflare API token to test
# Outputs:
#   Progress and diagnostics via print_info / print_warning / print_error /
#   print_success.
# Returns:
#   0 when the API reports success, or when curl is not installed (the check
#     is skipped with a warning);
#   1 when the token is empty, the request fails, or the API does not report
#     success.
#######################################
validate_cloudflare_token() {
  local token="$1"

  # An empty token can never validate; fail before making a network call.
  if [ -z "$token" ]; then
    print_error "Cloudflare token validation failed"
    print_info "No Cloudflare API token was provided"
    return 1
  fi

  if ! command -v curl &>/dev/null; then
    print_warning "curl not available, skipping token validation"
    return 0
  fi

  print_info "Testing Cloudflare API token..."
  local response
  # --max-time bounds the request so an unreachable API cannot stall setup.
  # A transport failure is reported separately from a rejected token so the
  # operator is not told to fix permissions when the network is the problem.
  if ! response=$(curl -s --max-time 30 -H "Authorization: Bearer $token" \
    "https://api.cloudflare.com/client/v4/zones"); then
    print_error "Cloudflare token validation failed"
    print_info "Could not reach the Cloudflare API (curl exited with an error)"
    return 1
  fi

  # Accept optional whitespace around the colon so that a pretty-printed
  # response is recognised as well as a compact one.
  if grep -Eq '"success"[[:space:]]*:[[:space:]]*true' <<<"$response"; then
    print_success "Cloudflare API token is valid and has zone access"
    return 0
  fi

  print_error "Cloudflare token validation failed"
  print_info "Response: $response"
  print_info "Please ensure your token has Zone - Zone - Read permission"
  return 1
}
# Stop the script when the token check reports failure, listing the
# permissions the token is expected to carry.
if ! validate_cloudflare_token "$CLOUDFLARE_API_TOKEN"; then
  print_error "Cloudflare token validation failed. Please check token permissions."
  print_info "Required permissions: Zone - Zone - Read, Zone - DNS - Edit"
  exit 1
fi
kubectl create secret generic cloudflare-api-token \
--namespace cert-manager \
--from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \
--dry-run=client -o yaml | kubectl apply -f -
# Configure cert-manager to use external DNS for challenge verification
echo "Configuring cert-manager to use external DNS servers..."
print_info "Configuring cert-manager to use external DNS servers..."
kubectl patch deployment cert-manager -n cert-manager --patch '
spec:
template:
@@ -70,26 +116,39 @@ spec:
value: "5"'
# Wait for cert-manager to restart with new DNS config
echo "Waiting for cert-manager to restart with new DNS configuration..."
print_info "Waiting for cert-manager to restart with new DNS configuration..."
kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
# Apply Let's Encrypt issuers and certificates using kustomize
echo "Creating Let's Encrypt issuers and certificates..."
print_info "Creating Let's Encrypt issuers and certificates..."
# The path is quoted so a CERT_MANAGER_DIR containing spaces or glob
# characters is still passed to kubectl as a single argument.
kubectl apply -k "${CERT_MANAGER_DIR}/kustomize"
# Wait for issuers to be ready
echo "Waiting for Let's Encrypt issuers to be ready..."
sleep 10
echo "Wildcard certificate creation initiated. This may take some time to complete depending on DNS propagation."
print_info "Waiting for Let's Encrypt issuers to be ready..."
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-prod --timeout=60s || print_warning "Production issuer not ready, proceeding anyway..."
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-staging --timeout=60s || print_warning "Staging issuer not ready, proceeding anyway..."
# Validate DNS resolution using temporary test pod.
# This is a best-effort check: a failed lookup only produces a warning.
print_info "Validating DNS resolution for ACME challenges..."
domain=$(wild-config cluster.certManager.cloudflare.domain)
print_info "Testing DNS resolution for domain: $domain"
# Create temporary pod with DNS utilities and query 1.1.1.1 for the domain's
# SOA record. An explicit if/else replaces `cmd && ok || warn`, so the warning
# is tied to the lookup result alone and cannot fire merely because
# print_success returned non-zero.
if kubectl run dns-test --image=busybox:1.35 --rm -i --restart=Never -n cert-manager -- \
  nslookup -type=SOA "$domain" 1.1.1.1 &>/dev/null; then
  print_success "DNS resolution working for ACME challenges"
else
  print_warning "DNS resolution issues may affect ACME challenges"
fi
print_info "Wildcard certificate creation initiated. This may take some time to complete depending on DNS propagation."
# Wait for the certificates to be issued (with a timeout)
echo "Waiting for wildcard certificates to be ready (this may take several minutes)..."
print_info "Waiting for wildcard certificates to be ready (this may take several minutes)..."
kubectl wait --for=condition=Ready certificate wildcard-internal-wild-cloud -n cert-manager --timeout=300s || true
kubectl wait --for=condition=Ready certificate wildcard-wild-cloud -n cert-manager --timeout=300s || true
echo "cert-manager setup complete!"
print_success "cert-manager setup complete!"
echo ""
echo "To verify the installation:"
echo " kubectl get pods -n cert-manager"
echo " kubectl get clusterissuers"
echo " kubectl get certificates -n cert-manager"
print_info "To verify the installation:"
print_info " kubectl get pods -n cert-manager"
print_info " kubectl get clusterissuers"
print_info " kubectl get certificates -n cert-manager"

View File

@@ -16,6 +16,14 @@ EXTERNALDNS_DIR="${CLUSTER_SETUP_DIR}/externaldns"
print_header "Setting up ExternalDNS"
# cert-manager dependency probe (advisory only).
# Waits up to 60s each for the cert-manager and cert-manager-webhook
# deployments to report Available; the second wait is skipped when the first
# fails. If either is not Available, two notices are printed and the
# installation carries on.
print_info "Verifying cert-manager is ready (required for ExternalDNS)..."
if ! {
  kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=60s 2>/dev/null &&
    kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=60s 2>/dev/null
}; then
  print_warning "cert-manager not ready, but continuing with ExternalDNS installation"
  print_info "Note: ExternalDNS may not work properly without cert-manager"
fi
# Templates should already be compiled by wild-cluster-services-generate
echo "Using pre-compiled ExternalDNS templates..."
if [ ! -d "${EXTERNALDNS_DIR}/kustomize" ]; then

View File

@@ -16,6 +16,13 @@ TRAEFIK_DIR="${CLUSTER_SETUP_DIR}/traefik"
print_header "Setting up Traefik ingress controller"
# MetalLB dependency probe (advisory only).
# Waits up to 60s for pods labelled component=controller in metallb-system to
# become Ready; when they do not, two notices are printed and the installation
# carries on.
print_info "Verifying MetalLB is ready (required for Traefik LoadBalancer service)..."
if ! kubectl wait \
  --for=condition=Ready pod \
  -l component=controller \
  -n metallb-system \
  --timeout=60s 2>/dev/null; then
  print_warning "MetalLB controller not ready, but continuing with Traefik installation"
  print_info "Note: Traefik LoadBalancer service may not get external IP without MetalLB"
fi
# Install required CRDs first
echo "Installing Gateway API CRDs..."
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.0.0/standard-install.yaml