From fb76eb5b47b26a38a5a6e0f5adb277ad1dade85b Mon Sep 17 00:00:00 2001 From: Paul Payne Date: Sun, 28 Sep 2025 15:27:57 -0700 Subject: [PATCH] Adds reliability checks to cert-manager, externaldns, and traefik setup. --- .../cert-manager/configure.sh | 1 + .../cluster-services/cert-manager/install.sh | 105 ++++++++++++++---- ...r.yaml => cert-manager-for-reference.yaml} | 0 setup/cluster-services/externaldns/install.sh | 8 ++ setup/cluster-services/traefik/install.sh | 7 ++ 5 files changed, 98 insertions(+), 23 deletions(-) rename setup/cluster-services/cert-manager/kustomize.template/{cert-manager.yaml => cert-manager-for-reference.yaml} (100%) diff --git a/setup/cluster-services/cert-manager/configure.sh b/setup/cluster-services/cert-manager/configure.sh index 5466ce9..b2fa8ac 100644 --- a/setup/cluster-services/cert-manager/configure.sh +++ b/setup/cluster-services/cert-manager/configure.sh @@ -7,4 +7,5 @@ domain=$(wild-config "cloud.domain") prompt_if_unset_config "cloud.internalDomain" "Enter internal domain name" "local.${domain}" prompt_if_unset_config "operator.email" "Enter operator email address (for Let's Encrypt)" "" prompt_if_unset_config "cluster.certManager.cloudflare.domain" "Enter Cloudflare domain (for DNS challenges)" "${domain}" +prompt_if_unset_config "cluster.certManager.cloudflare.zoneID" "Enter Cloudflare zone ID (for DNS challenges - improves reliability)" "" prompt_if_unset_secret "cloudflare.token" "Enter Cloudflare API token (for DNS challenges)" "" diff --git a/setup/cluster-services/cert-manager/install.sh b/setup/cluster-services/cert-manager/install.sh index 67e55b5..ba5310e 100755 --- a/setup/cluster-services/cert-manager/install.sh +++ b/setup/cluster-services/cert-manager/install.sh @@ -16,42 +16,88 @@ CERT_MANAGER_DIR="${CLUSTER_SETUP_DIR}/cert-manager" print_header "Setting up cert-manager" -# Templates should already be compiled by wild-cluster-services-generate -echo "Using pre-compiled cert-manager templates..." +# Check Traefik dependency +print_info "Verifying Traefik is ready (required for cert-manager)..." +kubectl wait --for=condition=Available deployment/traefik -n traefik --timeout=60s 2>/dev/null || { + print_warning "Traefik not ready, but continuing with cert-manager installation" + print_info "Note: cert-manager may not work properly without Traefik" +} + +# Templates should already be compiled by wild-cluster-services-configure +print_info "Using pre-compiled cert-manager templates..." if [ ! -d "${CERT_MANAGER_DIR}/kustomize" ]; then - echo "Error: Compiled templates not found. Run 'wild-cluster-services-generate' first." + print_error "Compiled templates not found. Run 'wild-cluster-services-configure' first." exit 1 fi -echo "Setting up cert-manager..." +print_info "Setting up cert-manager..." -# Install cert-manager using the official installation method +# Install cert-manager using the official installation method # This installs CRDs, controllers, and webhook components -echo "Installing cert-manager components..." +print_info "Installing cert-manager components..." # Using stable URL for cert-manager installation kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml || \ kubectl apply -f https://github.com/jetstack/cert-manager/releases/download/v1.13.1/cert-manager.yaml # Wait for cert-manager to be ready -echo "Waiting for cert-manager to be ready..." +print_info "Waiting for cert-manager to be ready..." kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=120s kubectl wait --for=condition=Available deployment/cert-manager-cainjector -n cert-manager --timeout=120s kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s -# Add delay to allow webhook to be fully ready -echo "Waiting additional time for cert-manager webhook to be fully operational..." -sleep 30 +# Ensure webhook is fully operational +print_info "Verifying cert-manager webhook is fully operational..." +until kubectl get validatingwebhookconfigurations cert-manager-webhook &>/dev/null; do + print_info "Waiting for cert-manager webhook to register..." + sleep 5 +done + +# Test webhook connectivity before proceeding +print_info "Testing webhook connectivity..." +kubectl auth can-i create certificates.cert-manager.io --as=system:serviceaccount:cert-manager:cert-manager + # Setup Cloudflare API token for DNS01 challenges -echo "Creating Cloudflare API token secret..." +print_info "Creating Cloudflare API token secret..." CLOUDFLARE_API_TOKEN=$(wild-secret cloudflare.token) || exit 1 + +# Validate Cloudflare API token permissions +print_info "Validating Cloudflare API token permissions..." +validate_cloudflare_token() { + local token="$1" + if ! command -v curl &>/dev/null; then + print_warning "curl not available, skipping token validation" + return 0 + fi + + print_info "Testing Cloudflare API token..." + local response + response=$(curl -s -H "Authorization: Bearer $token" \ + "https://api.cloudflare.com/client/v4/zones") + + if echo "$response" | grep -q '"success":true'; then + print_success "Cloudflare API token is valid and has zone access" + return 0 + else + print_error "Cloudflare token validation failed" + print_info "Response: $response" + print_info "Please ensure your token has Zone - Zone - Read permission" + return 1 + fi +} + +validate_cloudflare_token "$CLOUDFLARE_API_TOKEN" || { + print_error "Cloudflare token validation failed. Please check token permissions." + print_info "Required permissions: Zone - Zone - Read, Zone - DNS - Edit" + exit 1 +} kubectl create secret generic cloudflare-api-token \ --namespace cert-manager \ --from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \ --dry-run=client -o yaml | kubectl apply -f - # Configure cert-manager to use external DNS for challenge verification -echo "Configuring cert-manager to use external DNS servers..." +print_info "Configuring cert-manager to use external DNS servers..." kubectl patch deployment cert-manager -n cert-manager --patch ' spec: template: @@ -70,26 +116,39 @@ spec: value: "5"' # Wait for cert-manager to restart with new DNS config -echo "Waiting for cert-manager to restart with new DNS configuration..." +print_info "Waiting for cert-manager to restart with new DNS configuration..." kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s # Apply Let's Encrypt issuers and certificates using kustomize -echo "Creating Let's Encrypt issuers and certificates..." +print_info "Creating Let's Encrypt issuers and certificates..." kubectl apply -k ${CERT_MANAGER_DIR}/kustomize # Wait for issuers to be ready -echo "Waiting for Let's Encrypt issuers to be ready..." -sleep 10 -echo "Wildcard certificate creation initiated. This may take some time to complete depending on DNS propagation." +print_info "Waiting for Let's Encrypt issuers to be ready..." +kubectl wait --for=condition=Ready clusterissuer/letsencrypt-prod --timeout=60s || print_warning "Production issuer not ready, proceeding anyway..." +kubectl wait --for=condition=Ready clusterissuer/letsencrypt-staging --timeout=60s || print_warning "Staging issuer not ready, proceeding anyway..." + +# Validate DNS resolution using temporary test pod +print_info "Validating DNS resolution for ACME challenges..." +domain=$(wild-config cluster.certManager.cloudflare.domain) +print_info "Testing DNS resolution for domain: $domain" + +# Create temporary pod with DNS utilities +kubectl run dns-test --image=busybox:1.35 --rm -i --restart=Never -n cert-manager -- \ + nslookup -type=SOA "$domain" 1.1.1.1 &>/dev/null && \ + print_success "DNS resolution working for ACME challenges" || \ + print_warning "DNS resolution issues may affect ACME challenges" + +print_info "Wildcard certificate creation initiated. This may take some time to complete depending on DNS propagation." # Wait for the certificates to be issued (with a timeout) -echo "Waiting for wildcard certificates to be ready (this may take several minutes)..." +print_info "Waiting for wildcard certificates to be ready (this may take several minutes)..." kubectl wait --for=condition=Ready certificate wildcard-internal-wild-cloud -n cert-manager --timeout=300s || true kubectl wait --for=condition=Ready certificate wildcard-wild-cloud -n cert-manager --timeout=300s || true -echo "cert-manager setup complete!" +print_success "cert-manager setup complete!" echo "" -echo "To verify the installation:" -echo " kubectl get pods -n cert-manager" -echo " kubectl get clusterissuers" -echo " kubectl get certificates -n cert-manager" +print_info "To verify the installation:" +print_info " kubectl get pods -n cert-manager" +print_info " kubectl get clusterissuers" +print_info " kubectl get certificates -n cert-manager" diff --git a/setup/cluster-services/cert-manager/kustomize.template/cert-manager.yaml b/setup/cluster-services/cert-manager/kustomize.template/cert-manager-for-reference.yaml similarity index 100% rename from setup/cluster-services/cert-manager/kustomize.template/cert-manager.yaml rename to setup/cluster-services/cert-manager/kustomize.template/cert-manager-for-reference.yaml diff --git a/setup/cluster-services/externaldns/install.sh b/setup/cluster-services/externaldns/install.sh index cfbc500..98f961f 100755 --- a/setup/cluster-services/externaldns/install.sh +++ b/setup/cluster-services/externaldns/install.sh @@ -16,6 +16,14 @@ EXTERNALDNS_DIR="${CLUSTER_SETUP_DIR}/externaldns" print_header "Setting up ExternalDNS" +# Check cert-manager dependency +print_info "Verifying cert-manager is ready (required for ExternalDNS)..." +kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=60s 2>/dev/null && \ +kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=60s 2>/dev/null || { + print_warning "cert-manager not ready, but continuing with ExternalDNS installation" + print_info "Note: ExternalDNS may not work properly without cert-manager" +} + # Templates should already be compiled by wild-cluster-services-generate echo "Using pre-compiled ExternalDNS templates..." if [ ! -d "${EXTERNALDNS_DIR}/kustomize" ]; then diff --git a/setup/cluster-services/traefik/install.sh b/setup/cluster-services/traefik/install.sh index d0c774c..8ee3e93 100755 --- a/setup/cluster-services/traefik/install.sh +++ b/setup/cluster-services/traefik/install.sh @@ -16,6 +16,13 @@ TRAEFIK_DIR="${CLUSTER_SETUP_DIR}/traefik" print_header "Setting up Traefik ingress controller" +# Check MetalLB dependency +print_info "Verifying MetalLB is ready (required for Traefik LoadBalancer service)..." +kubectl wait --for=condition=Ready pod -l component=controller -n metallb-system --timeout=60s 2>/dev/null || { + print_warning "MetalLB controller not ready, but continuing with Traefik installation" + print_info "Note: Traefik LoadBalancer service may not get external IP without MetalLB" +} + # Install required CRDs first echo "Installing Gateway API CRDs..." kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.0.0/standard-install.yaml