From d06e27931c77b2dc8bf1c23f35b64b0580c1eb69 Mon Sep 17 00:00:00 2001 From: Paul Payne Date: Sat, 4 Oct 2025 08:28:43 -0700 Subject: [PATCH] Cert-manager setup reliability. --- .../cert-manager/configure.sh | 7 +-- .../cluster-services/cert-manager/install.sh | 48 ++++++++++++------- .../letsencrypt-prod-dns01.yaml | 1 - .../letsencrypt-staging-dns01.yaml | 1 - 4 files changed, 34 insertions(+), 23 deletions(-) diff --git a/setup/cluster-services/cert-manager/configure.sh b/setup/cluster-services/cert-manager/configure.sh index b2fa8ac..c0e877a 100644 --- a/setup/cluster-services/cert-manager/configure.sh +++ b/setup/cluster-services/cert-manager/configure.sh @@ -4,8 +4,9 @@ print_info "Collecting cert-manager configuration..." prompt_if_unset_config "cloud.domain" "Enter main domain name" "example.com" domain=$(wild-config "cloud.domain") +baseDomain=$(wild-config "cloud.baseDomain") prompt_if_unset_config "cloud.internalDomain" "Enter internal domain name" "local.${domain}" prompt_if_unset_config "operator.email" "Enter operator email address (for Let's Encrypt)" "" -prompt_if_unset_config "cluster.certManager.cloudflare.domain" "Enter Cloudflare domain (for DNS challenges)" "${domain}" -prompt_if_unset_config "cluster.certManager.cloudflare.zoneID" "Enter Cloudflare zone ID (for DNS challenges - improves reliability)" "" -prompt_if_unset_secret "cloudflare.token" "Enter Cloudflare API token (for DNS challenges)" "" +prompt_if_unset_config "cluster.certManager.cloudflare.domain" "Enter Cloudflare domain" "${baseDomain}" +prompt_if_unset_config "cluster.certManager.cloudflare.zoneID" "Enter Cloudflare zone ID" "" +prompt_if_unset_secret "cloudflare.token" "Enter Cloudflare API token" "" diff --git a/setup/cluster-services/cert-manager/install.sh b/setup/cluster-services/cert-manager/install.sh index 738ff51..16e416c 100755 --- a/setup/cluster-services/cert-manager/install.sh +++ b/setup/cluster-services/cert-manager/install.sh @@ -37,8 +37,8 @@ print_info "Validating DNS resolution for ACME challenges..." domain=$(wild-config cluster.certManager.cloudflare.domain) print_info "Testing DNS resolution for domain: $domain" -# Create temporary pod with DNS utilities -kubectl run dns-test --image=busybox:1.35 --rm -i --restart=Never -n cert-manager -- \ +# Create temporary pod with DNS utilities (in default namespace since cert-manager doesn't exist yet) +kubectl run dns-test --image=busybox:1.35 --rm -i --restart=Never -- \ nslookup -type=SOA "$domain" 1.1.1.1 &>/dev/null && \ print_success "DNS resolution working for ACME challenges" || \ print_warning "DNS resolution issues may affect ACME challenges" @@ -87,12 +87,6 @@ validate_cloudflare_token "$CLOUDFLARE_API_TOKEN" || { exit 1 } -# Ensure token is in the cluster -kubectl create secret generic cloudflare-api-token \ - --namespace cert-manager \ - --from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \ - --dry-run=client -o yaml | kubectl apply -f - - ######################## # Kubernetes components ######################## @@ -108,6 +102,13 @@ kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager - kubectl wait --for=condition=Available deployment/cert-manager-cainjector -n cert-manager --timeout=120s kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s +# Now that cert-manager namespace exists, create the Cloudflare API token secret +print_info "Creating Cloudflare API token secret..." +kubectl create secret generic cloudflare-api-token \ + --namespace cert-manager \ + --from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \ + --dry-run=client -o yaml | kubectl apply -f - + # Ensure webhook is fully operational print_info "Verifying cert-manager webhook is fully operational..." until kubectl get validatingwebhookconfigurations cert-manager-webhook &>/dev/null; do @@ -265,7 +266,8 @@ fi # STEP 4: Check for DNS errors dns_errors=$(kubectl logs -n cert-manager deployment/cert-manager --tail=50 2>/dev/null | \ - grep -c "Could not route to /client/v4/zones/dns_records" || echo 0) + grep "Could not route to /client/v4/zones/dns_records" | wc -l | tr -d '\n' || echo "0") +dns_errors=${dns_errors:-0} if [ "$dns_errors" -gt 0 ]; then print_warning "Cert-manager has DNS record cleanup errors" @@ -292,10 +294,18 @@ fi print_info "Checking certificate expiration status..." current_date=$(date +%s) -# Track if any renewals were triggered -renewals_triggered=0 +# Track if we found any issues +found_expired=false +found_expiring_soon=false +all_certs_valid=true + +# Process certificates and collect their status +while IFS= read -r line; do + ns=$(echo "$line" | awk '{print $1}') + name=$(echo "$line" | awk '{print $2}') + secret=$(echo "$line" | awk '{print $3}') + expiry=$(echo "$line" | awk '{print $4}') -kubectl get certificates --all-namespaces -o json 2>/dev/null | jq -r '.items[] | "\(.metadata.namespace) \(.metadata.name) \(.spec.secretName) \(.status.notAfter // "unknown")"' | while read ns name secret expiry; do if [ "$expiry" != "unknown" ] && [ "$expiry" != "null" ] && [ "$expiry" != "" ]; then expiry_ts=$(date -d "$expiry" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$expiry" +%s 2>/dev/null || echo 0) if [ "$expiry_ts" -gt 0 ]; then @@ -303,19 +313,21 @@ kubectl get certificates --all-namespaces -o json 2>/dev/null | jq -r '.items[] if [ "$days_until_expiry" -lt 0 ]; then print_warning "Certificate $ns/$name has EXPIRED (expired ${days_until_expiry#-} days ago)" - if [ -n "$secret" ]; then + if [ -n "$secret" ] && [ "$secret" != "unknown" ] && [ "$secret" != "null" ]; then print_info "Deleting secret $secret to trigger renewal..." kubectl delete secret "$secret" -n "$ns" 2>/dev/null || true - renewals_triggered=$((renewals_triggered + 1)) + found_expired=true + all_certs_valid=false fi elif [ "$days_until_expiry" -lt 7 ]; then print_warning "Certificate $ns/$name expires in $days_until_expiry days" if [ "$days_until_expiry" -lt 3 ]; then # Force renewal for certificates expiring very soon - if [ -n "$secret" ]; then + if [ -n "$secret" ] && [ "$secret" != "unknown" ] && [ "$secret" != "null" ]; then print_info "Forcing renewal by deleting secret $secret..." kubectl delete secret "$secret" -n "$ns" 2>/dev/null || true - renewals_triggered=$((renewals_triggered + 1)) + found_expiring_soon=true + all_certs_valid=false fi else print_info "Will renew automatically when closer to expiry" @@ -330,9 +342,9 @@ kubectl get certificates --all-namespaces -o json 2>/dev/null | jq -r '.items[] # Certificate has no expiry (being issued) print_info "Certificate $ns/$name is currently being issued..." fi -done +done < <(kubectl get certificates --all-namespaces -o json 2>/dev/null | jq -r '.items[] | "\(.metadata.namespace) \(.metadata.name) \(.spec.secretName) \(.status.notAfter // "unknown")"') -if [ "$renewals_triggered" -eq 0 ]; then +if [ "$all_certs_valid" = true ]; then print_success "All certificates are valid - no renewals needed" fi diff --git a/setup/cluster-services/cert-manager/kustomize.template/letsencrypt-prod-dns01.yaml b/setup/cluster-services/cert-manager/kustomize.template/letsencrypt-prod-dns01.yaml index 2800c21..7d5c272 100644 --- a/setup/cluster-services/cert-manager/kustomize.template/letsencrypt-prod-dns01.yaml +++ b/setup/cluster-services/cert-manager/kustomize.template/letsencrypt-prod-dns01.yaml @@ -13,7 +13,6 @@ spec: # DNS-01 solver for wildcard certificates - dns01: cloudflare: - email: {{ .operator.email }} apiTokenSecretRef: name: cloudflare-api-token key: api-token diff --git a/setup/cluster-services/cert-manager/kustomize.template/letsencrypt-staging-dns01.yaml b/setup/cluster-services/cert-manager/kustomize.template/letsencrypt-staging-dns01.yaml index b1e9edf..784aebc 100644 --- a/setup/cluster-services/cert-manager/kustomize.template/letsencrypt-staging-dns01.yaml +++ b/setup/cluster-services/cert-manager/kustomize.template/letsencrypt-staging-dns01.yaml @@ -13,7 +13,6 @@ spec: # DNS-01 solver for wildcard certificates - dns01: cloudflare: - email: {{ .operator.email }} apiTokenSecretRef: name: cloudflare-api-token key: api-token