#!/bin/bash
# Repair stuck certificates, orphaned ACME orders, and Cloudflare DNS errors.
# This is an operational maintenance script, not part of deployment.
# Run manually when cert-manager has issues with certificate issuance.
#
# Usage: KUBECONFIG=/path/to/kubeconfig ./repair-certificates.sh
#
# What it does, in order:
#   1. Recreates Certificates whose "Issuing" condition is False with a
#      message containing "404".
#   2. Deletes Orders that the cert-manager log reports as unretrievable (404).
#   3. Deletes all Challenges/Orders in the cert-manager namespace when the
#      log shows Cloudflare "Error: 7003 ... Could not route" entries.
#   4. Restarts the cert-manager deployment if any of the above changed state.
#
# Requires: kubectl, jq.
# Exit status: 0 on success; non-zero if KUBECONFIG is unset, a required tool
# is missing, or a non-best-effort kubectl/jq command fails.

set -euo pipefail

if [ -z "${KUBECONFIG:-}" ]; then
  echo "ERROR: KUBECONFIG is not set" >&2
  exit 1
fi

for tool in kubectl jq; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "ERROR: required tool '$tool' not found in PATH" >&2
    exit 1
  fi
done

needs_restart=false

echo "=== cert-manager Certificate Repair ==="
echo ""

# ---------------------------------------------------------------------------
# 1. Certificates stuck on a non-existent order
# ---------------------------------------------------------------------------
echo "Checking for certificates with failed issuance attempts..."

if ! certs_json=$(kubectl get certificates --all-namespaces -o json 2>/dev/null); then
  echo "ERROR: failed to list certificates (is the cluster reachable and cert-manager installed?)" >&2
  exit 1
fi

# `.message // ""` keeps jq from aborting on a condition that has no message;
# `any(...)` emits each certificate at most once.
stuck_certs=$(jq -r '
  .items[]
  | select(any(.status.conditions[]?;
      .type == "Issuing" and .status == "False"
      and ((.message // "") | contains("404"))))
  | "\(.metadata.namespace) \(.metadata.name)"
' <<<"$certs_json")

if [ -n "$stuck_certs" ]; then
  echo "WARNING: Found certificates stuck with non-existent orders, recreating them..."
  while read -r ns name; do
    [ -n "$name" ] || continue
    echo "Recreating certificate $ns/$name..."

    # Capture the spec BEFORE deleting; never delete something we cannot
    # put back.
    cert_spec=$(kubectl get certificate "$name" -n "$ns" -o json </dev/null | jq '.spec')
    if [ -z "$cert_spec" ] || [ "$cert_spec" = "null" ]; then
      echo "WARNING: could not read spec of $ns/$name, leaving it untouched" >&2
      continue
    fi

    kubectl delete certificate "$name" -n "$ns" </dev/null

    # Build the manifest with jq so names are properly JSON-escaped.
    # Only .spec is carried over (labels/annotations are not), as before.
    jq -n --arg name "$name" --arg ns "$ns" --argjson spec "$cert_spec" '
      {
        apiVersion: "cert-manager.io/v1",
        kind: "Certificate",
        metadata: {name: $name, namespace: $ns},
        spec: $spec
      }' | kubectl apply -f -
  done <<<"$stuck_certs"
  needs_restart=true
  sleep 5
else
  echo "No certificates stuck with failed orders"
fi

# Best effort: if the logs cannot be fetched, both log-based checks below
# simply find nothing. One snapshot is shared by both checks.
cm_logs=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null || true)

# ---------------------------------------------------------------------------
# 2. Orphaned ACME orders reported in the controller log
# ---------------------------------------------------------------------------
echo "Checking for orphaned ACME orders..."

# grep exits 1 when nothing matches; `|| true` keeps pipefail/set -e quiet.
orphaned_orders=$(grep -E "failed to retrieve the ACME order.*404" <<<"$cm_logs" \
  | sed -n 's/.*resource_name="\([^"]*\)".*/\1/p' \
  | sort -u || true)

if [ -n "$orphaned_orders" ]; then
  echo "WARNING: Found orphaned ACME orders from logs"

  # "namespace name" pairs, no header row, so matching can be exact.
  all_orders=$(kubectl get orders --all-namespaces --no-headers \
    -o custom-columns=NS:.metadata.namespace,NAME:.metadata.name 2>/dev/null || true)

  while read -r order; do
    [ -n "$order" ] || continue
    echo "Deleting orphaned order: $order"
    while read -r ns name _; do
      # Exact name match: a substring match could hit unrelated orders.
      [ "$name" = "$order" ] || continue
      # Best effort: the order may already be gone.
      kubectl delete order "$name" -n "$ns" </dev/null 2>/dev/null || true
    done <<<"$all_orders"
  done <<<"$orphaned_orders"
  needs_restart=true
else
  echo "No orphaned orders found in logs"
fi

# ---------------------------------------------------------------------------
# 3. Cloudflare DNS cleanup errors
# ---------------------------------------------------------------------------
echo "Checking for Cloudflare DNS cleanup errors..."

# `grep -c` already prints 0 (and exits 1) when nothing matches, so the
# fallback must NOT echo another 0 - that would yield "0<newline>0" and break
# the numeric comparison below.
cloudflare_errors=$(grep -c "Error: 7003.*Could not route" <<<"$cm_logs" || true)
cloudflare_errors=${cloudflare_errors:-0}

if [ "$cloudflare_errors" -gt 0 ]; then
  echo "WARNING: Found $cloudflare_errors Cloudflare DNS cleanup errors (stale DNS record references)"
  echo "Deleting stuck challenges and orders to allow fresh start"
  # NOTE(review): only the cert-manager namespace is cleaned here; challenges
  # and orders in other namespaces are left alone - confirm this is intended.
  kubectl delete challenges --all -n cert-manager 2>/dev/null || true
  kubectl delete orders --all -n cert-manager 2>/dev/null || true
  needs_restart=true
else
  echo "No Cloudflare DNS cleanup errors"
fi

# ---------------------------------------------------------------------------
# 4. Restart cert-manager if anything was changed
# ---------------------------------------------------------------------------
if [ "$needs_restart" = true ]; then
  echo "Restarting cert-manager to clear internal state..."
  kubectl rollout restart deployment cert-manager -n cert-manager
  kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
  echo "Waiting for cert-manager to recreate fresh challenges..."
  sleep 15
else
  echo "No restart needed - cert-manager state is clean"
fi

echo ""
echo "Repair complete. Check certificate status with:"
echo " kubectl get certificates --all-namespaces"
echo " kubectl get clusterissuers"