- Introduced a new kustomization.yaml file for cert-manager. - Configured a patch to modify the cert-manager Deployment to use a custom DNS policy and settings. - Set dnsPolicy to None and specified custom nameservers and search options.
90 lines
3.4 KiB
Bash
Executable File
90 lines
3.4 KiB
Bash
Executable File
#!/bin/bash
#
# Repair stuck certificates, orphaned ACME orders, and Cloudflare DNS errors.
#
# This is an operational maintenance script, not part of deployment.
# Run manually when cert-manager has issues with certificate issuance.
#
# Usage: KUBECONFIG=/path/to/kubeconfig ./repair-certificates.sh
#
# Requirements:
#   KUBECONFIG  environment variable; must be set and non-empty.
#   kubectl, jq must be available on PATH.

# Abort on the first unhandled command failure, and treat a pipeline as
# failed when any of its stages fails (not just the last one).
set -e
set -o pipefail

# Refuse to run without an explicit kubeconfig.
if [ -z "${KUBECONFIG:-}" ]; then
  echo "ERROR: KUBECONFIG is not set" >&2
  exit 1
fi

# Fail early, with a clear message, if a tool the later steps rely on is
# missing, instead of failing part-way through a repair.
for required_tool in kubectl jq; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "ERROR: required command '$required_tool' not found in PATH" >&2
    exit 1
  fi
done
# Set to true by any repair step that changes state; read at the end of the
# script to decide whether cert-manager has to be restarted.
needs_restart=false

echo "=== cert-manager Certificate Repair ==="
echo ""

echo "Checking for certificates with failed issuance attempts..."

# Select every Certificate that has an Issuing=False condition whose message
# mentions "404". Output is one "<namespace> <name>" pair per line.
#   - `.message // ""` keeps jq from aborting on a condition without a message.
#   - `any(...)` emits each certificate at most once.
# kubectl's stderr is left visible so a failure here is not a silent exit.
if ! stuck_certs=$(kubectl get certificates --all-namespaces -o json \
  | jq -r '.items[]
      | select(any(.status.conditions[]?;
          .type == "Issuing"
          and .status == "False"
          and ((.message // "") | contains("404"))))
      | "\(.metadata.namespace) \(.metadata.name)"'); then
  echo "ERROR: failed to list or parse certificates" >&2
  exit 1
fi

if [ -n "$stuck_certs" ]; then
  echo "WARNING: Found certificates stuck with non-existent orders, recreating them..."

  # The loop reads from a here-string (not a pipe) so it runs in the current
  # shell and an `exit` inside it terminates the whole script.
  while read -r ns name; do
    [ -n "$name" ] || continue
    echo "Recreating certificate $ns/$name..."

    # Build the replacement manifest BEFORE deleting anything, so a failed
    # read can never leave the certificate deleted with nothing to re-apply.
    # Only name, namespace and spec are carried over (as before); labels and
    # annotations are not preserved.
    if ! manifest=$(kubectl get certificate "$name" -n "$ns" -o json </dev/null \
      | jq '{
          apiVersion: "cert-manager.io/v1",
          kind: "Certificate",
          metadata: {name: .metadata.name, namespace: .metadata.namespace},
          spec: .spec
        }') || [ -z "$manifest" ]; then
      echo "ERROR: could not read certificate $ns/$name; leaving it untouched" >&2
      exit 1
    fi

    kubectl delete certificate "$name" -n "$ns" </dev/null

    if ! printf '%s\n' "$manifest" | kubectl apply -f -; then
      # The original object is already gone: print the manifest so the
      # operator can re-apply it by hand.
      echo "ERROR: failed to recreate certificate $ns/$name; manifest follows:" >&2
      printf '%s\n' "$manifest" >&2
      exit 1
    fi
  done <<<"$stuck_certs"

  needs_restart=true
  # Short pause after recreating certificates before the next check runs.
  sleep 5
else
  echo "No certificates stuck with failed orders"
fi
#######################################
# Extract the names of ACME orders that cert-manager failed to retrieve (404).
# Arguments: none
# Inputs:    cert-manager log text on stdin
# Outputs:   unique resource_name values, one per line, sorted
# Returns:   non-zero when no log line matches (grep's status under pipefail)
#######################################
extract_orphaned_order_names() {
  grep -E "failed to retrieve the ACME order.*404" \
    | sed -n 's/.*resource_name="\([^"]*\)".*/\1/p' \
    | sort -u
}

echo "Checking for orphaned ACME orders..."

# Best effort: unreadable logs or no matching lines both yield an empty list,
# hence the `|| true` and the suppressed kubectl stderr.
orphaned_orders=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null \
  | extract_orphaned_order_names || true)

if [ -n "$orphaned_orders" ]; then
  echo "WARNING: Found orphaned ACME orders from logs"

  while IFS= read -r order; do
    [ -n "$order" ] || continue
    echo "Deleting orphaned order: $order"

    # Compare the NAME column exactly. A plain `grep "$order"` would treat the
    # name as a regex and also match longer names that merely contain it,
    # which could delete unrelated orders.
    matching_orders=$(kubectl get orders --all-namespaces --no-headers 2>/dev/null </dev/null \
      | awk -v wanted="$order" '$2 == wanted { print $1, $2 }' || true)
    [ -n "$matching_orders" ] || continue

    while read -r ns name; do
      # Deletion is best effort; the order may already be gone.
      kubectl delete order "$name" -n "$ns" 2>/dev/null </dev/null || true
    done <<<"$matching_orders"
  done <<<"$orphaned_orders"

  # Restart is requested whenever the logs mention orphaned orders, even if
  # no Order object was found for them.
  needs_restart=true
else
  echo "No orphaned orders found in logs"
fi
#######################################
# Count log lines reporting Cloudflare API error 7003 ("Could not route").
# Arguments: none
# Inputs:    cert-manager log text on stdin
# Outputs:   a single integer on stdout
# Returns:   0 always
#######################################
count_cloudflare_errors() {
  # `grep -c` already prints "0" when nothing matches, but exits 1. Appending
  # `|| echo 0` would print a second "0" and produce "0\n0", which breaks the
  # numeric comparison below, so only the exit status is neutralised.
  grep -c "Error: 7003.*Could not route" || true
}

echo "Checking for Cloudflare DNS cleanup errors..."

# Best effort: if the logs cannot be read the count is simply 0.
cloudflare_errors=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null \
  | count_cloudflare_errors || true)
# Guard against grep itself failing and printing nothing.
cloudflare_errors=${cloudflare_errors:-0}

if [ "$cloudflare_errors" -gt 0 ]; then
  echo "WARNING: Found $cloudflare_errors Cloudflare DNS cleanup errors (stale DNS record references)"
  echo "Deleting stuck challenges and orders to allow fresh start"

  # NOTE(review): only the cert-manager namespace is cleaned here, while the
  # certificate scan covers all namespaces - confirm this scope is intended.
  kubectl delete challenges --all -n cert-manager 2>/dev/null || true
  kubectl delete orders --all -n cert-manager 2>/dev/null || true

  needs_restart=true
else
  echo "No Cloudflare DNS cleanup errors"
fi
# Final step: restart cert-manager only if an earlier repair step asked for
# it, then print follow-up commands for the operator.
case "$needs_restart" in
  true)
    echo "Restarting cert-manager to clear internal state..."
    kubectl rollout restart deployment cert-manager -n cert-manager
    # Block until the new pod is rolled out (or the 120s timeout expires).
    kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
    echo "Waiting for cert-manager to recreate fresh challenges..."
    sleep 15
    ;;
  *)
    echo "No restart needed - cert-manager state is clean"
    ;;
esac

# Closing summary, one argument per output line.
printf '%s\n' \
  "" \
  "Repair complete. Check certificate status with:" \
  " kubectl get certificates --all-namespaces" \
  " kubectl get clusterissuers"