cert-manager setup improvements: make the setup script idempotent, check for and clean up failed resources, and converge on a reliable state.
@@ -220,8 +220,7 @@ This approach prevents naming conflicts between apps and makes secret keys more
Apps in Wild Cloud are managed by operators using a set of commands run from their Wild Cloud home directory. A typical session is sketched after the list.

- `wild-apps-list`: Lists all available apps.
- `wild-app-fetch <app-name>`: Fetches the latest app files from the Wild Cloud repository and stores them in your Wild Cloud cache.
- `wild-app-add <app-name>`: Adds the app manifest to your Wild Cloud home `apps` directory, updates missing values in `config.yaml` and `secrets.yaml` with the app's default configurations, and compiles the app's Kustomize files.
- `wild-app-add <app-name>`: Reads the app from the Wild Cloud repository, adds the app manifest to your Wild Cloud home `apps` directory, updates missing values in `config.yaml` and `secrets.yaml` with the app's default configurations, and compiles the app's Kustomize files.
- `wild-app-deploy <app-name>`: Deploys the app to your Wild Cloud.
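
For example, a typical operator session might chain these commands (the app name here is illustrative):

```bash
wild-apps-list             # see which apps are available
wild-app-fetch myapp       # fetch the latest app files into the local cache
wild-app-add myapp         # add the manifest and fill in default config/secrets values
wild-app-deploy myapp      # deploy the app to the Wild Cloud cluster
```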

## Contributing

@@ -16,6 +16,10 @@ CERT_MANAGER_DIR="${CLUSTER_SETUP_DIR}/cert-manager"

print_header "Setting up cert-manager"

#######################
# Dependencies
#######################

# Check Traefik dependency
print_info "Verifying Traefik is ready (required for cert-manager)..."
kubectl wait --for=condition=Available deployment/traefik -n traefik --timeout=60s 2>/dev/null || {
@@ -23,45 +27,36 @@ kubectl wait --for=condition=Available deployment/traefik -n traefik --timeout=6
print_info "Note: cert-manager may not work properly without Traefik"
}

# Templates should already be compiled by wild-cluster-services-configure
print_info "Using pre-compiled cert-manager templates..."
if [ ! -d "${CERT_MANAGER_DIR}/kustomize" ]; then
print_error "Compiled templates not found. Run 'wild-cluster-services-configure' first."
print_error "Compiled templates not found. This script should not be run directly. Run with 'wild setup cluster-services cert-manager' instead."
exit 1
fi

print_info "Setting up cert-manager..."
# Validate DNS resolution using temporary test pod
print_info "Validating DNS resolution for ACME challenges..."
domain=$(wild-config cluster.certManager.cloudflare.domain)
print_info "Testing DNS resolution for domain: $domain"

# Install cert-manager using the official installation method
# This installs CRDs, controllers, and webhook components
print_info "Installing cert-manager components..."
# Using stable URL for cert-manager installation
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml || \
kubectl apply -f https://github.com/jetstack/cert-manager/releases/download/v1.13.1/cert-manager.yaml

# Wait for cert-manager to be ready
print_info "Waiting for cert-manager to be ready..."
kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-cainjector -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s

# Ensure webhook is fully operational
print_info "Verifying cert-manager webhook is fully operational..."
until kubectl get validatingwebhookconfigurations cert-manager-webhook &>/dev/null; do
print_info "Waiting for cert-manager webhook to register..."
sleep 5
done

# Test webhook connectivity before proceeding
print_info "Testing webhook connectivity..."
kubectl auth can-i create certificates.cert-manager.io --as=system:serviceaccount:cert-manager:cert-manager
# Create temporary pod with DNS utilities
kubectl run dns-test --image=busybox:1.35 --rm -i --restart=Never -n cert-manager -- \
nslookup -type=SOA "$domain" 1.1.1.1 &>/dev/null && \
print_success "DNS resolution working for ACME challenges" || \
print_warning "DNS resolution issues may affect ACME challenges"

# Setup Cloudflare API token for DNS01 challenges
print_info "Creating Cloudflare API token secret..."
########################
# Cloudflare DNS setup
########################

# API token secret setup
print_info "Reading Cloudflare API token secret..."
CLOUDFLARE_API_TOKEN=$(wild-secret cloudflare.token) || exit 1
if [ -z "$CLOUDFLARE_API_TOKEN" ]; then
print_error "Cloudflare API token not found. Please create it with 'wild secret create cloudflare.token'."
exit 1
fi

# Validate Cloudflare API token permissions
# Validate token
print_info "Validating Cloudflare API token permissions..."
validate_cloudflare_token() {
local token="$1"
@@ -91,11 +86,39 @@ validate_cloudflare_token "$CLOUDFLARE_API_TOKEN" || {
print_info "Required permissions: Zone - Zone - Read, Zone - DNS - Edit"
exit 1
}

# Ensure token is in the cluster
kubectl create secret generic cloudflare-api-token \
--namespace cert-manager \
--from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \
--dry-run=client -o yaml | kubectl apply -f -

########################
# Kubernetes components
########################

print_info "Installing cert-manager components..."
# Using stable URL for cert-manager installation
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml || \
kubectl apply -f https://github.com/jetstack/cert-manager/releases/download/v1.13.1/cert-manager.yaml

# Wait for cert-manager to be ready
print_info "Waiting for cert-manager to be ready..."
kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-cainjector -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s

# Ensure webhook is fully operational
print_info "Verifying cert-manager webhook is fully operational..."
until kubectl get validatingwebhookconfigurations cert-manager-webhook &>/dev/null; do
print_info "Waiting for cert-manager webhook to register..."
sleep 5
done

# Test webhook connectivity before proceeding
print_info "Testing webhook connectivity..."
kubectl auth can-i create certificates.cert-manager.io --as=system:serviceaccount:cert-manager:cert-manager
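# Note: the `kubectl auth can-i` check above verifies RBAC rather than webhook reachability.
# One way to exercise the webhook end to end is a server-side dry-run, which sends a manifest
# through cert-manager's validating webhook. A sketch only, not part of this script; the
# Certificate name and dnsName below are placeholders:
#
# cat <<'EOF' | kubectl apply --dry-run=server -f -
# apiVersion: cert-manager.io/v1
# kind: Certificate
# metadata:
#   name: webhook-smoke-test
#   namespace: cert-manager
# spec:
#   secretName: webhook-smoke-test-tls
#   dnsNames:
#     - "example.invalid"
#   issuerRef:
#     name: letsencrypt-staging
#     kind: ClusterIssuer
# EOF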

# Configure cert-manager to use external DNS for challenge verification
print_info "Configuring cert-manager to use external DNS servers..."
kubectl patch deployment cert-manager -n cert-manager --patch '
@@ -119,6 +142,10 @@ spec:
print_info "Waiting for cert-manager to restart with new DNS configuration..."
kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
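# For reference, a pod-level DNS override of this kind generally takes the following shape;
# this is a sketch only (the resolver addresses are illustrative, not taken from this repository):
#
# kubectl patch deployment cert-manager -n cert-manager --patch '
# spec:
#   template:
#     spec:
#       dnsPolicy: "None"
#       dnsConfig:
#         nameservers:
#           - "1.1.1.1"
#           - "8.8.8.8"
# '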

########################
# Create issuers and certificates
########################

# Apply Let's Encrypt issuers and certificates using kustomize
print_info "Creating Let's Encrypt issuers and certificates..."
kubectl apply -k ${CERT_MANAGER_DIR}/kustomize
@@ -128,27 +155,206 @@ print_info "Waiting for Let's Encrypt issuers to be ready..."
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-prod --timeout=60s || print_warning "Production issuer not ready, proceeding anyway..."
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-staging --timeout=60s || print_warning "Staging issuer not ready, proceeding anyway..."
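# For reference, the kustomize output applied above is expected to contain ClusterIssuers that
# reference the cloudflare-api-token secret created earlier. A sketch of what the production
# issuer typically looks like (the real manifests live in the compiled templates; the email
# value is a placeholder):
#
# apiVersion: cert-manager.io/v1
# kind: ClusterIssuer
# metadata:
#   name: letsencrypt-prod
# spec:
#   acme:
#     server: https://acme-v02.api.letsencrypt.org/directory
#     email: admin@example.com
#     privateKeySecretRef:
#       name: letsencrypt-prod
#     solvers:
#       - dns01:
#           cloudflare:
#             apiTokenSecretRef:
#               name: cloudflare-api-token
#               key: api-token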

# Validate DNS resolution using temporary test pod
print_info "Validating DNS resolution for ACME challenges..."
domain=$(wild-config cluster.certManager.cloudflare.domain)
print_info "Testing DNS resolution for domain: $domain"
# Give cert-manager a moment to process the certificates
sleep 5

# Create temporary pod with DNS utilities
kubectl run dns-test --image=busybox:1.35 --rm -i --restart=Never -n cert-manager -- \
nslookup -type=SOA "$domain" 1.1.1.1 &>/dev/null && \
print_success "DNS resolution working for ACME challenges" || \
print_warning "DNS resolution issues may affect ACME challenges"
######################################
# Fix stuck certificates and cleanup
######################################

print_info "Wildcard certificate creation initiated. This may take some time to complete depending on DNS propagation."
needs_restart=false

# STEP 1: Fix certificates stuck with 404 errors FIRST (before cleaning up orders)
print_info "Checking for certificates with failed issuance attempts..."
stuck_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | \
jq -r '.items[] | select(.status.conditions[]? | select(.type=="Issuing" and .status=="False" and (.message | contains("404")))) | "\(.metadata.namespace) \(.metadata.name)"')

if [ -n "$stuck_certs" ]; then
print_warning "Found certificates stuck with non-existent orders, recreating them..."
echo "$stuck_certs" | while read ns name; do
print_info "Recreating certificate $ns/$name..."
# Get just the spec
cert_spec=$(kubectl get certificate "$name" -n "$ns" -o json | jq '.spec')
# Delete the certificate
kubectl delete certificate "$name" -n "$ns"
# Recreate with clean state
echo "{\"apiVersion\":\"cert-manager.io/v1\",\"kind\":\"Certificate\",\"metadata\":{\"name\":\"$name\",\"namespace\":\"$ns\"},\"spec\":$cert_spec}" | kubectl apply -f -
done
needs_restart=true
# Give cert-manager time to process the recreated certificates
sleep 5
else
print_success "No certificates stuck with failed orders"
fi

# STEP 2: Clean up orphaned orders (after fixing certificates)
print_info "Checking for orphaned ACME orders..."

# Check logs for 404 errors
orphaned_orders=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null | \
grep -E "failed to retrieve the ACME order.*404" 2>/dev/null | \
sed -n 's/.*resource_name="\([^"]*\)".*/\1/p' | \
sort -u || true)

if [ -n "$orphaned_orders" ]; then
print_warning "Found orphaned ACME orders from logs"
for order in $orphaned_orders; do
print_info "Deleting orphaned order: $order"
# Find and delete the order in whatever namespace it exists
orders_found=$(kubectl get orders --all-namespaces 2>/dev/null | grep "$order" 2>/dev/null || true)
if [ -n "$orders_found" ]; then
echo "$orders_found" | while read ns name rest; do
kubectl delete order "$name" -n "$ns" 2>/dev/null || true
done
fi
done
needs_restart=true
else
print_success "No orphaned orders found in logs"
fi

# Check for errored state orders
errored_orders=$(kubectl get orders --all-namespaces -o json 2>/dev/null | \
jq -r '.items[] | select(.status.state == "errored") | "\(.metadata.namespace) \(.metadata.name)"')

if [ -n "$errored_orders" ]; then
print_warning "Found errored ACME orders"
echo "$errored_orders" | while read ns name; do
print_info "Deleting errored order: $ns/$name"
kubectl delete order "$name" -n "$ns" 2>/dev/null || true
done
needs_restart=true
else
print_success "No errored orders found"
fi

# STEP 3: Clean up bad challenges
print_info "Checking for stuck ACME challenges..."

# Delete expired, invalid, or errored challenges
bad_challenges=$(kubectl get challenges --all-namespaces -o json 2>/dev/null | \
jq -r '.items[] | select(.status.state == "expired" or .status.state == "invalid" or .status.state == "errored") | "\(.metadata.namespace) \(.metadata.name) \(.status.state)"')

if [ -n "$bad_challenges" ]; then
print_warning "Found stuck ACME challenges"
echo "$bad_challenges" | while read ns name state; do
print_info "Deleting $state challenge: $ns/$name"
kubectl delete challenge "$name" -n "$ns" 2>/dev/null || true
done
needs_restart=true
else
print_success "No stuck challenges found"
fi

# Delete very old challenges (over 1 hour) - only if they exist
all_challenges=$(kubectl get challenges --all-namespaces -o json 2>/dev/null | jq '.items | length' || echo 0)
if [ "$all_challenges" -gt 0 ]; then
old_challenges=$(kubectl get challenges --all-namespaces -o json 2>/dev/null | \
jq -r --arg cutoff "$(date -u -d '1 hour ago' '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null || date -u -v-1H '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null)" \
'.items[] | select(.metadata.creationTimestamp < $cutoff) | "\(.metadata.namespace) \(.metadata.name)"')

if [ -n "$old_challenges" ]; then
print_warning "Found old challenges (over 1 hour)"
echo "$old_challenges" | while read ns name; do
print_info "Deleting old challenge: $ns/$name"
kubectl delete challenge "$name" -n "$ns" 2>/dev/null || true
done
needs_restart=true
fi
fi

# STEP 4: Check for DNS errors
dns_errors=$(kubectl logs -n cert-manager deployment/cert-manager --tail=50 2>/dev/null | \
grep -c "Could not route to /client/v4/zones/dns_records" || true)

if [ "$dns_errors" -gt 0 ]; then
print_warning "Cert-manager has DNS record cleanup errors"
needs_restart=true
fi

# STEP 5: Single restart if anything needs cleaning
if [ "$needs_restart" = true ]; then
print_info "Restarting cert-manager once to clear all internal state..."
kubectl rollout restart deployment cert-manager -n cert-manager
kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
# Give cert-manager time to reinitialize
sleep 10
else
print_success "No restart needed - cert-manager state is clean"
fi

##################################
# Handle certificate renewal
##################################

# Check for expired or near-expiry certificates and trigger renewal
print_info "Checking certificate expiration status..."
current_date=$(date +%s)

# Track if any renewals were triggered
renewals_triggered=0

# Read certificates via process substitution (not a pipe) so renewals_triggered updates persist
while read ns name secret expiry; do
if [ "$expiry" != "unknown" ] && [ "$expiry" != "null" ] && [ "$expiry" != "" ]; then
expiry_ts=$(date -d "$expiry" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$expiry" +%s 2>/dev/null || echo 0)
if [ "$expiry_ts" -gt 0 ]; then
days_until_expiry=$(( (expiry_ts - current_date) / 86400 ))

if [ "$days_until_expiry" -lt 0 ]; then
print_warning "Certificate $ns/$name has EXPIRED (expired ${days_until_expiry#-} days ago)"
if [ -n "$secret" ]; then
print_info "Deleting secret $secret to trigger renewal..."
kubectl delete secret "$secret" -n "$ns" 2>/dev/null || true
renewals_triggered=$((renewals_triggered + 1))
fi
elif [ "$days_until_expiry" -lt 7 ]; then
print_warning "Certificate $ns/$name expires in $days_until_expiry days"
if [ "$days_until_expiry" -lt 3 ]; then
# Force renewal for certificates expiring very soon
if [ -n "$secret" ]; then
print_info "Forcing renewal by deleting secret $secret..."
kubectl delete secret "$secret" -n "$ns" 2>/dev/null || true
renewals_triggered=$((renewals_triggered + 1))
fi
else
print_info "Will renew automatically when closer to expiry"
fi
elif [ "$days_until_expiry" -lt 30 ]; then
print_info "Certificate $ns/$name expires in $days_until_expiry days (renewal not needed yet)"
else
print_success "Certificate $ns/$name is valid for $days_until_expiry days"
fi
fi
else
# Certificate has no expiry (being issued)
print_info "Certificate $ns/$name is currently being issued..."
fi
done < <(kubectl get certificates --all-namespaces -o json 2>/dev/null | jq -r '.items[] | "\(.metadata.namespace) \(.metadata.name) \(.spec.secretName) \(.status.notAfter // "unknown")"')

if [ "$renewals_triggered" -eq 0 ]; then
print_success "All certificates are valid - no renewals needed"
fi

#########################
# Final checks
#########################

# Wait for the certificates to be issued (with a timeout)
print_info "Waiting for wildcard certificates to be ready (this may take several minutes)..."
kubectl wait --for=condition=Ready certificate wildcard-internal-wild-cloud -n cert-manager --timeout=300s || true
kubectl wait --for=condition=Ready certificate wildcard-wild-cloud -n cert-manager --timeout=300s || true

# Final health check
print_info "Performing final cert-manager health check..."
failed_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Ready" and .status!="True")) | "\(.metadata.namespace)/\(.metadata.name)"' | wc -l)
if [ "$failed_certs" -gt 0 ]; then
print_warning "Found $failed_certs certificates not in Ready state"
print_info "Check certificate status with: kubectl get certificates --all-namespaces"
print_info "Check cert-manager logs with: kubectl logs -n cert-manager deployment/cert-manager"
else
print_success "All certificates are in Ready state"
fi

print_success "cert-manager setup complete!"
echo ""
print_info "To verify the installation:"
print_info " kubectl get pods -n cert-manager"
print_info " kubectl get clusterissuers"
print_info " kubectl get certificates -n cert-manager"