Compare commits
2 Commits
e4c24d4a8c
...
5733c20098
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5733c20098 | ||
|
|
54abfdd469 |
@@ -1 +1,20 @@
|
|||||||
|
# cert-manager
|
||||||
|
|
||||||
|
X.509 certificate management for Kubernetes using Let's Encrypt.
|
||||||
|
|
||||||
|
## Upstream
|
||||||
|
|
||||||
|
The `upstream/cert-manager.yaml` file is downloaded from the official cert-manager release:
|
||||||
|
|
||||||
|
- Source: https://github.com/cert-manager/cert-manager/releases/download/v1.17.2/cert-manager.yaml
|
||||||
|
- Version: v1.17.2
|
||||||
|
|
||||||
|
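To update, download the `cert-manager.yaml` of the new release, replace `upstream/cert-manager.yaml` with it, and update the source URL and version listed above.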
|
||||||
|
|
||||||
|
## DNS Configuration
|
||||||
|
|
||||||
|
The upstream cert-manager deployment is patched via a kustomize overlay (`upstream/kustomization.yaml`) to use external DNS resolvers (1.1.1.1, 8.8.8.8) instead of cluster DNS. This is required for ACME DNS-01 challenge verification.
|
||||||
|
|
||||||
|
## Maintenance
|
||||||
|
|
||||||
|
The `scripts/repair-certificates.sh` script can fix stuck certificates, orphaned ACME orders, and Cloudflare DNS cleanup errors. It is not part of deployment: run it manually, with `KUBECONFIG` set, when certificate issuance is failing or stuck.
|
||||||
|
|||||||
@@ -1,233 +0,0 @@
|
|||||||
#!/bin/bash
# Install and configure cert-manager for one instance.
#
# Steps: verify prerequisites, apply the upstream cert-manager manifest,
# create the Cloudflare API token secret, point cert-manager at external DNS
# resolvers, apply the instance's compiled kustomization (issuers and
# certificates), then repair any stuck issuance state.
#
# Required environment:
#   WILD_INSTANCE      - instance name under ${WILD_API_DATA_DIR}/instances
#   WILD_API_DATA_DIR  - data directory containing instances/
#   KUBECONFIG         - kubeconfig used by kubectl
set -e
set -o pipefail

if [ -z "${WILD_INSTANCE}" ]; then
    echo "ERROR: WILD_INSTANCE is not set"
    exit 1
fi

if [ -z "${WILD_API_DATA_DIR}" ]; then
    echo "ERROR: WILD_API_DATA_DIR is not set"
    exit 1
fi

if [ -z "${KUBECONFIG}" ]; then
    echo "ERROR: KUBECONFIG is not set"
    exit 1
fi

INSTANCE_DIR="${WILD_API_DATA_DIR}/instances/${WILD_INSTANCE}"
CERT_MANAGER_DIR="${INSTANCE_DIR}/apps/cert-manager"

echo "=== Setting up cert-manager ==="
echo ""

#######################
# Dependencies
#######################

echo "Verifying Traefik is ready (required for cert-manager)..."
# Best-effort check: an unready Traefik only produces a warning.
kubectl wait --for=condition=Available deployment/traefik -n traefik --timeout=60s 2>/dev/null || {
    echo "WARNING: Traefik not ready, but continuing with cert-manager installation"
    echo "Note: cert-manager may not work properly without Traefik"
}

if [ ! -f "${CERT_MANAGER_DIR}/kustomization.yaml" ]; then
    echo "ERROR: Compiled templates not found at ${CERT_MANAGER_DIR}/"
    echo "Templates should be compiled before deployment."
    exit 1
fi

########################
# Kubernetes components
########################

echo "Installing cert-manager components..."
# The second URL is a fallback used only when the first apply fails.
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.17.2/cert-manager.yaml || \
    kubectl apply -f https://github.com/jetstack/cert-manager/releases/download/v1.17.2/cert-manager.yaml

echo "Waiting for cert-manager to be ready..."
kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-cainjector -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s

echo "Creating Cloudflare API token secret..."
SECRETS_FILE="${INSTANCE_DIR}/secrets.yaml"
# "|| true": a yq failure (e.g. missing secrets file) must reach the explicit
# error below instead of aborting silently under `set -e`.
CLOUDFLARE_API_TOKEN=$(yq '.apps.cert-manager.cloudflareToken' "$SECRETS_FILE" 2>/dev/null || true)

if [ -z "$CLOUDFLARE_API_TOKEN" ] || [ "$CLOUDFLARE_API_TOKEN" = "null" ]; then
    echo "ERROR: Cloudflare API token not found"
    echo "Please set: apps.cert-manager.cloudflareToken in secrets.yaml"
    exit 1
fi

# Render with --dry-run and pipe into apply so re-runs update the secret
# instead of failing on "already exists".
kubectl create secret generic cloudflare-api-token \
    --namespace cert-manager \
    --from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \
    --dry-run=client -o yaml | kubectl apply -f -

echo "Verifying cert-manager webhook is fully operational..."
# NOTE(review): this loop has no upper bound; it polls until the webhook
# configuration exists.
until kubectl get validatingwebhookconfigurations cert-manager-webhook &>/dev/null; do
    echo "Waiting for cert-manager webhook to register..."
    sleep 5
done

echo "Configuring cert-manager to use external DNS servers..."
kubectl patch deployment cert-manager -n cert-manager --patch '
spec:
  template:
    spec:
      dnsPolicy: None
      dnsConfig:
        nameservers:
          - "1.1.1.1"
          - "8.8.8.8"
        searches:
          - cert-manager.svc.cluster.local
          - svc.cluster.local
          - cluster.local
        options:
          - name: ndots
            value: "5"'

echo "Waiting for cert-manager to restart with new DNS configuration..."
kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s

########################
# Create issuers and certificates
########################

echo "Creating Let's Encrypt issuers and certificates..."
kubectl apply -k "${CERT_MANAGER_DIR}/"

echo "Waiting for Let's Encrypt issuers to be ready..."
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-prod --timeout=60s || echo "WARNING: Production issuer not ready, proceeding anyway..."
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-staging --timeout=60s || echo "WARNING: Staging issuer not ready, proceeding anyway..."

sleep 5

######################################
# Fix stuck certificates and cleanup
######################################

needs_restart=false

echo "Checking for certificates with failed issuance attempts..."
# Emits one "<namespace> <name>" line per Certificate whose Issuing condition
# is False with a message containing "404".
stuck_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | \
    jq -r '.items[] | select(.status.conditions[]? | select(.type=="Issuing" and .status=="False" and (.message | contains("404")))) | "\(.metadata.namespace) \(.metadata.name)"')

if [ -n "$stuck_certs" ]; then
    echo "WARNING: Found certificates stuck with non-existent orders, recreating them..."
    while read -r ns name; do
        echo "Recreating certificate $ns/$name..."
        cert_spec=$(kubectl get certificate "$name" -n "$ns" -o json | jq '.spec')
        # Never delete a certificate whose spec could not be captured.
        if [ -z "$cert_spec" ] || [ "$cert_spec" = "null" ]; then
            echo "WARNING: Could not read spec of $ns/$name, skipping"
            continue
        fi
        kubectl delete certificate "$name" -n "$ns"
        printf '%s\n' "{\"apiVersion\":\"cert-manager.io/v1\",\"kind\":\"Certificate\",\"metadata\":{\"name\":\"$name\",\"namespace\":\"$ns\"},\"spec\":$cert_spec}" | kubectl apply -f -
    done <<< "$stuck_certs"
    needs_restart=true
    sleep 5
else
    echo "No certificates stuck with failed orders"
fi

echo "Checking for orphaned ACME orders..."
# Order names are scraped from the last 200 controller log lines; any failure
# in the pipeline is treated as "none found".
orphaned_orders=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null | \
    grep -E "failed to retrieve the ACME order.*404" 2>/dev/null | \
    sed -n 's/.*resource_name="\([^"]*\)".*/\1/p' | \
    sort -u || true)

if [ -n "$orphaned_orders" ]; then
    echo "WARNING: Found orphaned ACME orders from logs"
    # Intentionally unquoted: one order name per whitespace-separated word.
    for order in $orphaned_orders; do
        echo "Deleting orphaned order: $order"
        # -F: match the order name literally, not as a regular expression.
        orders_found=$(kubectl get orders --all-namespaces 2>/dev/null | grep -F -- "$order" 2>/dev/null || true)
        if [ -n "$orders_found" ]; then
            while read -r ns name _; do
                kubectl delete order "$name" -n "$ns" 2>/dev/null || true
            done <<< "$orders_found"
        fi
    done
    needs_restart=true
else
    echo "No orphaned orders found in logs"
fi

echo "Checking for Cloudflare DNS cleanup errors..."
# grep -c already prints "0" (and exits 1) when nothing matches, so the
# fallback must not print a second "0"; "|| true" only neutralises the status.
cloudflare_errors=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null | \
    grep -c "Error: 7003.*Could not route" 2>/dev/null || true)

if [ "${cloudflare_errors:-0}" -gt "0" ]; then
    echo "WARNING: Found $cloudflare_errors Cloudflare DNS cleanup errors (stale DNS record references)"
    echo "Deleting stuck challenges and orders to allow fresh start"

    kubectl delete challenges --all -n cert-manager 2>/dev/null || true
    kubectl delete orders --all -n cert-manager 2>/dev/null || true

    needs_restart=true
else
    echo "No Cloudflare DNS cleanup errors"
fi

if [ "$needs_restart" = true ]; then
    echo "Restarting cert-manager to clear internal state..."
    kubectl rollout restart deployment cert-manager -n cert-manager
    kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
    echo "Waiting for cert-manager to recreate fresh challenges..."
    sleep 15
else
    echo "No restart needed - cert-manager state is clean"
fi

#########################
# Final checks
#########################

echo "Waiting for wildcard certificates to be ready (this may take several minutes)..."
|
|
||||||
|
|
||||||
#######################################
# Poll a cert-manager Certificate until its Ready condition is "True".
# Arguments:
#   $1 - certificate name
#   $2 - namespace (optional, default: cert-manager)
#   $3 - timeout in seconds (optional, default: 300)
# Outputs:
#   Progress messages on stdout; every 30s the Ready condition's message.
# Returns:
#   0 once the certificate is Ready, 1 if the timeout elapses first.
#######################################
wait_for_cert() {
    local cert_name="$1"
    local namespace="${2:-cert-manager}"
    local timeout="${3:-300}"
    local interval=5
    local elapsed=0
    local ready status

    echo " Checking $cert_name..."

    while [ "$elapsed" -lt "$timeout" ]; do
        # A failed lookup (e.g. certificate not created yet) counts as "not ready".
        ready=$(kubectl get certificate "$cert_name" -n "$namespace" \
            -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)
        if [ "$ready" = "True" ]; then
            echo " $cert_name is ready"
            return 0
        fi

        # Report progress every 30 seconds, but not on the very first pass.
        if [ $((elapsed % 30)) -eq 0 ] && [ "$elapsed" -gt 0 ]; then
            # Declared separately above so `local` does not mask the exit status.
            status=$(kubectl get certificate "$cert_name" -n "$namespace" \
                -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "Waiting...")
            echo " Still waiting for $cert_name... ($elapsed/${timeout}s) - $status"
        fi

        sleep "$interval"
        elapsed=$((elapsed + interval))
    done

    echo " WARNING: Timeout waiting for $cert_name (will continue anyway)"
    return 1
}
|
|
||||||
|
|
||||||
# wait_for_cert returns 1 on timeout. The script runs under `set -e`, so an
# unguarded call would abort here even though the timeout message says the
# script continues; "|| true" makes the wait advisory as intended.
wait_for_cert "wildcard-internal-wild-cloud" || true
wait_for_cert "wildcard-wild-cloud" || true

echo "Performing final cert-manager health check..."
# Count certificates (all namespaces) whose Ready condition is not "True".
failed_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Ready" and .status!="True")) | "\(.metadata.namespace)/\(.metadata.name)"' | wc -l)
if [ "$failed_certs" -gt 0 ]; then
    echo "WARNING: Found $failed_certs certificates not in Ready state"
    echo "Check certificate status with: kubectl get certificates --all-namespaces"
    echo "Check cert-manager logs with: kubectl logs -n cert-manager deployment/cert-manager"
else
    echo "All certificates are in Ready state"
fi

echo ""
echo "cert-manager setup complete!"
echo ""
echo "To verify the installation:"
echo " kubectl get certificates --all-namespaces"
echo " kubectl get clusterissuers"
|
|
||||||
@@ -11,5 +11,20 @@ defaultConfig:
|
|||||||
internalDomain: "{{ .cloud.internalDomain }}"
|
internalDomain: "{{ .cloud.internalDomain }}"
|
||||||
email: "{{ .operator.email }}"
|
email: "{{ .operator.email }}"
|
||||||
cloudflareDomain: "{{ .cloud.baseDomain }}"
|
cloudflareDomain: "{{ .cloud.baseDomain }}"
|
||||||
|
scripts:
|
||||||
|
- name: repair-certificates
|
||||||
|
path: scripts/repair-certificates.sh
|
||||||
|
description: Fix stuck certificates, orphaned ACME orders, and Cloudflare DNS cleanup errors
|
||||||
defaultSecrets:
|
defaultSecrets:
|
||||||
- key: cloudflareToken
|
- key: cloudflareToken
|
||||||
|
deploy:
|
||||||
|
phases:
|
||||||
|
- path: upstream
|
||||||
|
waitFor:
|
||||||
|
name: cert-manager-webhook
|
||||||
|
timeout: "120s"
|
||||||
|
- path: .
|
||||||
|
createSecrets:
|
||||||
|
- name: cloudflare-api-token
|
||||||
|
entries:
|
||||||
|
api-token: cloudflareToken
|
||||||
|
|||||||
89
cert-manager/scripts/repair-certificates.sh
Executable file
89
cert-manager/scripts/repair-certificates.sh
Executable file
@@ -0,0 +1,89 @@
|
|||||||
|
#!/bin/bash
# Repair stuck certificates, orphaned ACME orders, and Cloudflare DNS errors.
# This is an operational maintenance script, not part of deployment.
# Run manually when cert-manager has issues with certificate issuance.
#
# Three checks are performed; if any of them changes cluster state, the
# cert-manager deployment is restarted at the end:
#   1. Certificates whose Issuing condition is False with a "404" message are
#      deleted and recreated from their own spec.
#   2. ACME orders named in recent "failed to retrieve the ACME order ... 404"
#      log lines are deleted.
#   3. If recent logs contain Cloudflare "Error: 7003 ... Could not route"
#      entries, all challenges and orders in the cert-manager namespace are
#      deleted.
#
# Usage: KUBECONFIG=/path/to/kubeconfig ./repair-certificates.sh
set -e
set -o pipefail

if [ -z "${KUBECONFIG}" ]; then
    echo "ERROR: KUBECONFIG is not set"
    exit 1
fi

needs_restart=false

echo "=== cert-manager Certificate Repair ==="
echo ""

echo "Checking for certificates with failed issuance attempts..."
# Emits one "<namespace> <name>" line per matching Certificate.
stuck_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | \
    jq -r '.items[] | select(.status.conditions[]? | select(.type=="Issuing" and .status=="False" and (.message | contains("404")))) | "\(.metadata.namespace) \(.metadata.name)"')

if [ -n "$stuck_certs" ]; then
    echo "WARNING: Found certificates stuck with non-existent orders, recreating them..."
    while read -r ns name; do
        echo "Recreating certificate $ns/$name..."
        cert_spec=$(kubectl get certificate "$name" -n "$ns" -o json | jq '.spec')
        # Never delete a certificate whose spec could not be captured.
        if [ -z "$cert_spec" ] || [ "$cert_spec" = "null" ]; then
            echo "WARNING: Could not read spec of $ns/$name, skipping"
            continue
        fi
        kubectl delete certificate "$name" -n "$ns"
        printf '%s\n' "{\"apiVersion\":\"cert-manager.io/v1\",\"kind\":\"Certificate\",\"metadata\":{\"name\":\"$name\",\"namespace\":\"$ns\"},\"spec\":$cert_spec}" | kubectl apply -f -
    done <<< "$stuck_certs"
    needs_restart=true
    sleep 5
else
    echo "No certificates stuck with failed orders"
fi

echo "Checking for orphaned ACME orders..."
# Order names are scraped from the last 200 controller log lines; any failure
# in the pipeline is treated as "none found".
orphaned_orders=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null | \
    grep -E "failed to retrieve the ACME order.*404" 2>/dev/null | \
    sed -n 's/.*resource_name="\([^"]*\)".*/\1/p' | \
    sort -u || true)

if [ -n "$orphaned_orders" ]; then
    echo "WARNING: Found orphaned ACME orders from logs"
    # Intentionally unquoted: one order name per whitespace-separated word.
    for order in $orphaned_orders; do
        echo "Deleting orphaned order: $order"
        # -F: match the order name literally, not as a regular expression.
        orders_found=$(kubectl get orders --all-namespaces 2>/dev/null | grep -F -- "$order" 2>/dev/null || true)
        if [ -n "$orders_found" ]; then
            while read -r ns name _; do
                kubectl delete order "$name" -n "$ns" 2>/dev/null || true
            done <<< "$orders_found"
        fi
    done
    needs_restart=true
else
    echo "No orphaned orders found in logs"
fi

echo "Checking for Cloudflare DNS cleanup errors..."
# grep -c already prints "0" (and exits 1) when nothing matches, so the
# fallback must not print a second "0"; "|| true" only neutralises the status.
cloudflare_errors=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null | \
    grep -c "Error: 7003.*Could not route" 2>/dev/null || true)

if [ "${cloudflare_errors:-0}" -gt "0" ]; then
    echo "WARNING: Found $cloudflare_errors Cloudflare DNS cleanup errors (stale DNS record references)"
    echo "Deleting stuck challenges and orders to allow fresh start"

    kubectl delete challenges --all -n cert-manager 2>/dev/null || true
    kubectl delete orders --all -n cert-manager 2>/dev/null || true

    needs_restart=true
else
    echo "No Cloudflare DNS cleanup errors"
fi

if [ "$needs_restart" = true ]; then
    echo "Restarting cert-manager to clear internal state..."
    kubectl rollout restart deployment cert-manager -n cert-manager
    kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
    echo "Waiting for cert-manager to recreate fresh challenges..."
    sleep 15
else
    echo "No restart needed - cert-manager state is clean"
fi

echo ""
echo "Repair complete. Check certificate status with:"
echo " kubectl get certificates --all-namespaces"
echo " kubectl get clusterissuers"
|
||||||
13286
cert-manager/upstream/cert-manager.yaml
Normal file
13286
cert-manager/upstream/cert-manager.yaml
Normal file
File diff suppressed because it is too large
Load Diff
30
cert-manager/upstream/kustomization.yaml
Normal file
30
cert-manager/upstream/kustomization.yaml
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
# Kustomize overlay for the vendored upstream cert-manager manifest.
# It applies cert-manager.yaml with one patch on the cert-manager Deployment.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - cert-manager.yaml
patches:
  - target:
      kind: Deployment
      name: cert-manager
      namespace: cert-manager
    # Use external DNS resolvers instead of cluster DNS for the cert-manager
    # pod; needed for ACME DNS-01 challenge verification.
    patch: |-
      apiVersion: apps/v1
      kind: Deployment
      metadata:
        name: cert-manager
        namespace: cert-manager
      spec:
        template:
          spec:
            dnsPolicy: None
            dnsConfig:
              nameservers:
                - "1.1.1.1"
                - "8.8.8.8"
              searches:
                - cert-manager.svc.cluster.local
                - svc.cluster.local
                - cluster.local
              options:
                - name: ndots
                  value: "5"
|
||||||
Reference in New Issue
Block a user