Reorganized for new stable/waypoint versioning design.

This commit is contained in:
2026-05-24 18:28:47 +00:00
parent 945d2225a2
commit bc7a168851
352 changed files with 1264 additions and 294 deletions

View File

@@ -0,0 +1,20 @@
# cert-manager
X.509 certificate management for Kubernetes using Let's Encrypt.
## Upstream
The `upstream/cert-manager.yaml` file is downloaded from the official cert-manager release:
- Source: https://github.com/cert-manager/cert-manager/releases/download/v1.17.2/cert-manager.yaml
- Version: v1.17.2
To update, download the new version and replace the file.
## DNS Configuration
The upstream cert-manager deployment is patched via kustomize overlay (`upstream/kustomization.yaml`) to use external DNS resolvers (1.1.1.1, 8.8.8.8) instead of cluster DNS. This is required for ACME DNS-01 challenge verification.
## Maintenance
The `scripts/repair-certificates.sh` script can fix stuck certificates, orphaned ACME orders, and Cloudflare DNS cleanup errors. Run it manually when certificate issuance has issues.

View File

@@ -0,0 +1,19 @@
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: wildcard-internal-wild-cloud
namespace: cert-manager
spec:
secretName: wildcard-internal-wild-cloud-tls
dnsNames:
- "*.{{ .internalDomain }}"
- "{{ .internalDomain }}"
issuerRef:
name: letsencrypt-prod
kind: ClusterIssuer
duration: 2160h # 90 days
renewBefore: 360h # 15 days
privateKey:
algorithm: RSA
size: 2048

View File

@@ -0,0 +1,9 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- letsencrypt-staging-dns01.yaml
- letsencrypt-prod-dns01.yaml
- internal-wildcard-certificate.yaml
- wildcard-certificate.yaml

View File

@@ -0,0 +1,25 @@
---
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt-prod
spec:
acme:
email: {{ .email }}
privateKeySecretRef:
name: letsencrypt-prod
server: https://acme-v02.api.letsencrypt.org/directory
solvers:
# DNS-01 solver for wildcard certificates
- dns01:
cloudflare:
apiTokenSecretRef:
name: cloudflare-api-token
key: api-token
selector:
dnsZones:
- "{{ .cloudflareDomain }}"
# Keep the HTTP-01 solver for non-wildcard certificates
- http01:
ingress:
class: traefik

View File

@@ -0,0 +1,25 @@
---
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt-staging
spec:
acme:
email: {{ .email }}
privateKeySecretRef:
name: letsencrypt-staging
server: https://acme-staging-v02.api.letsencrypt.org/directory
solvers:
# DNS-01 solver for wildcard certificates
- dns01:
cloudflare:
apiTokenSecretRef:
name: cloudflare-api-token
key: api-token
selector:
dnsZones:
- "{{ .cloudflareDomain }}"
# Keep the HTTP-01 solver for non-wildcard certificates
- http01:
ingress:
class: traefik

View File

@@ -0,0 +1,26 @@
version: v1.17.2
requires:
- name: traefik
defaultConfig:
namespace: cert-manager
cloudDomain: "{{ .cloud.domain }}"
internalDomain: "{{ .cloud.internalDomain }}"
email: "{{ .operator.email }}"
cloudflareDomain: "{{ .cloud.baseDomain }}"
scripts:
- name: repair-certificates
path: scripts/repair-certificates.sh
description: Fix stuck certificates, orphaned ACME orders, and Cloudflare DNS cleanup errors
defaultSecrets:
- key: cloudflareToken
deploy:
phases:
- path: upstream
waitFor:
name: cert-manager-webhook
timeout: "120s"
- path: .
createSecrets:
- name: cloudflare-api-token
entries:
api-token: cloudflareToken

View File

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: "{{ .namespace }}"

View File

@@ -0,0 +1,89 @@
#!/bin/bash
# Repair stuck certificates, orphaned ACME orders, and Cloudflare DNS errors.
# This is an operational maintenance script, not part of deployment.
# Run manually when cert-manager has issues with certificate issuance.
#
# Usage: KUBECONFIG=/path/to/kubeconfig ./repair-certificates.sh
set -e
set -o pipefail
if [ -z "${KUBECONFIG}" ]; then
echo "ERROR: KUBECONFIG is not set"
exit 1
fi
needs_restart=false
echo "=== cert-manager Certificate Repair ==="
echo ""
echo "Checking for certificates with failed issuance attempts..."
stuck_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | \
jq -r '.items[] | select(.status.conditions[]? | select(.type=="Issuing" and .status=="False" and (.message | contains("404")))) | "\(.metadata.namespace) \(.metadata.name)"')
if [ -n "$stuck_certs" ]; then
echo "WARNING: Found certificates stuck with non-existent orders, recreating them..."
echo "$stuck_certs" | while read ns name; do
echo "Recreating certificate $ns/$name..."
cert_spec=$(kubectl get certificate "$name" -n "$ns" -o json | jq '.spec')
kubectl delete certificate "$name" -n "$ns"
echo "{\"apiVersion\":\"cert-manager.io/v1\",\"kind\":\"Certificate\",\"metadata\":{\"name\":\"$name\",\"namespace\":\"$ns\"},\"spec\":$cert_spec}" | kubectl apply -f -
done
needs_restart=true
sleep 5
else
echo "No certificates stuck with failed orders"
fi
echo "Checking for orphaned ACME orders..."
orphaned_orders=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null | \
grep -E "failed to retrieve the ACME order.*404" 2>/dev/null | \
sed -n 's/.*resource_name="\([^"]*\)".*/\1/p' | \
sort -u || true)
if [ -n "$orphaned_orders" ]; then
echo "WARNING: Found orphaned ACME orders from logs"
for order in $orphaned_orders; do
echo "Deleting orphaned order: $order"
orders_found=$(kubectl get orders --all-namespaces 2>/dev/null | grep "$order" 2>/dev/null || true)
if [ -n "$orders_found" ]; then
echo "$orders_found" | while read ns name rest; do
kubectl delete order "$name" -n "$ns" 2>/dev/null || true
done
fi
done
needs_restart=true
else
echo "No orphaned orders found in logs"
fi
echo "Checking for Cloudflare DNS cleanup errors..."
cloudflare_errors=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null | \
grep -c "Error: 7003.*Could not route" 2>/dev/null || echo "0")
if [ "$cloudflare_errors" -gt "0" ]; then
echo "WARNING: Found $cloudflare_errors Cloudflare DNS cleanup errors (stale DNS record references)"
echo "Deleting stuck challenges and orders to allow fresh start"
kubectl delete challenges --all -n cert-manager 2>/dev/null || true
kubectl delete orders --all -n cert-manager 2>/dev/null || true
needs_restart=true
else
echo "No Cloudflare DNS cleanup errors"
fi
if [ "$needs_restart" = true ]; then
echo "Restarting cert-manager to clear internal state..."
kubectl rollout restart deployment cert-manager -n cert-manager
kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
echo "Waiting for cert-manager to recreate fresh challenges..."
sleep 15
else
echo "No restart needed - cert-manager state is clean"
fi
echo ""
echo "Repair complete. Check certificate status with:"
echo " kubectl get certificates --all-namespaces"
echo " kubectl get clusterissuers"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,30 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- cert-manager.yaml
patches:
- target:
kind: Deployment
name: cert-manager
namespace: cert-manager
patch: |-
apiVersion: apps/v1
kind: Deployment
metadata:
name: cert-manager
namespace: cert-manager
spec:
template:
spec:
dnsPolicy: None
dnsConfig:
nameservers:
- "1.1.1.1"
- "8.8.8.8"
searches:
- cert-manager.svc.cluster.local
- svc.cluster.local
- cluster.local
options:
- name: ndots
value: "5"

View File

@@ -0,0 +1,19 @@
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: wildcard-wild-cloud
namespace: cert-manager
spec:
secretName: wildcard-wild-cloud-tls
dnsNames:
- "*.{{ .cloudDomain }}"
- "{{ .cloudDomain }}"
issuerRef:
name: letsencrypt-prod
kind: ClusterIssuer
duration: 2160h # 90 days
renewBefore: 360h # 15 days
privateKey:
algorithm: RSA
size: 2048