#!/bin/bash
set -e
set -o pipefail

# Initialize Wild Cloud environment
if [ -z "${WC_ROOT}" ]; then
    echo "WC_ROOT is not set." >&2
    exit 1
else
    source "${WC_ROOT}/scripts/common.sh"
    init_wild_env
fi

CLUSTER_SETUP_DIR="${WC_HOME}/setup/cluster-services"
CERT_MANAGER_DIR="${CLUSTER_SETUP_DIR}/cert-manager"

print_header "Setting up cert-manager"

#######################
#
# Dependencies
#######################

# Check Traefik dependency
print_info "Verifying Traefik is ready (required for cert-manager)..."
kubectl wait --for=condition=Available deployment/traefik -n traefik --timeout=60s 2>/dev/null || {
    print_warning "Traefik not ready, but continuing with cert-manager installation"
    print_info "Note: cert-manager may not work properly without Traefik"
}

if [ ! -d "${CERT_MANAGER_DIR}/kustomize" ]; then
    print_error "Compiled templates not found. This script should not be run directly. Run with 'wild setup cluster-services cert-manager' instead."
    exit 1
fi

# Validate DNS resolution using a temporary test pod
print_info "Validating DNS resolution for ACME challenges..."
domain=$(wild-config cluster.certManager.cloudflare.domain)
print_info "Testing DNS resolution for domain: $domain"

# Create a temporary pod with DNS utilities (in the default namespace since cert-manager doesn't exist yet)
kubectl run dns-test --image=busybox:1.35 --rm -i --restart=Never -- \
    nslookup -type=SOA "$domain" 1.1.1.1 &>/dev/null && \
    print_success "DNS resolution working for ACME challenges" || \
    print_warning "DNS resolution issues may affect ACME challenges"

########################
# Cloudflare DNS setup
########################

# API token secret setup
print_info "Reading Cloudflare API token secret..."
CLOUDFLARE_API_TOKEN=$(wild-secret cloudflare.token) || exit 1
if [ -z "$CLOUDFLARE_API_TOKEN" ]; then
    print_error "Cloudflare API token not found. Please create it with 'wild secret create cloudflare.token'."
    exit 1
fi

# Validate token
print_info "Validating Cloudflare API token permissions..."
validate_cloudflare_token() {
    local token="$1"

    if ! command -v curl &>/dev/null; then
        print_warning "curl not available, skipping token validation"
        return 0
    fi

    print_info "Testing Cloudflare API token..."
    local response
    response=$(curl -s -H "Authorization: Bearer $token" \
        "https://api.cloudflare.com/client/v4/zones")

    if echo "$response" | grep -q '"success":true'; then
        print_success "Cloudflare API token is valid and has zone access"
        return 0
    else
        print_error "Cloudflare token validation failed"
        print_info "Response: $response"
        print_info "Please ensure your token has Zone - Zone - Read permission"
        return 1
    fi
}

validate_cloudflare_token "$CLOUDFLARE_API_TOKEN" || {
    print_error "Cloudflare token validation failed. Please check token permissions."
    print_info "Required permissions: Zone - Zone - Read, Zone - DNS - Edit"
    exit 1
}

########################
# Kubernetes components
########################

print_info "Installing cert-manager components..."

# Use the stable URL for the cert-manager installation, with the legacy jetstack URL as a fallback
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml || \
    kubectl apply -f https://github.com/jetstack/cert-manager/releases/download/v1.13.1/cert-manager.yaml
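
# Optional sanity check (illustrative, not part of the upstream install steps):
# the manifest applied above is expected to register the cert-manager CRDs.
# Looking one up is a cheap way to confirm the apply took effect before the
# longer deployment waits below; guarded so it never aborts the script.
kubectl get crd certificates.cert-manager.io >/dev/null 2>&1 || \
    print_warning "cert-manager CRDs not visible yet; the waits below may take longer than usual"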
# Wait for cert-manager to be ready
print_info "Waiting for cert-manager to be ready..."
kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-cainjector -n cert-manager --timeout=120s
kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s

# Now that the cert-manager namespace exists, create the Cloudflare API token secret
print_info "Creating Cloudflare API token secret..."
kubectl create secret generic cloudflare-api-token \
    --namespace cert-manager \
    --from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \
    --dry-run=client -o yaml | kubectl apply -f -

# Ensure the webhook is fully operational
print_info "Verifying cert-manager webhook is fully operational..."
until kubectl get validatingwebhookconfigurations cert-manager-webhook &>/dev/null; do
    print_info "Waiting for cert-manager webhook to register..."
    sleep 5
done

# Sanity check before proceeding: confirm the cert-manager service account is allowed to create Certificate resources
print_info "Testing webhook connectivity..."
kubectl auth can-i create certificates.cert-manager.io --as=system:serviceaccount:cert-manager:cert-manager

# Configure cert-manager to use external DNS for challenge verification
print_info "Configuring cert-manager to use external DNS servers..."
kubectl patch deployment cert-manager -n cert-manager --patch '
spec:
  template:
    spec:
      dnsPolicy: None
      dnsConfig:
        nameservers:
          - "1.1.1.1"
          - "8.8.8.8"
        searches:
          - cert-manager.svc.cluster.local
          - svc.cluster.local
          - cluster.local
        options:
          - name: ndots
            value: "5"'

# Wait for cert-manager to restart with the new DNS config
print_info "Waiting for cert-manager to restart with new DNS configuration..."
kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s

########################
# Create issuers and certificates
########################

# Apply Let's Encrypt issuers and certificates using kustomize
print_info "Creating Let's Encrypt issuers and certificates..."
kubectl apply -k "${CERT_MANAGER_DIR}/kustomize"

# Wait for issuers to be ready
print_info "Waiting for Let's Encrypt issuers to be ready..."
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-prod --timeout=60s || print_warning "Production issuer not ready, proceeding anyway..."
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-staging --timeout=60s || print_warning "Staging issuer not ready, proceeding anyway..."

# Give cert-manager a moment to process the certificates
sleep 5
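
# For reference, a minimal sketch of the kind of ClusterIssuer the kustomize
# directory above is assumed to provide (the manifests in
# ${CERT_MANAGER_DIR}/kustomize are the source of truth; the email and the
# private key secret name below are placeholders):
#
#   apiVersion: cert-manager.io/v1
#   kind: ClusterIssuer
#   metadata:
#     name: letsencrypt-prod
#   spec:
#     acme:
#       server: https://acme-v02.api.letsencrypt.org/directory
#       email: admin@example.com                  # placeholder
#       privateKeySecretRef:
#         name: letsencrypt-prod-account-key      # placeholder
#       solvers:
#         - dns01:
#             cloudflare:
#               apiTokenSecretRef:
#                 name: cloudflare-api-token      # created earlier in this script
#                 key: api-token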
######################################
# Fix stuck certificates and cleanup
######################################

needs_restart=false

# STEP 1: Fix certificates stuck with 404 errors FIRST (before cleaning up orders)
print_info "Checking for certificates with failed issuance attempts..."
stuck_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | \
    jq -r '.items[] | select(.status.conditions[]? | select(.type=="Issuing" and .status=="False" and (.message | contains("404")))) | "\(.metadata.namespace) \(.metadata.name)"')

if [ -n "$stuck_certs" ]; then
    print_warning "Found certificates stuck with non-existent orders, recreating them..."
    echo "$stuck_certs" | while read ns name; do
        print_info "Recreating certificate $ns/$name..."

        # Get just the spec
        cert_spec=$(kubectl get certificate "$name" -n "$ns" -o json | jq '.spec')

        # Delete the certificate
        kubectl delete certificate "$name" -n "$ns"

        # Recreate it with a clean state
        echo "{\"apiVersion\":\"cert-manager.io/v1\",\"kind\":\"Certificate\",\"metadata\":{\"name\":\"$name\",\"namespace\":\"$ns\"},\"spec\":$cert_spec}" | kubectl apply -f -
    done
    needs_restart=true

    # Give cert-manager time to process the recreated certificates
    sleep 5
else
    print_success "No certificates stuck with failed orders"
fi

# STEP 2: Clean up orphaned orders (after fixing certificates)
print_info "Checking for orphaned ACME orders..."

# Check the controller logs for 404 errors
orphaned_orders=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null | \
    grep -E "failed to retrieve the ACME order.*404" 2>/dev/null | \
    sed -n 's/.*resource_name="\([^"]*\)".*/\1/p' | \
    sort -u || true)

if [ -n "$orphaned_orders" ]; then
    print_warning "Found orphaned ACME orders from logs"
    for order in $orphaned_orders; do
        print_info "Deleting orphaned order: $order"
        # Find and delete the order in whatever namespace it exists
        orders_found=$(kubectl get orders --all-namespaces 2>/dev/null | grep "$order" 2>/dev/null || true)
        if [ -n "$orders_found" ]; then
            echo "$orders_found" | while read ns name rest; do
                kubectl delete order "$name" -n "$ns" 2>/dev/null || true
            done
        fi
    done
    needs_restart=true
else
    print_success "No orphaned orders found in logs"
fi

# Check for orders in an errored state
errored_orders=$(kubectl get orders --all-namespaces -o json 2>/dev/null | \
    jq -r '.items[] | select(.status.state == "errored") | "\(.metadata.namespace) \(.metadata.name)"')

if [ -n "$errored_orders" ]; then
    print_warning "Found errored ACME orders"
    echo "$errored_orders" | while read ns name; do
        print_info "Deleting errored order: $ns/$name"
        kubectl delete order "$name" -n "$ns" 2>/dev/null || true
    done
    needs_restart=true
else
    print_success "No errored orders found"
fi
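
# Manual debugging aid (illustrative, read-only): the order and challenge state
# acted on above and below can also be inspected directly, e.g.:
#   kubectl get orders,challenges --all-namespaces
#   kubectl describe order <order-name> -n <namespace>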
# STEP 3: Clean up bad challenges
print_info "Checking for stuck ACME challenges..."

# Delete expired, invalid, or errored challenges
bad_challenges=$(kubectl get challenges --all-namespaces -o json 2>/dev/null | \
    jq -r '.items[] | select(.status.state == "expired" or .status.state == "invalid" or .status.state == "errored") | "\(.metadata.namespace) \(.metadata.name) \(.status.state)"')

if [ -n "$bad_challenges" ]; then
    print_warning "Found stuck ACME challenges"
    echo "$bad_challenges" | while read ns name state; do
        print_info "Deleting $state challenge: $ns/$name"
        kubectl delete challenge "$name" -n "$ns" 2>/dev/null || true
    done
    needs_restart=true
else
    print_success "No stuck challenges found"
fi

# Delete very old challenges (over 1 hour) - only if any exist
all_challenges=$(kubectl get challenges --all-namespaces -o json 2>/dev/null | jq '.items | length' || echo 0)
if [ "$all_challenges" -gt 0 ]; then
    old_challenges=$(kubectl get challenges --all-namespaces -o json 2>/dev/null | \
        jq -r --arg cutoff "$(date -u -d '1 hour ago' '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null || date -u -v-1H '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null)" \
        '.items[] | select(.metadata.creationTimestamp < $cutoff) | "\(.metadata.namespace) \(.metadata.name)"')

    if [ -n "$old_challenges" ]; then
        print_warning "Found old challenges (over 1 hour)"
        echo "$old_challenges" | while read ns name; do
            print_info "Deleting old challenge: $ns/$name"
            kubectl delete challenge "$name" -n "$ns" 2>/dev/null || true
        done
        needs_restart=true
    fi
fi

# STEP 4: Check for DNS record cleanup errors in the controller logs
dns_errors=$(kubectl logs -n cert-manager deployment/cert-manager --tail=50 2>/dev/null | \
    grep -c "Could not route to /client/v4/zones/dns_records" || true)
dns_errors=${dns_errors:-0}
if [ "$dns_errors" -gt 0 ]; then
    print_warning "Cert-manager has DNS record cleanup errors"
    needs_restart=true
fi

# STEP 5: Single restart if anything needs cleaning
if [ "$needs_restart" = true ]; then
    print_info "Restarting cert-manager once to clear all internal state..."
    kubectl rollout restart deployment cert-manager -n cert-manager
    kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
    # Give cert-manager time to reinitialize
    sleep 10
else
    print_success "No restart needed - cert-manager state is clean"
fi
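
# Optional follow-up check (assumption: the upstream manifests label the
# controller pod with app.kubernetes.io/name=cert-manager). Read-only and
# guarded so it never fails the script; shows whether the controller pod is
# running after any restart triggered above.
kubectl get pods -n cert-manager -l app.kubernetes.io/name=cert-manager --no-headers 2>/dev/null || true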
##################################
# Handle certificate renewal
##################################

# Check for expired or near-expiry certificates and trigger renewal
print_info "Checking certificate expiration status..."
current_date=$(date +%s)

# Track whether we found any issues
found_expired=false
found_expiring_soon=false
all_certs_valid=true

# Process certificates and collect their status
while IFS= read -r line; do
    ns=$(echo "$line" | awk '{print $1}')
    name=$(echo "$line" | awk '{print $2}')
    secret=$(echo "$line" | awk '{print $3}')
    expiry=$(echo "$line" | awk '{print $4}')

    if [ "$expiry" != "unknown" ] && [ "$expiry" != "null" ] && [ "$expiry" != "" ]; then
        # Parse the expiry timestamp (GNU date first, then BSD date as a fallback)
        expiry_ts=$(date -d "$expiry" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$expiry" +%s 2>/dev/null || echo 0)

        if [ "$expiry_ts" -gt 0 ]; then
            days_until_expiry=$(( (expiry_ts - current_date) / 86400 ))

            if [ "$days_until_expiry" -lt 0 ]; then
                print_warning "Certificate $ns/$name has EXPIRED (expired ${days_until_expiry#-} days ago)"
                if [ -n "$secret" ] && [ "$secret" != "unknown" ] && [ "$secret" != "null" ]; then
                    print_info "Deleting secret $secret to trigger renewal..."
                    kubectl delete secret "$secret" -n "$ns" 2>/dev/null || true
                    found_expired=true
                    all_certs_valid=false
                fi
            elif [ "$days_until_expiry" -lt 7 ]; then
                print_warning "Certificate $ns/$name expires in $days_until_expiry days"
                if [ "$days_until_expiry" -lt 3 ]; then
                    # Force renewal for certificates expiring very soon
                    if [ -n "$secret" ] && [ "$secret" != "unknown" ] && [ "$secret" != "null" ]; then
                        print_info "Forcing renewal by deleting secret $secret..."
                        kubectl delete secret "$secret" -n "$ns" 2>/dev/null || true
                        found_expiring_soon=true
                        all_certs_valid=false
                    fi
                else
                    print_info "Will renew automatically when closer to expiry"
                fi
            elif [ "$days_until_expiry" -lt 30 ]; then
                print_info "Certificate $ns/$name expires in $days_until_expiry days (renewal not needed yet)"
            else
                print_success "Certificate $ns/$name is valid for $days_until_expiry days"
            fi
        fi
    else
        # Certificate has no expiry yet (still being issued)
        print_info "Certificate $ns/$name is currently being issued..."
    fi
done < <(kubectl get certificates --all-namespaces -o json 2>/dev/null | jq -r '.items[] | "\(.metadata.namespace) \(.metadata.name) \(.spec.secretName) \(.status.notAfter // "unknown")"')

if [ "$all_certs_valid" = true ]; then
    print_success "All certificates are valid - no renewals needed"
fi

#########################
# Final checks
#########################

# Wait for the certificates to be issued (with a timeout)
print_info "Waiting for wildcard certificates to be ready (this may take several minutes)..."
kubectl wait --for=condition=Ready certificate wildcard-internal-wild-cloud -n cert-manager --timeout=300s || true
kubectl wait --for=condition=Ready certificate wildcard-wild-cloud -n cert-manager --timeout=300s || true

# Final health check
print_info "Performing final cert-manager health check..."
failed_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Ready" and .status!="True")) | "\(.metadata.namespace)/\(.metadata.name)"' | wc -l)

if [ "$failed_certs" -gt 0 ]; then
    print_warning "Found $failed_certs certificates not in Ready state"
    print_info "Check certificate status with: kubectl get certificates --all-namespaces"
    print_info "Check cert-manager logs with: kubectl logs -n cert-manager deployment/cert-manager"
else
    print_success "All certificates are in Ready state"
fi

print_success "cert-manager setup complete!"
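
# Post-setup inspection (illustrative, read-only commands for the operator):
#   kubectl get clusterissuers
#   kubectl get certificates -n cert-manager
#   kubectl describe certificate wildcard-wild-cloud -n cert-manager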