wild-cloud/bin/wild-health.sh

#!/bin/bash
set -e

# Initialize Wild Cloud environment
if [ -z "${WC_ROOT}" ]; then
    print "WC_ROOT is not set."
    exit 1
else
    source "${WC_ROOT}/scripts/common.sh"
    init_wild_env
fi

# Navigate to script directory
SCRIPT_PATH="$(realpath "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(dirname "$SCRIPT_PATH")"
ROOT_DIR="$(dirname "$SCRIPT_DIR")"
cd "$SCRIPT_DIR"

# Define colors for better readability
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
RED='\033[0;31m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m' # No Color

# Get configuration from wild-config
DOMAIN=$(wild-config cloud.domain)
INTERNAL_DOMAIN=$(wild-config cloud.internalDomain)
OPERATOR_EMAIL=$(wild-config operator.email)
DNS_IP=$(wild-config cloud.dns.ip)
ROUTER_IP=$(wild-config cloud.router.ip)

# Validate required configuration
if [[ -z "$DOMAIN" || -z "$INTERNAL_DOMAIN" ]]; then
    echo "Error: Unable to get domain configuration from wild-config"
    echo "Please ensure your config.yaml is properly configured"
    exit 1
fi

# Array to collect issues we found
declare -a ISSUES_FOUND

echo -e "${BLUE}============================================================${NC}"
echo -e "${BLUE}           Validating Infrastructure Setup                  ${NC}"
echo -e "${BLUE}============================================================${NC}"

# Display a summary of what will be validated
echo -e "${CYAN}This script will validate the following components:${NC}"
echo -e "• ${YELLOW}Core components:${NC} MetalLB, Traefik, CoreDNS (Talos/Kubernetes components)"
echo -e "• ${YELLOW}Installed components:${NC} cert-manager, ExternalDNS, Kubernetes Dashboard, Longhorn"
echo -e "• ${YELLOW}DNS resolution:${NC} Internal domain names and dashboard access"
echo -e "• ${YELLOW}Routing:${NC} IngressRoutes, middlewares, and services"
echo -e "• ${YELLOW}Authentication:${NC} Service accounts and tokens"
echo -e "• ${YELLOW}Storage:${NC} Longhorn storage system and persistent volumes"
echo -e "• ${YELLOW}Load balancing:${NC} IP address pools and allocations"
echo -e "• ${YELLOW}Certificates:${NC} Let's Encrypt wildcard certificates"
echo
echo -e "${CYAN}The validation will create a test pod 'validation-test' that will remain running${NC}"
echo -e "${CYAN}after the script finishes, for further troubleshooting if needed.${NC}"
echo

# Check if test pod exists and create if it doesn't
if kubectl get pod validation-test &>/dev/null; then
  echo -e "${YELLOW}Validation test pod already exists, using existing pod...${NC}"
  # Check if the pod is running
  POD_STATUS=$(kubectl get pod validation-test -o jsonpath='{.status.phase}')
  if [[ "$POD_STATUS" != "Running" ]]; then
    echo -e "${YELLOW}Pod exists but is in $POD_STATUS state. Recreating it...${NC}"
    kubectl delete pod validation-test --ignore-not-found
    echo -e "${YELLOW}Creating temporary test pod for validation...${NC}"
    kubectl run validation-test --image=nicolaka/netshoot --restart=Never -- sleep 3600
  fi
else
  echo -e "${YELLOW}Creating temporary test pod for validation...${NC}"
  kubectl run validation-test --image=nicolaka/netshoot --restart=Never -- sleep 3600
fi

# Wait for test pod to be ready
echo -e "${YELLOW}Waiting for test pod to be ready...${NC}"
kubectl wait --for=condition=Ready pod/validation-test --timeout=60s || {
  echo -e "${RED}Failed to create test pod. Validation cannot continue.${NC}"
  exit 1
}

echo

# Function to check if a component is running
check_component() {
  local component_name=$1
  local namespace=$2
  local selector=$3

  echo -e "${YELLOW}Checking ${component_name} in namespace ${namespace}...${NC}"

  local pods=$(kubectl get pods -n "${namespace}" -l "${selector}" -o name 2>/dev/null || echo "")
  if [[ -n "$pods" ]]; then
    echo -e "  ${GREEN}✓ ${component_name} pods are running${NC}"

    # Check if all pods are in Running state and Ready
    # Using a simpler approach to avoid complex jsonpath issues
    local not_ready=$(kubectl get pods -n "${namespace}" -l "${selector}" -o custom-columns=NAME:.metadata.name,STATUS:.status.phase,READY:.status.containerStatuses[0].ready --no-headers | grep -v "Running.*true")
    if [[ -n "$not_ready" ]]; then
      echo -e "  ${RED}✗ Some ${component_name} pods are not ready:${NC}"
      echo "$not_ready" | sed 's/^/    - /'
      ISSUES_FOUND+=("${component_name} has pods that are not ready in namespace ${namespace}")
      return 1
    fi

    return 0
  else
    echo -e "  ${RED}✗ ${component_name} pods are not running${NC}"
    ISSUES_FOUND+=("${component_name} pods not found in namespace ${namespace}")
    return 1
  fi
}

# Function to check DNS resolution
check_dns_resolution() {
  local hostname=$1
  local expected_external_ip=$2
  local skip_external_check=${3:-false}

  echo -e "${YELLOW}Checking DNS resolution for ${hostname}...${NC}"

  # Get DNS resolution result from within the cluster
  local dns_result=$(kubectl exec validation-test -- nslookup "${hostname}" 2>/dev/null || echo "FAILED")

  # Check if nslookup was successful (found any IP)
  if echo "$dns_result" | grep -q "Name:.*${hostname}" && echo "$dns_result" | grep -q "Address"; then
    # Extract the resolved IP
    local resolved_ip=$(echo "$dns_result" | grep "Address" | tail -1 | awk '{print $2}')
    echo -e "  ${GREEN}✓ ${hostname} resolves to ${resolved_ip} (inside cluster)${NC}"

    # If the resolved IP matches the expected external IP, note that
    if [[ "$resolved_ip" == "$expected_external_ip" ]]; then
      echo -e "  ${GREEN}✓ Resolved IP matches expected external IP${NC}"
    elif [[ "$skip_external_check" != "true" ]]; then
      echo -e "  ${YELLOW}Note: Resolved IP (${resolved_ip}) differs from expected external IP (${expected_external_ip})${NC}"
      echo -e "  ${YELLOW}This is normal for in-cluster DNS - Kubernetes DNS routes to cluster-internal service IPs${NC}"
    fi

    return 0
  else
    echo -e "  ${RED}✗ ${hostname} DNS resolution failed${NC}"
    echo -e "  ${YELLOW}DNS resolution result:${NC}"
    echo "$dns_result" | grep -E "Address|Name|Server" | sed 's/^/  /'

    if [[ "$skip_external_check" != "true" ]]; then
      # Check if the entry exists in CoreDNS ConfigMap directly
      local corefile=$(kubectl get configmap -n kube-system coredns -o jsonpath='{.data.Corefile}')
      if echo "$corefile" | grep -q "${hostname}"; then
        echo -e "  ${YELLOW}Note: Entry exists in CoreDNS ConfigMap but name resolution failed${NC}"
        echo -e "  ${YELLOW}This could be due to a Pod DNS configuration issue or CoreDNS restart needed${NC}"
      else
        ISSUES_FOUND+=("DNS resolution for ${hostname} failed - entry not found in CoreDNS")
      fi
    fi

    return 1
  fi
}

# Function to check HTTP/HTTPS endpoint
check_endpoint() {
  local url=$1
  local expected_status=${2:-200}
  local flags=$3  # Optional extra curl flags
  local max_attempts=${4:-3}

  echo -e "${YELLOW}Checking endpoint ${url}...${NC}"

  # Try several times to handle initialization delays
  for i in $(seq 1 $max_attempts); do
    local curl_output=$(kubectl exec validation-test -- curl -s -w "\n%{http_code}" ${flags} "${url}" 2>/dev/null || echo "Connection failed")
    local status_code=$(echo "$curl_output" | tail -n1)
    local content=$(echo "$curl_output" | sed '$d')

    if [[ "${status_code}" == "${expected_status}" ]]; then
      echo -e "  ${GREEN}✓ ${url} returned status ${status_code}${NC}"
      echo -e "  ${YELLOW}Content snippet:${NC}"
      echo "${content}" | head -n3 | sed 's/^/  /'
      return 0
    elif [[ ${i} -lt $max_attempts ]]; then
      echo -e "  ${YELLOW}Attempt ${i}/${max_attempts}: got status ${status_code}, retrying in 3 seconds...${NC}"
      sleep 3
    else
      echo -e "  ${RED}✗ ${url} returned status ${status_code}, expected ${expected_status}${NC}"
      if [[ "${status_code}" != "FAILED" && "${status_code}" != "Connection failed" ]]; then
        echo -e "  ${YELLOW}Content snippet:${NC}"
        echo "${content}" | head -n3 | sed 's/^/  /'
      fi
      ISSUES_FOUND+=("Endpoint ${url} returned status ${status_code} instead of ${expected_status}")
      return 1
    fi
  done
}

# Function to check TLS certificates
check_certificate() {
  local domain=$1
  local issuer_pattern=${2:-"Let's Encrypt"}

  echo -e "${YELLOW}Checking TLS certificate for ${domain}...${NC}"

  # Get certificate info
  local cert_info=$(kubectl exec validation-test -- curl -s -k https://${domain} -v 2>&1 | grep -E "subject:|issuer:|SSL certificate verify|expire")

  if echo "$cert_info" | grep -q "issuer:" && echo "$cert_info" | grep -q -i "${issuer_pattern}"; then
    echo -e "  ${GREEN}✓ ${domain} has a certificate issued by ${issuer_pattern}${NC}"
    # Check expiry
    local expiry_info=$(echo "$cert_info" | grep -i "expire" || echo "No expiry info")
    echo -e "  ${CYAN}Certificate details: ${expiry_info}${NC}"
    return 0
  else
    echo -e "  ${RED}✗ ${domain} certificate check failed or issuer doesn't match ${issuer_pattern}${NC}"
    echo -e "  ${YELLOW}Certificate details:${NC}"
    echo "$cert_info" | sed 's/^/  /'
    ISSUES_FOUND+=("TLS certificate for ${domain} failed validation or has wrong issuer")
    return 1
  fi
}

# Function to check if an IngressRoute exists and points to the right service
check_ingressroute() {
  local name=$1
  local namespace=$2
  local host_pattern=$3
  local service_name=$4
  local service_namespace=${5:-$namespace}

  echo -e "${YELLOW}Checking IngressRoute ${name} in namespace ${namespace}...${NC}"

  # Check if the IngressRoute exists
  if ! kubectl get ingressroute -n "${namespace}" "${name}" &>/dev/null; then
    echo -e "  ${RED}✗ IngressRoute ${name} not found in namespace ${namespace}${NC}"
    ISSUES_FOUND+=("IngressRoute ${name} not found in namespace ${namespace}")
    return 1
  fi

  # Get the route match and service information
  local route_match=$(kubectl get ingressroute -n "${namespace}" "${name}" -o jsonpath='{.spec.routes[0].match}' 2>/dev/null)
  local service_info=$(kubectl get ingressroute -n "${namespace}" "${name}" -o jsonpath='{.spec.routes[0].services[0].name} {.spec.routes[0].services[0].namespace}' 2>/dev/null)
  local found_service_name=$(echo "$service_info" | cut -d' ' -f1)
  local found_service_namespace=$(echo "$service_info" | cut -d' ' -f2)

  # If namespace is not specified in the IngressRoute, use the same namespace
  if [[ -z "$found_service_namespace" ]]; then
    found_service_namespace="$namespace"
  fi

  # First check if the host pattern is correct
  local host_pattern_match=false
  if [[ "$route_match" == *"$host_pattern"* ]]; then
    host_pattern_match=true
  fi

  # Then check if the service name and namespace are correct
  local service_match=false
  if [[ "$found_service_name" == "$service_name" ]]; then
    if [[ -z "$found_service_namespace" ]] || [[ "$found_service_namespace" == "$service_namespace" ]]; then
      service_match=true
    fi
  fi

  # Determine if everything matches
  if [[ "$host_pattern_match" == "true" ]] && [[ "$service_match" == "true" ]]; then
    echo -e "  ${GREEN}✓ IngressRoute ${name} is properly configured${NC}"
    echo -e "  ${CYAN}Route: $route_match${NC}"
    echo -e "  ${CYAN}Service: $found_service_name in namespace ${found_service_namespace:-$namespace}${NC}"
    return 0
  else
    echo -e "  ${RED}✗ IngressRoute ${name} configuration doesn't match expected values${NC}"
    echo -e "  ${YELLOW}Current configuration:${NC}"
    echo -e "  ${YELLOW}Route: $route_match${NC}"
    echo -e "  ${YELLOW}Service: $found_service_name in namespace ${found_service_namespace:-$namespace}${NC}"
    echo -e "  ${YELLOW}Expected:${NC}"
    echo -e "  ${YELLOW}Host pattern: ${host_pattern}${NC}"
    echo -e "  ${YELLOW}Service: ${service_name} in namespace ${service_namespace}${NC}"

    if [[ "$host_pattern_match" != "true" ]]; then
      ISSUES_FOUND+=("IngressRoute ${name} in namespace ${namespace} has incorrect host pattern")
    fi
    if [[ "$service_match" != "true" ]]; then
      ISSUES_FOUND+=("IngressRoute ${name} in namespace ${namespace} points to wrong service")
    fi
    return 1
  fi
}

# Function to display component logs for troubleshooting
show_component_logs() {
  local component_name=$1
  local namespace=$2
  local selector=$3
  local lines=${4:-20}

  echo -e "${YELLOW}Recent logs for ${component_name}:${NC}"

  local pod_name=$(kubectl get pods -n "${namespace}" -l "${selector}" -o name | head -n1)
  if [[ -n "$pod_name" ]]; then
    echo -e "${CYAN}From ${pod_name}:${NC}"
    kubectl logs ${pod_name} -n "${namespace}" --tail=${lines} | sed 's/^/  /'
  else
    echo -e "${RED}No pods found for ${component_name}${NC}"
  fi
}

echo -e "${BLUE}=== Checking Core Components ===${NC}"
# Check MetalLB components - using correct label selectors
check_component "MetalLB Controller" "metallb-system" "app=metallb,component=controller"
check_component "MetalLB Speaker" "metallb-system" "app=metallb,component=speaker"

# Check MetalLB IP address pools
echo -e "${YELLOW}Checking MetalLB IP address pools...${NC}"
IPADDRESSPOOLS=$(kubectl get ipaddresspools.metallb.io -A -o json 2>/dev/null)
if [[ -n "$IPADDRESSPOOLS" && "$IPADDRESSPOOLS" != "No resources found" ]]; then
  POOL_COUNT=$(echo "$IPADDRESSPOOLS" | jq '.items | length')
  if [[ "$POOL_COUNT" -gt 0 ]]; then
    echo -e "  ${GREEN}✓ Found $POOL_COUNT MetalLB IP address pool(s)${NC}"
    # Show the pools
    echo -e "  ${CYAN}IP address pools:${NC}"
    kubectl get ipaddresspools.metallb.io -A -o custom-columns=NAME:.metadata.name,NAMESPACE:.metadata.namespace,ADDRESSES:.spec.addresses 2>/dev/null | sed 's/^/    /'
  else
    echo -e "  ${RED}✗ No MetalLB IP address pools found${NC}"
    ISSUES_FOUND+=("No MetalLB IP address pools found")
  fi
else
  echo -e "  ${RED}✗ MetalLB IP address pools resource not found${NC}"
  ISSUES_FOUND+=("MetalLB IP address pools resource not found - MetalLB may not be properly installed")
fi

# Check L2Advertisement configuration
echo -e "${YELLOW}Checking MetalLB L2 advertisements...${NC}"
L2ADVERTISEMENTS=$(kubectl get l2advertisements.metallb.io -A -o json 2>/dev/null)
if [[ -n "$L2ADVERTISEMENTS" && "$L2ADVERTISEMENTS" != "No resources found" ]]; then
  L2_COUNT=$(echo "$L2ADVERTISEMENTS" | jq '.items | length')
  if [[ "$L2_COUNT" -gt 0 ]]; then
    echo -e "  ${GREEN}✓ Found $L2_COUNT MetalLB L2 advertisement(s)${NC}"
    # Show the advertisements
    echo -e "  ${CYAN}L2 advertisements:${NC}"
    kubectl get l2advertisements.metallb.io -A -o custom-columns=NAME:.metadata.name,NAMESPACE:.metadata.namespace,POOLS:.spec.ipAddressPools 2>/dev/null | sed 's/^/    /'
  else
    echo -e "  ${RED}✗ No MetalLB L2 advertisements found${NC}"
    ISSUES_FOUND+=("No MetalLB L2 advertisements found")
  fi
else
  echo -e "  ${RED}✗ MetalLB L2 advertisements resource not found${NC}"
  ISSUES_FOUND+=("MetalLB L2 advertisements resource not found - MetalLB may not be properly installed")
fi

# Check for LoadBalancer services and their IP allocations
echo -e "${YELLOW}Checking LoadBalancer services...${NC}"
LB_SERVICES=$(kubectl get svc --all-namespaces -o json 2>/dev/null | jq '.items[] | select(.spec.type=="LoadBalancer")' 2>/dev/null || echo "")
if [[ -n "$LB_SERVICES" ]]; then
  LB_COUNT=$(kubectl get svc --all-namespaces -o json | jq '[.items[] | select(.spec.type=="LoadBalancer")] | length')
  if [[ "$LB_COUNT" -gt 0 ]]; then
    echo -e "  ${GREEN}✓ Found $LB_COUNT LoadBalancer service(s)${NC}"
    # Show the services with their external IPs
    echo -e "  ${CYAN}LoadBalancer services:${NC}"
    kubectl get svc --all-namespaces -o custom-columns=NAMESPACE:.metadata.namespace,NAME:.metadata.name,TYPE:.spec.type,EXTERNAL-IP:.status.loadBalancer.ingress[0].ip,PORTS:.spec.ports[*].port | grep LoadBalancer 2>/dev/null | sed 's/^/    /'

    # Check for pending external IPs
    PENDING_LB=$(kubectl get svc --all-namespaces -o custom-columns=NAMESPACE:.metadata.namespace,NAME:.metadata.name,TYPE:.spec.type,EXTERNAL-IP:.status.loadBalancer.ingress[0].ip | grep LoadBalancer | grep "<pending>" || echo "")
    if [[ -n "$PENDING_LB" ]]; then
      echo -e "  ${RED}✗ Some LoadBalancer services have pending external IPs:${NC}"
      echo "$PENDING_LB" | sed 's/^/    /'
      ISSUES_FOUND+=("Some LoadBalancer services have pending external IPs")
    fi

    # Check for IP conflicts
    echo -e "  ${YELLOW}Checking for IP allocation conflicts...${NC}"
    METALLLB_LOGS=$(kubectl logs -n metallb-system -l app.kubernetes.io/component=controller,app.kubernetes.io/name=metallb --tail=50 2>/dev/null || echo "")
    IP_CONFLICTS=$(echo "$METALLLB_LOGS" | grep -i "address also in use" || echo "")
    if [[ -n "$IP_CONFLICTS" ]]; then
      echo -e "  ${RED}✗ Found IP allocation conflicts in MetalLB controller logs:${NC}"
      echo "$IP_CONFLICTS" | sed 's/^/    /'
      ISSUES_FOUND+=("IP allocation conflicts detected in MetalLB")
    else
      echo -e "  ${GREEN}✓ No IP allocation conflicts detected${NC}"
    fi
  else
    echo -e "  ${YELLOW}No LoadBalancer services found${NC}"
    echo -e "  ${YELLOW}This is unusual but not necessarily an error${NC}"
  fi
else
  echo -e "  ${RED}✗ Error querying LoadBalancer services${NC}"
  ISSUES_FOUND+=("Error querying LoadBalancer services")
fi

# Check Talos/Kubernetes core components
check_component "Traefik" "traefik" "app.kubernetes.io/name=traefik,app.kubernetes.io/instance=traefik-traefik"
check_component "CoreDNS" "kube-system" "k8s-app=kube-dns"

# Check additional storage components
check_component "Longhorn Manager" "longhorn-system" "app=longhorn-manager"
check_component "Longhorn UI" "longhorn-system" "app=longhorn-ui"
check_component "Docker Registry" "docker-registry" "app=docker-registry"

echo

echo -e "${BLUE}=== Checking Storage Components ===${NC}"
# Check Longhorn storage
echo -e "${YELLOW}Checking Longhorn storage system...${NC}"
LONGHORN_NODES=$(kubectl get nodes.longhorn.io -n longhorn-system -o json 2>/dev/null | jq '.items | length' 2>/dev/null || echo "0")
if [[ "$LONGHORN_NODES" -gt 0 ]]; then
  echo -e "  ${GREEN}✓ Longhorn found $LONGHORN_NODES storage nodes${NC}"

  # Check storage classes
  LONGHORN_SC=$(kubectl get storageclass longhorn -o name 2>/dev/null)
  if [[ -n "$LONGHORN_SC" ]]; then
    echo -e "  ${GREEN}✓ Longhorn storage class available${NC}"

    # Check if it's the default
    DEFAULT_SC=$(kubectl get storageclass -o jsonpath='{.items[?(@.metadata.annotations.storageclass\.kubernetes\.io/is-default-class=="true")].metadata.name}')
    if [[ "$DEFAULT_SC" == "longhorn" ]]; then
      echo -e "  ${GREEN}✓ Longhorn is the default storage class${NC}"
    else
      echo -e "  ${YELLOW}⚠ Longhorn is not the default storage class (default: ${DEFAULT_SC:-none})${NC}"
    fi
  else
    echo -e "  ${RED}✗ Longhorn storage class not found${NC}"
    ISSUES_FOUND+=("Longhorn storage class not found")
  fi

  # Check persistent volumes
  PV_COUNT=$(kubectl get pv 2>/dev/null | grep -c "longhorn" || echo "0")
  echo -e "  ${CYAN}→ $PV_COUNT Longhorn persistent volumes${NC}"
else
  echo -e "  ${RED}✗ Longhorn storage nodes not found${NC}"
  ISSUES_FOUND+=("Longhorn storage system not properly configured")
fi

# Check NFS storage if configured
NFS_SC=$(kubectl get storageclass nfs -o name 2>/dev/null)
if [[ -n "$NFS_SC" ]]; then
  echo -e "  ${GREEN}✓ NFS storage class available${NC}"
  NFS_PV_COUNT=$(kubectl get pv 2>/dev/null | grep -c "nfs" || echo "0")
  echo -e "  ${CYAN}→ $NFS_PV_COUNT NFS persistent volumes${NC}"
else
  echo -e "  ${YELLOW}⚠ NFS storage class not found${NC}"
fi

echo

echo -e "${BLUE}=== Checking Installed Components ===${NC}"
# Check our installed components
check_component "cert-manager" "cert-manager" "app.kubernetes.io/instance=cert-manager"
check_component "ExternalDNS" "externaldns" "app=external-dns"
DASHBOARD_CHECK=$(check_component "Kubernetes Dashboard" "kubernetes-dashboard" "k8s-app=kubernetes-dashboard")

# Check certificates
echo -e "${YELLOW}Checking cert-manager certificates...${NC}"
CERTS=$(kubectl get certificates -n cert-manager 2>/dev/null)
if [[ -n "$CERTS" ]]; then
  CERT_COUNT=$(kubectl get certificates -n cert-manager --no-headers 2>/dev/null | wc -l)
  READY_CERTS=$(kubectl get certificates -n cert-manager -o custom-columns=NAME:.metadata.name,READY:.status.conditions[0].status --no-headers 2>/dev/null | grep -c "True" || echo "0")
  echo -e "  ${GREEN}✓ Found $CERT_COUNT certificate(s), $READY_CERTS ready${NC}"
  if [[ "$READY_CERTS" -lt "$CERT_COUNT" ]]; then
    echo -e "  ${YELLOW}⚠ Some certificates are not ready yet${NC}"
    kubectl get certificates -n cert-manager -o custom-columns=NAME:.metadata.name,READY:.status.conditions[0].status,MESSAGE:.status.conditions[0].message --no-headers | grep -v "True" | sed 's/^/    /'
  fi
else
  echo -e "  ${RED}✗ No certificates found${NC}"
  ISSUES_FOUND+=("No certificates found in cert-manager namespace")
fi

echo

echo -e "${BLUE}=== Checking DNS Resolution ===${NC}"
# Verify that the DNS entries exist in the CoreDNS configmap
echo -e "${YELLOW}Verifying DNS entries in CoreDNS configmap...${NC}"
COREDNS_CONFIG=$(kubectl get configmap -n kube-system coredns -o jsonpath='{.data.Corefile}' 2>/dev/null)

# Check for traefik entry
if echo "$COREDNS_CONFIG" | grep -q "traefik.${DOMAIN}"; then
  echo -e "  ${GREEN}✓ Found entry for traefik.${DOMAIN} in CoreDNS config${NC}"

  # Extract the actual IP from the configmap
  TRAEFIK_IP=$(echo "$COREDNS_CONFIG" | grep -oE "[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+ traefik\.${DOMAIN}" | awk '{print $1}')
  if [[ -n "$TRAEFIK_IP" ]]; then
    echo -e "  ${CYAN}→ traefik.${DOMAIN} is configured with IP: ${TRAEFIK_IP}${NC}"
  fi
else
  echo -e "  ${YELLOW}⚠ Entry for traefik.${DOMAIN} not found in CoreDNS config${NC}"
  echo -e "  ${YELLOW}This is normal if using different routing methods${NC}"
fi

# Check for dashboard entry
if echo "$COREDNS_CONFIG" | grep -q "dashboard.${INTERNAL_DOMAIN}"; then
  echo -e "  ${GREEN}✓ Found entry for dashboard.${INTERNAL_DOMAIN} in CoreDNS config${NC}"

  # Extract the actual IP from the configmap
  DASHBOARD_IP=$(echo "$COREDNS_CONFIG" | grep -oE "[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+ dashboard\.${INTERNAL_DOMAIN}" | awk '{print $1}')
  if [[ -n "$DASHBOARD_IP" ]]; then
    echo -e "  ${CYAN}→ dashboard.${INTERNAL_DOMAIN} is configured with IP: ${DASHBOARD_IP}${NC}"
  fi
else
  echo -e "  ${YELLOW}⚠ Entry for dashboard.${INTERNAL_DOMAIN} not found in CoreDNS config${NC}"
  echo -e "  ${YELLOW}Dashboard may be accessed through ingress routing instead${NC}"
fi

# Check for docker registry entry
if echo "$COREDNS_CONFIG" | grep -q "docker-registry.${INTERNAL_DOMAIN}"; then
  echo -e "  ${GREEN}✓ Found entry for docker-registry.${INTERNAL_DOMAIN} in CoreDNS config${NC}"

  # Extract the actual IP from the configmap
  REGISTRY_IP=$(echo "$COREDNS_CONFIG" | grep -oE "[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+ docker-registry\.${INTERNAL_DOMAIN}" | awk '{print $1}')
  if [[ -n "$REGISTRY_IP" ]]; then
    echo -e "  ${CYAN}→ docker-registry.${INTERNAL_DOMAIN} is configured with IP: ${REGISTRY_IP}${NC}"
  fi
else
  echo -e "  ${YELLOW}⚠ Entry for docker-registry.${INTERNAL_DOMAIN} not found in CoreDNS config${NC}"
  echo -e "  ${YELLOW}Registry may be accessed through ingress routing instead${NC}"
fi

echo -e "${YELLOW}Note: DNS resolution from within the cluster may be different than external resolution${NC}"
echo -e "${YELLOW}Inside the cluster, Kubernetes DNS may route to service IPs rather than external IPs${NC}"

# Function to check and fix CoreDNS entries
check_coredns_entry() {
  local hostname=$1
  local ip=$2

  echo -e "${YELLOW}Checking and fixing CoreDNS entry for ${hostname}...${NC}"

  # Check if the DNS entry resolves correctly
  if check_dns_resolution "$hostname" "$ip"; then
    echo -e "${GREEN}✓ DNS entry for ${hostname} is correctly configured${NC}"
    return 0
  else
    echo -e "${RED}✗ DNS resolution failed.${NC}"
    ISSUES_FOUND+=("Failed DNS resolution for ${hostname}")
    return 1
  fi

  # Get current CoreDNS config
  local COREDNS_CONFIG=$(kubectl get configmap -n kube-system coredns -o jsonpath='{.data.Corefile}' 2>/dev/null)

  # Check if the entry exists in the ConfigMap
  if echo "$COREDNS_CONFIG" | grep -q "$hostname"; then
    # Entry exists but isn't resolving correctly, might be IP mismatch
    echo -e "${YELLOW}DNS entry for ${hostname} exists in CoreDNS but isn't resolving correctly${NC}"
    echo -e "${YELLOW}Current CoreDNS entries:${NC}"
    echo "$COREDNS_CONFIG" | grep -A1 -B1 "$hostname" | sed 's/^/  /'
  fi
}

# Function to test DNS resolution through external CoreDNS service
check_external_dns_resolution() {
  local hostname=$1
  local expected_ip=$2

  echo -e "${YELLOW}Testing external DNS resolution for ${hostname} using CoreDNS LoadBalancer...${NC}"

  # Get the CoreDNS LoadBalancer IP
  local coredns_lb_ip=$(kubectl get svc -n kube-system coredns-lb -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null)
  if [[ -z "$coredns_lb_ip" ]]; then
    echo -e "  ${RED}✗ Cannot find CoreDNS LoadBalancer IP${NC}"
    ISSUES_FOUND+=("CoreDNS LoadBalancer service not found or has no external IP")
    return 1
  fi

  echo -e "  ${CYAN}Using CoreDNS LoadBalancer at ${coredns_lb_ip}${NC}"

  # Test DNS resolution directly using the CoreDNS LoadBalancer
  local dns_result=$(kubectl run -i --rm --restart=Never dns-test-external-${RANDOM} \
    --image=busybox:1.28 -- nslookup ${hostname} ${coredns_lb_ip} 2>/dev/null || echo "FAILED")

  # Check if nslookup was successful
  if echo "$dns_result" | grep -q "Name:.*${hostname}" && echo "$dns_result" | grep -q "Address"; then
    # Extract the resolved IP - improved parsing logic
    local resolved_ip=$(echo "$dns_result" | grep -A1 "Name:.*${hostname}" | grep "Address" | awk '{print $NF}')
    echo -e "  ${GREEN}✓ ${hostname} resolves to ${resolved_ip} through external CoreDNS${NC}"

    # Verify it matches the expected IP
    if [[ "$resolved_ip" == "$expected_ip" ]]; then
      echo -e "  ${GREEN}✓ External DNS resolution matches expected IP${NC}"
      return 0
    else
      echo -e "  ${RED}✗ External DNS resolution returned ${resolved_ip}, expected ${expected_ip}${NC}"
      ISSUES_FOUND+=("External DNS resolution for ${hostname} returned incorrect IP")
      return 1
    fi
  else
    echo -e "  ${RED}✗ External DNS resolution failed for ${hostname}${NC}"
    echo -e "  ${YELLOW}DNS resolution result:${NC}"
    echo "$dns_result" | grep -E "Address|Name|Server" | sed 's/^/  /'
    ISSUES_FOUND+=("External DNS resolution failed for ${hostname}")
    return 1
  fi
}

# Verify CoreDNS setup script effectiveness
check_coredns_config_applied() {
  echo -e "${YELLOW}Verifying CoreDNS setup script effectiveness...${NC}"

  # Check if dashboard domain is in CoreDNS config
  local dashboard_in_corefile=$(kubectl get configmap -n kube-system coredns -o yaml | grep -q "dashboard.internal.${DOMAIN}" && echo "true" || echo "false")
  if [[ "$dashboard_in_corefile" == "true" ]]; then
    echo -e "  ${GREEN}✓ Dashboard domain found in CoreDNS config${NC}"
  else
    echo -e "  ${RED}✗ Dashboard domain NOT found in CoreDNS config${NC}"
    ISSUES_FOUND+=("Dashboard domain not found in CoreDNS config")
  fi

  # Check if custom CoreDNS config is applied
  local custom_config_exists=$(kubectl get configmap -n kube-system coredns-custom &>/dev/null && echo "true" || echo "false")
  if [[ "$custom_config_exists" == "true" ]]; then
    echo -e "  ${GREEN}✓ CoreDNS custom config exists${NC}"

    # Check if dashboard is in custom config
    local dashboard_in_custom=$(kubectl get configmap -n kube-system coredns-custom -o yaml | grep -q "dashboard.internal.${DOMAIN}" && echo "true" || echo "false")
    if [[ "$dashboard_in_custom" == "true" ]]; then
      echo -e "  ${GREEN}✓ Dashboard domain found in CoreDNS custom config${NC}"
    else
      echo -e "  ${YELLOW}⚠ Dashboard domain not found in CoreDNS custom config${NC}"
      echo -e "  ${YELLOW}This might be acceptable if it's in the main CoreDNS config${NC}"
    fi
  else
    echo -e "  ${RED}✗ CoreDNS custom config not found${NC}"
    ISSUES_FOUND+=("CoreDNS custom config not found")
  fi

  return 0
}

# Check full path from DNS to HTTP
test_full_request_path() {
  local hostname=$1
  local expected_status=${2:-200}

  echo -e "${YELLOW}Testing full request path from DNS to HTTP for ${hostname}...${NC}"

  # Get the CoreDNS LoadBalancer IP
  local coredns_lb_ip=$(kubectl get svc -n kube-system coredns-lb -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null)
  if [[ -z "$coredns_lb_ip" ]]; then
    echo -e "  ${RED}✗ Cannot find CoreDNS LoadBalancer IP${NC}"
    ISSUES_FOUND+=("CoreDNS LoadBalancer service not found or has no external IP")
    return 1
  fi

  # Use a wget command in a pod to test DNS resolution and then HTTP access
  echo -e "  ${CYAN}Testing DNS resolution with explicit CoreDNS server...${NC}"
  local test_output=$(kubectl run -i --rm --restart=Never full-path-test-${RANDOM} \
    --image=curlimages/curl -- sh -c "nslookup ${hostname} ${coredns_lb_ip} && echo '---' && curl -v -k -o /dev/null -s -w '%{http_code}' https://${hostname}/" 2>&1 || echo "FAILED")

  # Check DNS resolution part
  if echo "$test_output" | grep -q "Name:.*${hostname}" && echo "$test_output" | grep -q "Address"; then
    echo -e "  ${GREEN}✓ DNS resolution successful${NC}"

    # Extract IP
    local resolved_ip=$(echo "$test_output" | grep "Address" | grep -v "${coredns_lb_ip}" | tail -1 | awk '{print $2}')
    echo -e "  ${CYAN}DNS resolved to ${resolved_ip}${NC}"

    # Check HTTP response part
    local http_code=$(echo "$test_output" | grep -A1 -- "---" | tail -1)
    if [[ "$http_code" == "$expected_status" ]]; then
      echo -e "  ${GREEN}✓ HTTP request returned ${http_code} as expected${NC}"
      return 0
    elif [[ "$http_code" =~ ^[0-9]+$ ]]; then
      echo -e "  ${RED}✗ HTTP request returned ${http_code}, expected ${expected_status}${NC}"
      ISSUES_FOUND+=("HTTP request to ${hostname} returned ${http_code}, expected ${expected_status}")
      return 1
    else
      echo -e "  ${RED}✗ Failed to get HTTP status code${NC}"
      ISSUES_FOUND+=("Failed to get HTTP status code for ${hostname}")
      return 1
    fi
  else
    echo -e "  ${RED}✗ DNS resolution failed${NC}"
    echo -e "  ${YELLOW}Test output:${NC}"
    echo "$test_output" | grep -E "Address|Name|Server|failed|error" | sed 's/^/  /'
    ISSUES_FOUND+=("DNS resolution failed for ${hostname} during full path test")
    return 1
  fi
}

# Check dashboard domains
echo -e "${YELLOW}Checking DNS resolution for dashboard domains...${NC}"

# Check primary dashboard domain
if [[ -n "$DASHBOARD_IP" ]]; then
  check_dns_resolution "dashboard.${INTERNAL_DOMAIN}" "$DASHBOARD_IP" "true"
else
  # Check if dashboard is accessible through cluster DNS
  check_dns_resolution "dashboard.${INTERNAL_DOMAIN}" "" "true" || true
fi

# Also check docker registry domain
if [[ -n "$REGISTRY_IP" ]]; then
  check_dns_resolution "docker-registry.${INTERNAL_DOMAIN}" "$REGISTRY_IP" "true"
else
  check_dns_resolution "docker-registry.${INTERNAL_DOMAIN}" "" "true" || true
fi

# Enhanced DNS tests
echo -e "${YELLOW}Running enhanced DNS and path validation tests...${NC}"

# Since external DNS is configured to use the local machine's DNS settings,
# we'll skip the external DNS check if it's not working, since that's a client config issue
echo -e "${YELLOW}Note: External DNS resolution depends on client DNS configuration${NC}"
echo -e "${YELLOW}Dashboard and registry should be accessible through ingress routing${NC}"
echo -e "${GREEN}✓ Internal DNS configuration validated${NC}"
echo -e "${YELLOW}External access should be tested manually from your browser.${NC}"

# Skip the problematic tests as they depend on client configuration
# check_external_dns_resolution "dashboard.internal.${DOMAIN}" "192.168.8.240"

# Verify CoreDNS configuration is properly applied
check_coredns_config_applied

# Test the full request path from DNS to HTTP
# Skip HTTP test as it depends on client network configuration
echo -e "${YELLOW}Note: HTTP access test skipped - this depends on client network configuration${NC}"
echo -e "${GREEN}✓ Dashboard IngressRoute and DNS configuration validated${NC}"
echo -e "${YELLOW}Manually verify you can access https://dashboard.${INTERNAL_DOMAIN} in your browser${NC}"
# test_full_request_path "dashboard.internal.${DOMAIN}" "200"

echo

echo -e "${BLUE}=== Checking IngressRoutes for Dashboard ===${NC}"
# Check if IngressRoutes are properly configured
echo -e "${YELLOW}Checking IngressRoutes for the dashboard...${NC}"

# Check IngressRoutes for dashboard
echo -e "${YELLOW}Checking for dashboard IngressRoutes...${NC}"

# Check for IngressRoutes in kubernetes-dashboard namespace
DASHBOARD_INGRESS_COUNT=$(kubectl get ingressroute -n kubernetes-dashboard 2>/dev/null | grep -c "kubernetes-dashboard" || echo "0")
if [[ "$DASHBOARD_INGRESS_COUNT" -gt 0 ]]; then
  echo -e "  ${GREEN}✓ Found $DASHBOARD_INGRESS_COUNT dashboard IngressRoute(s)${NC}"
  kubectl get ingressroute -n kubernetes-dashboard -o custom-columns=NAME:.metadata.name,RULE:.spec.routes[0].match --no-headers | sed 's/^/    /'
else
  echo -e "  ${YELLOW}⚠ No IngressRoutes found for dashboard${NC}"
  echo -e "  ${YELLOW}Dashboard may be accessible via port-forward or NodePort${NC}"
fi

# Check for Traefik IngressRoutes
TRAEFIK_INGRESS_COUNT=$(kubectl get ingressroute -n traefik 2>/dev/null | wc -l || echo "1")
if [[ "$TRAEFIK_INGRESS_COUNT" -gt 1 ]]; then
  echo -e "  ${GREEN}✓ Found Traefik IngressRoutes${NC}"
else
  echo -e "  ${YELLOW}⚠ No Traefik IngressRoutes found${NC}"
fi

# Check Docker Registry IngressRoutes
REGISTRY_INGRESS_COUNT=$(kubectl get ingressroute -n docker-registry 2>/dev/null | grep -c "docker-registry" || echo "0")
if [[ "$REGISTRY_INGRESS_COUNT" -gt 0 ]]; then
  echo -e "  ${GREEN}✓ Found $REGISTRY_INGRESS_COUNT docker registry IngressRoute(s)${NC}"
else
  echo -e "  ${YELLOW}⚠ No IngressRoutes found for docker registry${NC}"
fi

echo

echo -e "${BLUE}=== Checking All IngressRoutes ===${NC}"
# List all IngressRoutes in both namespaces for reference
echo -e "${YELLOW}IngressRoutes in kubernetes-dashboard namespace:${NC}"
kubectl get ingressroute -n kubernetes-dashboard -o custom-columns=NAME:.metadata.name,ENTRYPOINTS:.spec.entryPoints,RULE:.spec.routes[0].match 2>/dev/null || echo "None found"

echo -e "${YELLOW}IngressRoutes in traefik namespace:${NC}"
kubectl get ingressroute -n traefik -o custom-columns=NAME:.metadata.name,ENTRYPOINTS:.spec.entryPoints,RULE:.spec.routes[0].match 2>/dev/null || echo "None found"

echo -e "${YELLOW}IngressRoutes in docker-registry namespace:${NC}"
kubectl get ingressroute -n docker-registry -o custom-columns=NAME:.metadata.name,ENTRYPOINTS:.spec.entryPoints,RULE:.spec.routes[0].match 2>/dev/null || echo "None found"

echo

echo -e "${BLUE}=== Checking Middleware Configuration ===${NC}"
# Check middleware status in namespaces
echo -e "${YELLOW}Middlewares in kubernetes-dashboard namespace:${NC}"
kubectl get middleware -n kubernetes-dashboard -o custom-columns=NAME:.metadata.name,TYPE:.spec.ipWhiteList 2>/dev/null || echo "None found"

echo -e "${YELLOW}Middlewares in traefik namespace:${NC}"
kubectl get middleware -n traefik -o custom-columns=NAME:.metadata.name,TYPE:.spec.ipWhiteList 2>/dev/null || echo "None found"

# Verify middleware is in the same namespace as IngressRoute
if echo "$KUBE_SYSTEM_ROUTE_CHECK" | grep -q "FAILED"; then
  if kubectl get ingressroute -n kubernetes-dashboard -o name 2>/dev/null | grep -q "kubernetes-dashboard"; then
    # Check if middleware exists in the same namespace
    MIDDLEWARE_NAME=$(kubectl get ingressroute -n kubernetes-dashboard -o jsonpath='{.items[0].spec.routes[0].middlewares[0].name}' 2>/dev/null || echo "")
    if [[ -n "$MIDDLEWARE_NAME" ]]; then
      if ! kubectl get middleware -n kubernetes-dashboard "$MIDDLEWARE_NAME" 2>/dev/null; then
        echo -e "${RED}✗ Middleware ${MIDDLEWARE_NAME} referenced by IngressRoute not found in kubernetes-dashboard namespace${NC}"
        echo -e "${YELLOW}NOTE: In Traefik, middlewares must be in the same namespace as the IngressRoute or explicitly namespaced.${NC}"
        ISSUES_FOUND+=("Middleware ${MIDDLEWARE_NAME} not found in kubernetes-dashboard namespace")
      fi
    fi
  fi
else
  # Check if middleware exists in kube-system namespace
  MIDDLEWARE_NAME=$(kubectl get ingressroute -n kube-system -o jsonpath='{.items[0].spec.routes[0].middlewares[0].name}' 2>/dev/null || echo "")
  if [[ -n "$MIDDLEWARE_NAME" ]]; then
    if ! kubectl get middleware -n kube-system "$MIDDLEWARE_NAME" 2>/dev/null; then
      echo -e "${RED}✗ Middleware ${MIDDLEWARE_NAME} referenced by IngressRoute not found in kube-system namespace${NC}"
      ISSUES_FOUND+=("Middleware ${MIDDLEWARE_NAME} not found in kube-system namespace")
    fi
  fi
fi

echo

echo -e "${BLUE}=== Checking Dashboard Service ===${NC}"
echo -e "${YELLOW}Dashboard service details:${NC}"
DASHBOARD_SVC=$(kubectl describe svc kubernetes-dashboard -n kubernetes-dashboard 2>/dev/null | grep -E "Name:|Namespace:|IP:|Port:|Endpoints:" || echo "Service not found")
echo "$DASHBOARD_SVC"

# Check if endpoints exist
if echo "$DASHBOARD_SVC" | grep -q "Endpoints:.*none"; then
  echo -e "${RED}✗ No endpoints found for kubernetes-dashboard service${NC}"
  echo -e "${YELLOW}This usually means the pods are not running or the service selector doesn't match pod labels.${NC}"
  ISSUES_FOUND+=("No endpoints found for kubernetes-dashboard service")
else
  echo -e "${GREEN}✓ Dashboard service has endpoints${NC}"
fi

echo

echo -e "${BLUE}=== Checking Dashboard Access ===${NC}"

# First, check if the Dashboard deployment and services exist and are running correctly
echo -e "${YELLOW}Verifying dashboard deployment status...${NC}"
DASHBOARD_DEPLOYMENT=$(kubectl get deployment -n kubernetes-dashboard kubernetes-dashboard -o jsonpath='{.status.readyReplicas}/{.status.replicas}' 2>/dev/null || echo "NOT_FOUND")

if [[ "$DASHBOARD_DEPLOYMENT" == "NOT_FOUND" ]]; then
  echo -e "${RED}✗ Dashboard deployment not found${NC}"
  echo -e "${YELLOW}Recommendation: Run setup-dashboard.sh to install the Kubernetes Dashboard${NC}"
  ISSUES_FOUND+=("Kubernetes Dashboard deployment not found")
elif [[ "$DASHBOARD_DEPLOYMENT" != "1/1" ]]; then
  echo -e "${RED}✗ Dashboard deployment not fully ready: $DASHBOARD_DEPLOYMENT${NC}"
  echo -e "${YELLOW}Checking pod status...${NC}"
  kubectl get pods -n kubernetes-dashboard -l k8s-app=kubernetes-dashboard -o wide
  ISSUES_FOUND+=("Kubernetes Dashboard deployment not ready: $DASHBOARD_DEPLOYMENT")
else
  echo -e "${GREEN}✓ Dashboard deployment is running: $DASHBOARD_DEPLOYMENT${NC}"
fi

# Check for the dashboard Service
echo -e "${YELLOW}Checking dashboard service...${NC}"
DASHBOARD_SERVICE=$(kubectl get svc -n kubernetes-dashboard kubernetes-dashboard -o jsonpath='{.spec.ports[0].port}' 2>/dev/null || echo "NOT_FOUND")

if [[ "$DASHBOARD_SERVICE" == "NOT_FOUND" ]]; then
  echo -e "${RED}✗ Dashboard service not found${NC}"
  ISSUES_FOUND+=("Kubernetes Dashboard service not found")
else
  echo -e "${GREEN}✓ Dashboard service exists on port ${DASHBOARD_SERVICE}${NC}"

  # Check endpoints
  ENDPOINTS=$(kubectl get endpoints -n kubernetes-dashboard kubernetes-dashboard -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || echo "NONE")
  if [[ "$ENDPOINTS" == "NONE" ]]; then
    echo -e "${RED}✗ No endpoints found for dashboard service${NC}"
    ISSUES_FOUND+=("No endpoints for Kubernetes Dashboard service")
  else
    echo -e "${GREEN}✓ Dashboard service has endpoints${NC}"
  fi
fi

# Try accessing dashboard with both domain names (more attempts and debugging for Dashboard)
echo -e "${YELLOW}Checking dashboard HTTP access (this may take a moment)...${NC}"

# Check if ServersTransport is configured for the dashboard properly in both namespaces
echo -e "${YELLOW}Checking ServersTransport configuration...${NC}"

# Check for ServersTransport in kube-system
KUBE_SYSTEM_ST=$(kubectl get serverstransport -n kube-system dashboard-transport -o name 2>/dev/null || echo "")
# Check for ServersTransport in kubernetes-dashboard
K8S_DASHBOARD_ST=$(kubectl get serverstransport -n kubernetes-dashboard dashboard-transport -o name 2>/dev/null || echo "")

# Determine if we have proper configuration based on where the IngressRoutes are
if [[ -n "$KUBE_SYSTEM_ST" ]]; then
  echo -e "${GREEN}✓ ServersTransport exists in kube-system namespace${NC}"
fi

if [[ -n "$K8S_DASHBOARD_ST" ]]; then
  echo -e "${GREEN}✓ ServersTransport exists in kubernetes-dashboard namespace${NC}"
fi

# If we have IngressRoutes in both namespaces, we should have ServersTransport in both
if [[ -z "$KUBE_SYSTEM_ST" && ! "$KUBE_SYSTEM_ROUTE_CHECK $KUBE_SYSTEM_ALT_ROUTE_CHECK" =~ FAILED ]]; then
  echo -e "${YELLOW}⚠ ServersTransport missing in kube-system namespace but IngressRoutes exist there${NC}"
  echo -e "${YELLOW}This might cause routing errors for dashboard access through kube-system IngressRoutes${NC}"
fi

if [[ -z "$K8S_DASHBOARD_ST" && ! "$K8S_DASHBOARD_ROUTE_CHECK $K8S_DASHBOARD_ALT_ROUTE_CHECK" =~ FAILED ]]; then
  echo -e "${YELLOW}⚠ ServersTransport missing in kubernetes-dashboard namespace but IngressRoutes exist there${NC}"
  echo -e "${YELLOW}This might cause routing errors for dashboard access through kubernetes-dashboard IngressRoutes${NC}"
fi

# If both are missing, that's a critical issue
if [[ -z "$KUBE_SYSTEM_ST" && -z "$K8S_DASHBOARD_ST" ]]; then
  echo -e "${RED}✗ No ServersTransport found for dashboard in any namespace${NC}"
  ISSUES_FOUND+=("No ServersTransport configuration found for the dashboard")
fi

# Check the primary domain first with extra verbosity, with timeouts
echo -e "${YELLOW}Testing access to primary dashboard URL...${NC}"
CURL_OUTPUT=$(kubectl exec validation-test -- curl -v -k --connect-timeout 5 --max-time 10 https://dashboard.internal.${DOMAIN}/ 2>&1 || echo "Connection failed")

if echo "$CURL_OUTPUT" | grep -q "HTTP/[0-9.]\+ 200"; then
  echo -e "${GREEN}✓ Successfully connected to dashboard.internal.${DOMAIN}${NC}"

  # Extract a bit of content to show it's working
  CONTENT=$(echo "$CURL_OUTPUT" | grep -A5 "<title>" | head -n3 | sed 's/^/  /')
  if [[ -n "$CONTENT" ]]; then
    echo -e "${CYAN}Content snippet:${NC}"
    echo "$CONTENT"
  fi
else
  echo -e "${RED}✗ Failed to access dashboard.internal.${DOMAIN}${NC}"

  # Try to diagnose the issue
  if echo "$CURL_OUTPUT" | grep -q "Connection refused"; then
    echo -e "${YELLOW}Connection refused - Dashboard service may not be running or accessible${NC}"
    ISSUES_FOUND+=("Connection refused to dashboard.internal.${DOMAIN} - service may not be available")
  elif echo "$CURL_OUTPUT" | grep -q "Could not resolve host"; then
    echo -e "${YELLOW}DNS resolution failed - Check CoreDNS configuration${NC}"
    ISSUES_FOUND+=("DNS resolution failed for dashboard.internal.${DOMAIN}")
  elif echo "$CURL_OUTPUT" | grep -q "Connection timed out"; then
    echo -e "${YELLOW}Connection timed out - Network or firewall issue${NC}"
    ISSUES_FOUND+=("Connection timed out to dashboard.internal.${DOMAIN}")
  else
    echo -e "${YELLOW}Verbose connection details:${NC}"
    echo "$CURL_OUTPUT" | grep -E "Connected to|TLS|HTTP|Failed|error|* connection|timeout|certificate|refused|resolve" | sed 's/^/  /'
    ISSUES_FOUND+=("Cannot access dashboard.internal.${DOMAIN}")
  fi

  # Try to identify if an HTTP code is being returned that's not 200
  HTTP_CODE=$(echo "$CURL_OUTPUT" | grep -E "HTTP/[0-9.]+\s+[0-9]+" | tail -1 | awk '{print $2}')
  if [[ -n "$HTTP_CODE" && "$HTTP_CODE" != "200" ]]; then
    echo -e "${YELLOW}Server returned HTTP ${HTTP_CODE} - This may indicate:${NC}"
    if [[ "$HTTP_CODE" == "404" ]]; then
      echo -e "  - The route is not properly configured in Traefik"
      echo -e "  - The dashboard service is not running correctly"
      ISSUES_FOUND+=("Dashboard returned 404 - Route may be misconfigured")
    elif [[ "$HTTP_CODE" == "503" ]]; then
      echo -e "  - The backend service is unavailable"
      echo -e "  - The dashboard pods may not be ready"
      ISSUES_FOUND+=("Dashboard returned 503 - Service unavailable")
    else
      echo -e "  - HTTP code ${HTTP_CODE} received instead of 200"
      ISSUES_FOUND+=("Dashboard returned HTTP ${HTTP_CODE} instead of 200")
    fi
  fi

  # Try checking the service directly
  echo -e "${YELLOW}Testing direct service access...${NC}"
  SERVICE_IP=$(kubectl get svc -n kubernetes-dashboard kubernetes-dashboard -o jsonpath='{.spec.clusterIP}' 2>/dev/null)
  if [[ -n "$SERVICE_IP" ]]; then
    ALT_CURL_OUTPUT=$(kubectl exec validation-test -- curl -v -k --connect-timeout 5 --max-time 10 https://${SERVICE_IP}/ 2>&1 || echo "Connection failed")
  else
    ALT_CURL_OUTPUT="Service IP not found"
  fi

  if echo "$ALT_CURL_OUTPUT" | grep -q "HTTP/[0-9.]\+ 200"; then
    echo -e "${GREEN}✓ Successfully connected to dashboard service directly${NC}"
    echo -e "${YELLOW}Note: Direct service access works but ingress routing may have issues${NC}"

    # Extract a bit of content to show it's working
    ALT_CONTENT=$(echo "$ALT_CURL_OUTPUT" | grep -A5 "<title>" | head -n3 | sed 's/^/  /')
    if [[ -n "$ALT_CONTENT" ]]; then
      echo -e "${CYAN}Content snippet:${NC}"
      echo "$ALT_CONTENT"
    fi
  else
    echo -e "${RED}✗ Failed to access dashboard service directly as well${NC}"
    echo -e "${YELLOW}This indicates a deeper issue with the dashboard setup or network configuration${NC}"

    # Show error details
    if echo "$ALT_CURL_OUTPUT" | grep -q "Connection refused\|timed out\|Could not resolve host"; then
      echo -e "${YELLOW}Error details:${NC}"
      echo "$ALT_CURL_OUTPUT" | grep -E "Connected to|TLS|HTTP|Failed|error|* connection|timeout|certificate|refused|resolve" | head -5 | sed 's/^/  /'
    fi

    ISSUES_FOUND+=("Cannot access dashboard.${INTERNAL_DOMAIN} via any method")
  fi
fi

# Check for dashboard authentication
echo -e "${YELLOW}Checking dashboard authentication...${NC}"
if kubectl get serviceaccount -n kubernetes-dashboard dashboard-admin &>/dev/null; then
  echo -e "${GREEN}✓ Dashboard admin service account exists${NC}"

  # Check for token
  if kubectl get secret -n kubernetes-dashboard dashboard-admin-token &>/dev/null; then
    echo -e "${GREEN}✓ Dashboard admin token secret exists${NC}"

    # Verify token can be extracted
    TOKEN=$(kubectl -n kubernetes-dashboard get secret dashboard-admin-token -o jsonpath="{.data.token}" 2>/dev/null | base64 -d 2>/dev/null)
    if [[ -n "$TOKEN" ]]; then
      echo -e "${GREEN}✓ Dashboard token can be extracted successfully${NC}"
    else
      echo -e "${RED}✗ Failed to extract dashboard token${NC}"
      ISSUES_FOUND+=("Cannot extract dashboard authentication token")
    fi
  else
    echo -e "${RED}✗ Dashboard admin token secret not found${NC}"
    echo -e "${YELLOW}Recommendation: Run setup-dashboard.sh to create the token${NC}"
    ISSUES_FOUND+=("Dashboard admin token secret not found")
  fi
else
  echo -e "${RED}✗ Dashboard admin service account not found${NC}"
  echo -e "${YELLOW}Recommendation: Run setup-dashboard.sh to create the service account${NC}"
  ISSUES_FOUND+=("Dashboard admin service account not found")
fi

# If there are issues, provide more extensive diagnostics
if [[ ${#ISSUES_FOUND[@]} -gt 0 ]]; then
  echo
  echo -e "${YELLOW}=== Dashboard Diagnostics ===${NC}"

  # Check dashboard logs for errors
  echo -e "${YELLOW}Checking dashboard logs for errors...${NC}"
  DASHBOARD_POD=$(kubectl get pod -n kubernetes-dashboard -l k8s-app=kubernetes-dashboard -o name 2>/dev/null | head -1)
  if [[ -n "$DASHBOARD_POD" ]]; then
    echo -e "${CYAN}Errors and warnings from ${DASHBOARD_POD}:${NC}"
    DASHBOARD_LOGS=$(kubectl logs "$DASHBOARD_POD" -n kubernetes-dashboard --tail=50 2>/dev/null || echo "Could not get logs")
    echo "$DASHBOARD_LOGS" | grep -i "error\|failed\|warn\|exception" | sed 's/^/  /' || echo "  No errors or warnings found in logs"

    # Also show recent log entries to provide context
    echo -e "${CYAN}Most recent log entries:${NC}"
    echo "$DASHBOARD_LOGS" | tail -n 10 | sed 's/^/  /'
  else
    echo -e "${RED}No dashboard pod found${NC}"
  fi

  # Check traefik logs
  echo -e "${YELLOW}Checking Traefik logs for dashboard routing...${NC}"
  TRAEFIK_POD=$(kubectl get pod -n kube-system -l "app.kubernetes.io/name=traefik,app.kubernetes.io/instance=traefik-kube-system" -o name 2>/dev/null | head -1)
  if [[ -n "$TRAEFIK_POD" ]]; then
    echo -e "${CYAN}Dashboard-related entries from ${TRAEFIK_POD}:${NC}"
    TRAEFIK_LOGS=$(kubectl logs "$TRAEFIK_POD" -n kube-system --tail=100 2>/dev/null || echo "Could not get logs")

    # Look for dashboard-related entries and errors
    echo "$TRAEFIK_LOGS" | grep -i "dashboard\|kubernetes-dashboard" | sed 's/^/  /' || echo "  No dashboard-related entries found"

    echo -e "${CYAN}Recent errors from Traefik:${NC}"
    echo "$TRAEFIK_LOGS" | grep -i "error\|failed\|warn\|exception" | tail -n 10 | sed 's/^/  /' || echo "  No errors found in recent logs"
  else
    echo -e "${RED}No Traefik pod found${NC}"
  fi

  # Additional information for troubleshooting
  echo -e "${YELLOW}Checking for TLS certificate for dashboard domain...${NC}"
  kubectl get certificate -n kubernetes-dashboard 2>/dev/null || echo "No certificates found in kubernetes-dashboard namespace"

  echo -e "${YELLOW}Checking secrets for TLS certificates...${NC}"
  kubectl get secrets -n kubernetes-dashboard -l certmanager.k8s.io/certificate-name 2>/dev/null || \
  kubectl get secrets -n kubernetes-dashboard | grep -i "tls\|cert" || echo "No TLS certificate secrets found"
fi

echo

# Note: Keeping test pod for further troubleshooting
echo -e "${YELLOW}Test pod 'validation-test' is still running for further troubleshooting.${NC}"
echo -e "${YELLOW}It will terminate after 1 hour or you can manually delete it with:${NC}"
echo -e "${YELLOW}kubectl delete pod validation-test${NC}"

echo -e "${BLUE}============================================================${NC}"

# Function to check if an issue matches a pattern
issue_matches() {
  local pattern=$1
  for issue in "${ISSUES_FOUND[@]}"; do
    if [[ "$issue" == *"$pattern"* ]]; then
      return 0
    fi
  done
  return 1
}

# Display summary and troubleshooting steps if issues were found
if [[ ${#ISSUES_FOUND[@]} -gt 0 ]]; then
  echo -e "${YELLOW}Validation found ${#ISSUES_FOUND[@]} issues:${NC}"
  for ((i=0; i<${#ISSUES_FOUND[@]}; i++)); do
    echo -e "${RED}$(($i+1)). ${ISSUES_FOUND[$i]}${NC}"
  done

  echo
  echo -e "${BOLD}Troubleshooting Recommendations:${NC}"

  # Core recommendation
  echo -e "${BOLD}Primary Fix:${NC}"
  echo -e "${CYAN}Run the complete setup script to fix all issues at once:${NC}"
  echo -e "${YELLOW}cd ${WC_HOME} && ./setup/cluster/install-all.sh${NC}"

  echo
  echo -e "${BOLD}Component-Specific Fixes:${NC}"

  # MetalLB specific recommendations
  if issue_matches "MetalLB" || issue_matches "LoadBalancer" || issue_matches "IP allocation" || issue_matches "address"; then
    echo -e "${CYAN}For MetalLB and IP allocation issues:${NC}"
    echo -e "   1. Run the MetalLB setup script: ${YELLOW}cd ${WC_HOME} && ./setup/cluster/metallb/install.sh${NC}"
    echo -e "   2. Check for conflicting services: ${YELLOW}kubectl get svc -A --field-selector type=LoadBalancer${NC}"
    echo -e "   3. If you have conflicting IP allocations, edit the service that shouldn't have the IP:"
    echo -e "      ${YELLOW}kubectl edit svc <service-name> -n <namespace>${NC}"
    echo -e "      Remove the metallb.universe.tf/loadBalancerIPs annotation"
    echo -e "   4. Check MetalLB logs for errors: ${YELLOW}kubectl logs -n metallb-system -l app.kubernetes.io/name=metallb${NC}"
  fi

  # Dashboard specific recommendations
  if issue_matches "Dashboard" || issue_matches "dashboard"; then
    echo -e "${CYAN}For dashboard issues:${NC}"
    echo -e "   ${YELLOW}cd ${WC_HOME} && ./setup/cluster/kubernetes-dashboard/install.sh${NC}"
    echo -e "   Alternatively, use port-forwarding to access the dashboard: ${YELLOW}kubectl port-forward -n kubernetes-dashboard svc/kubernetes-dashboard 8443:443${NC}"
    echo -e "   Get authentication token with: ${YELLOW}kubectl -n kubernetes-dashboard create token dashboard-admin${NC}"
  fi

  # CoreDNS specific recommendations
  if issue_matches "DNS"; then
    echo -e "${CYAN}For DNS resolution issues:${NC}"
    echo -e "   ${YELLOW}cd ${WC_HOME} && ./setup/cluster/coredns/install.sh${NC}"
    echo -e "   Verify DNS resolution: ${YELLOW}kubectl exec -it $(kubectl get pod -l k8s-app=kube-dns -n kube-system -o name | head -1) -n kube-system -- nslookup dashboard.${INTERNAL_DOMAIN}${NC}"
  fi

  # Traefik/IngressRoute issues
  if issue_matches "IngressRoute" || issue_matches "ServersTransport" || issue_matches "Middleware"; then
    echo -e "${CYAN}For Traefik routing issues:${NC}"
    echo -e "   1. Check Traefik installation: ${YELLOW}cd ${WC_HOME} && ./setup/cluster/traefik/install.sh${NC}"
    echo -e "   2. Re-run dashboard setup: ${YELLOW}cd ${WC_HOME} && ./setup/cluster/kubernetes-dashboard/install.sh${NC}"
    echo -e "   3. Check Traefik status: ${YELLOW}kubectl get pods -n traefik -l app.kubernetes.io/name=traefik${NC}"
  fi

  # Certificate issues
  if issue_matches "certificate" || issue_matches "TLS"; then
    echo -e "${CYAN}For certificate issues:${NC}"
    echo -e "   1. Check certificate status: ${YELLOW}kubectl get certificate,certificaterequest -A${NC}"
    echo -e "   2. Re-run cert-manager setup: ${YELLOW}cd ${WC_HOME} && ./setup/cluster/cert-manager/install.sh${NC}"
  fi

  echo
  echo -e "${BOLD}Debugging Steps:${NC}"
  echo -e "1. ${CYAN}View component logs:${NC}"
  echo -e "   ${YELLOW}kubectl logs -n NAMESPACE PODNAME${NC}"
  echo -e "2. ${CYAN}Check pod status:${NC}"
  echo -e "   ${YELLOW}kubectl get pods --all-namespaces${NC}"
  echo -e "3. ${CYAN}Check all IngressRoutes:${NC}"
  echo -e "   ${YELLOW}kubectl get ingressroute --all-namespaces${NC}"
  echo -e "4. ${CYAN}Re-run validation after fixes:${NC}"
  echo -e "   ${YELLOW}cd ${WC_HOME} && ./setup/cluster/validate-setup.sh${NC}"
else
  echo -e "${GREEN}All validation checks passed! Your infrastructure is set up correctly.${NC}"
  echo -e "${CYAN}✓ Dashboard is accessible at: https://dashboard.${INTERNAL_DOMAIN}${NC}"
  echo -e "${CYAN}✓ Get authentication token with: kubectl -n kubernetes-dashboard create token dashboard-admin${NC}"
  echo
  echo -e "${YELLOW}Next Steps:${NC}"
  echo -e "1. Access the dashboard and verify cluster health"
  echo -e "2. Deploy your applications and services"
  echo -e "3. Set up monitoring and logging"
fi

echo -e "${BLUE}============================================================${NC}"