#!/bin/bash
#
# wild-setup-cluster — Set up Kubernetes cluster infrastructure (Phases 1-3).
#
# Prompts for cluster/network configuration, generates Talos cluster config,
# walks through control-plane node setup (including a guided six-step cluster
# bootstrap on the first node), and optionally sets up worker nodes.
#
# Required env: WC_ROOT (Wild Cloud install root; common.sh is sourced from it).
# Prerequisite: 'wild-setup-scaffold' must have been run first.

set -e
set -o pipefail

# -----------------------------------------------------------------------------
# Argument parsing
# -----------------------------------------------------------------------------
SKIP_HARDWARE=false
# NOTE(review): --skip-installer is documented in --help; no consumer is
# visible in this script — presumably read by a downstream wild-* tool. Confirm.
SKIP_INSTALLER=false

while [[ $# -gt 0 ]]; do
    case $1 in
        --skip-installer)
            # BUGFIX: this option appeared in the help text but was previously
            # rejected by the '-*' arm as an unknown option.
            SKIP_INSTALLER=true
            shift
            ;;
        --skip-hardware)
            SKIP_HARDWARE=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [phase-options]"
            echo ""
            echo "Set up Kubernetes cluster infrastructure (Phases 1-3)."
            echo ""
            echo "Control Options:"
            echo "  --skip-installer    Skip Installer image generation"
            echo "  --skip-hardware     Skip Node hardware detection"
            echo "  -h, --help          Show this help message"
            echo ""
            echo "Prerequisites:"
            echo "  - Run 'wild-setup-scaffold' first to initialize the cloud"
            echo ""
            echo "After completion:"
            echo "  - Run 'wild-setup-services' to install cluster services"
            exit 0
            ;;
        -*)
            echo "Unknown option $1"
            echo "Usage: $0 [phase-options]"
            echo "Use --help for full usage information"
            exit 1
            ;;
        *)
            echo "Unexpected argument: $1"
            echo "Usage: $0 [phase-options]"
            echo "Use --help for full usage information"
            exit 1
            ;;
    esac
done

# -----------------------------------------------------------------------------
# Initialize Wild Cloud environment
# -----------------------------------------------------------------------------
if [ -z "${WC_ROOT}" ]; then
    echo "ERROR: WC_ROOT is not set."
    exit 1
else
    # Provides print_* helpers, prompt_if_unset_config, init_wild_env, WC_HOME.
    source "${WC_ROOT}/scripts/common.sh"
    init_wild_env
fi

print_header "Wild Cloud Cluster Setup"

# =============================================================================
# Configuration
# =============================================================================

print_header "Configuration"

prompt_if_unset_config "operator.email" "Operator email address"
prompt_if_unset_config "cluster.name" "Cluster name" "wild-cluster"
CLUSTER_NAME=$(wild-config "cluster.name")

# Configure hostname prefix for unique node names on LAN
prompt_if_unset_config "cluster.hostnamePrefix" "Hostname prefix (optional, e.g. 'test-' for unique names on LAN)" ""
HOSTNAME_PREFIX=$(wild-config "cluster.hostnamePrefix")

# Derive network defaults from the current host's route table; fall back to
# common home-network values when detection fails (e.g. no default route).
CURRENT_IP=$(ip route get 8.8.8.8 | awk '{print $7; exit}' 2>/dev/null || echo "192.168.1.100")
GATEWAY_IP=$(ip route | grep default | awk '{print $3; exit}' 2>/dev/null || echo "192.168.1.1")
SUBNET_PREFIX=$(echo "${CURRENT_IP}" | cut -d. -f1-3)

prompt_if_unset_config "cloud.router.ip" "Router/Gateway IP" "${GATEWAY_IP}"
prompt_if_unset_config "cloud.dns.ip" "DNS server IP (dnsmasq machine)" "${SUBNET_PREFIX}.50"
prompt_if_unset_config "cloud.dhcpRange" "DHCP range for dnsmasq" "${SUBNET_PREFIX}.100,${SUBNET_PREFIX}.200"
prompt_if_unset_config "cloud.dnsmasq.interface" "Network interface for dnsmasq" "eth0"
prompt_if_unset_config "cloud.dns.externalResolver" "External DNS resolver" "1.1.1.1"

# MetalLB IP address pool
prompt_if_unset_config "cluster.ipAddressPool" "MetalLB IP address pool" "${SUBNET_PREFIX}.80-${SUBNET_PREFIX}.89"
ip_pool=$(wild-config "cluster.ipAddressPool")

# Load balancer IP (automatically set to first address in the pool if not set)
default_lb_ip=$(echo "${ip_pool}" | cut -d'-' -f1)
prompt_if_unset_config "cluster.loadBalancerIp" "Load balancer IP" "${default_lb_ip}"

# Talos version
prompt_if_unset_config "cluster.nodes.talos.version" "Talos version" "v1.11.0"
talos_version=$(wild-config "cluster.nodes.talos.version")

# Talos schematic ID (identifies the Image Factory build, incl. extensions)
prompt_if_unset_config "cluster.nodes.talos.schematicId" "Talos schematic ID" "56774e0894c8a3a3a9834a2aea65f24163cacf9506abbcbdc3ba135eaca4953f"
schematic_id=$(wild-config "cluster.nodes.talos.schematicId")

# External DNS
prompt_if_unset_config "cluster.externalDns.ownerId" "External DNS owner ID" "external-dns-${CLUSTER_NAME}"

# =============================================================================
# TALOS CLUSTER CONFIGURATION
# =============================================================================

prompt_if_unset_config "cluster.nodes.control.vip" "Control plane virtual IP" "${SUBNET_PREFIX}.90"
vip=$(wild-config "cluster.nodes.control.vip")

# Generate initial cluster configuration
if ! wild-cluster-config-generate; then
    print_error "Failed to generate cluster configuration"
    exit 1
fi

# Configure Talos cli with our new cluster context
HAS_CONTEXT=$(talosctl config contexts | grep -c "$CLUSTER_NAME" || true)
if [ "$HAS_CONTEXT" -eq 0 ]; then
    print_info "No Talos context found for cluster $CLUSTER_NAME, creating..."
    # BUGFIX: path was previously unquoted; breaks on WC_HOME with spaces.
    talosctl config merge "${WC_HOME}/setup/cluster-nodes/generated/talosconfig"
    talosctl config context "$CLUSTER_NAME"
    print_success "Talos context for $CLUSTER_NAME created and set as current"
fi

# =============================================================================
# Node setup
# =============================================================================

if [ "${SKIP_HARDWARE}" = false ]; then
    print_header "Control Plane Node Setup"

    # Automatically configure the first three IPs after VIP for control plane nodes
    vip_last_octet=$(echo "$vip" | cut -d. -f4)
    vip_prefix=$(echo "$vip" | cut -d. -f1-3)

    # Set up control plane nodes
    for i in 1 2 3; do
        NODE_NAME="${HOSTNAME_PREFIX}control-${i}"
        TARGET_IP="${vip_prefix}.$(( vip_last_octet + i ))"

        print_info "Setting up control plane node: $NODE_NAME (IP: $TARGET_IP)"

        # Pre-configure node role and target IP
        wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "controlplane"
        wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$TARGET_IP"
        wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".version" "$talos_version"
        wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".schematicId" "$schematic_id"

        # Check if node is already configured (interface key is only written
        # after hardware detection has run for this node)
        if wild-config --check "cluster.nodes.active.${NODE_NAME}.interface"; then
            print_success "Node $NODE_NAME already configured"
            echo ""
            read -p "Re-deploy node $NODE_NAME? (y/N): " -r redeploy_node
            if [[ $redeploy_node =~ ^[Yy]$ ]]; then
                if ! wild-node-setup "$NODE_NAME"; then
                    print_error "Failed to set up node $NODE_NAME"
                    continue
                fi
            else
                continue
            fi
        else
            # Node needs initial setup
            print_info "Node $NODE_NAME requires hardware detection and setup"
            echo ""
            read -p "Set up node $NODE_NAME now? (Y/n): " -r setup_node
            if [[ $setup_node =~ ^[Nn]$ ]]; then
                print_info "Skipping node $NODE_NAME setup"
                continue
            fi

            # Run complete node setup
            if ! wild-node-setup "$NODE_NAME"; then
                print_error "Failed to set up node $NODE_NAME"
                print_info "You can retry later with: wild-node-setup $NODE_NAME"
                continue
            fi
        fi

        # Bootstrap the cluster after the first node is up
        if [ "$i" -eq 1 ]; then
            echo ""
            read -p "Bootstrap the cluster on $NODE_NAME? (Y/n): " -r bootstrap_cluster
            if [[ ! $bootstrap_cluster =~ ^[Nn]$ ]]; then
                print_header "Bootstrapping Cluster: $NODE_NAME"

                talosctl config endpoint "$TARGET_IP"

                # BUGFIX: use mktemp instead of a fixed, predictable /tmp name.
                bootstrap_log=$(mktemp)
                if talosctl bootstrap --nodes "$TARGET_IP" 2>&1 | tee "$bootstrap_log"; then
                    print_success "Cluster bootstrap initiated successfully."
                else
                    # Re-running bootstrap against an already-bootstrapped node
                    # is expected on retries; treat it as success.
                    if grep -q "etcd data directory is not empty\|AlreadyExists" "$bootstrap_log"; then
                        print_info "Cluster is already bootstrapped."
                    else
                        print_error "Failed to bootstrap cluster."
                        print_info "Bootstrap output:"
                        cat "$bootstrap_log"
                        rm -f "$bootstrap_log"
                        continue
                    fi
                fi
                mv -f "$bootstrap_log" /tmp/bootstrap_output_success.log

                # Step 1: Verify etcd cluster health
                print_info -n "Step 1/6: Verifying etcd cluster health."
                max_attempts=30
                for attempt in $(seq 1 $max_attempts); do
                    if talosctl -n "$TARGET_IP" etcd status >/dev/null 2>&1; then
                        echo ""
                        print_success "etcd cluster is healthy."
                        break
                    fi
                    if [ $attempt -eq $max_attempts ]; then
                        echo ""
                        print_error "etcd cluster not healthy after $max_attempts attempts."
                        print_info "Troubleshooting steps:"
                        print_info "  1. Check etcd service: talosctl -n $TARGET_IP service etcd"
                        print_info "  2. Check etcd logs: talosctl -n $TARGET_IP logs etcd"
                        print_info "  3. Check etcd status details: talosctl -n $TARGET_IP etcd status"
                        print_info "  4. Verify bootstrap completed: talosctl -n $TARGET_IP get members"
                        exit 1
                    fi
                    printf "."
                    sleep 10
                done

                # Step 2: Wait for VIP to be assigned to interface
                print_info -n "Step 2/6: Waiting for VIP $vip to be assigned to interface."
                max_attempts=90
                for attempt in $(seq 1 $max_attempts); do
                    if talosctl -n "$TARGET_IP" get addresses | grep -q "$vip/32"; then
                        echo ""
                        print_success "VIP $vip assigned to interface."
                        break
                    fi
                    if [ $attempt -eq $max_attempts ]; then
                        echo ""
                        print_error "VIP $vip was not assigned to interface after $max_attempts attempts"
                        print_info "Troubleshooting steps:"
                        print_info "  1. Check VIP controller logs: talosctl -n $TARGET_IP logs controller-runtime | grep vip"
                        print_info "  2. Check network configuration: talosctl -n $TARGET_IP get addresses"
                        print_info "  3. Verify VIP is within node's network range"
                        exit 1
                    fi
                    printf "."
                    sleep 10
                done

                # Step 3: Wait for control plane components to start
                print_info -n "Step 3/6: Waiting for control plane components to start."
                max_attempts=60
                for attempt in $(seq 1 $max_attempts); do
                    # Fetch the container list once per attempt (previously this
                    # made three separate talosctl calls per attempt).
                    containers_output=$(talosctl -n "$TARGET_IP" containers -k 2>/dev/null || true)
                    apiserver_running=$(grep -c "kube-apiserver.*CONTAINER_RUNNING" <<<"$containers_output" || true)
                    controller_running=$(grep -c "kube-controller-manager.*CONTAINER_RUNNING" <<<"$containers_output" || true)
                    scheduler_running=$(grep -c "kube-scheduler.*CONTAINER_RUNNING" <<<"$containers_output" || true)

                    if [ "$apiserver_running" -gt 0 ] && [ "$controller_running" -gt 0 ] && [ "$scheduler_running" -gt 0 ]; then
                        echo ""
                        print_success "All control plane components are running (attempt $attempt)."
                        break
                    fi
                    if [ $attempt -eq $max_attempts ]; then
                        echo ""
                        print_error "Control plane components not all running after $max_attempts attempts."
                        print_info "Troubleshooting steps:"
                        print_info "  1. Check kubelet logs: talosctl -n $TARGET_IP logs kubelet"
                        print_info "  2. Check static pod status: talosctl -n $TARGET_IP containers -k | grep kube-"
                        print_info "  3. Restart kubelet if needed: talosctl -n $TARGET_IP service kubelet restart"
                        print_info "Current status:"
                        print_info "  API Server running: $apiserver_running"
                        print_info "  Controller Manager running: $controller_running"
                        print_info "  Scheduler running: $scheduler_running"
                        exit 1
                    fi
                    # Restart kubelet every 40 attempts to refresh static pod creation
                    if [ $((attempt % 40)) -eq 0 ]; then
                        echo ""
                        print_info "Restarting kubelet to refresh static pod creation (attempt $attempt)..."
                        talosctl -n "$TARGET_IP" service kubelet restart > /dev/null 2>&1
                        print_info -n "Waiting for control plane components after kubelet restart."
                        sleep 30  # Give kubelet time to restart and create pods
                    fi
                    printf "."
                    sleep 10
                done

                # Step 4: Wait for API server to respond on VIP
                print_info -n "Step 4/6: Waiting for API server to respond on VIP $vip."
                max_attempts=60
                for attempt in $(seq 1 $max_attempts); do
                    if curl -k -s --max-time 5 "https://$vip:6443/healthz" >/dev/null 2>&1; then
                        echo ""
                        print_success "API server responding on VIP."
                        break
                    fi
                    if [ $attempt -eq $max_attempts ]; then
                        echo ""
                        print_error "API server not responding on VIP $vip after $max_attempts attempts."
                        print_info "Troubleshooting steps:"
                        print_info "  1. Check API server logs: talosctl -n $TARGET_IP logs kubelet | grep apiserver"
                        print_info "  2. Check if API server is running: talosctl -n $TARGET_IP containers -k | grep apiserver"
                        print_info "  3. Test API server on node IP: curl -k https://$TARGET_IP:6443/healthz"
                        exit 1
                    fi
                    # Attempt kubelet restart every 15 attempts to refresh certificates
                    if [ $((attempt % 15)) -eq 0 ]; then
                        echo ""
                        print_info "Restarting kubelet to refresh API container setup (attempt $attempt)..."
                        talosctl -n "$TARGET_IP" service kubelet restart > /dev/null 2>&1
                        print_info -n "Waiting for API server to respond after kubelet restart."
                        sleep 30  # Give kubelet time to restart
                    fi
                    printf "."
                    sleep 10
                done

                # Step 5: Configure talosctl endpoint and get kubeconfig
                print_info "Step 5/6: Configuring cluster access..."
                talosctl config endpoint "$vip"
                if ! talosctl kubeconfig --nodes "$vip"; then
                    print_error "Failed to get kubeconfig via VIP."
                    print_info "Troubleshooting steps:"
                    print_info "  1. Check API server logs: talosctl -n $TARGET_IP logs kube-apiserver"
                    print_info "  2. Test API server on node IP: curl -k https://$TARGET_IP:6443/healthz"
                    print_info "  3. Verify network connectivity to VIP"
                    exit 1
                else
                    print_success "Kubeconfig retrieved via VIP."
                fi

                # Step 6: Verify node registration
                print_info -n "Step 6/6: Verifying node registration."
                for reg_attempt in $(seq 1 10); do
                    if kubectl get nodes 2>/dev/null | grep -q "Ready\|NotReady"; then
                        echo ""
                        print_success "Node registered with API server."
                        break
                    fi
                    printf "."
                    sleep 10
                done
                if ! kubectl get nodes 2>/dev/null | grep -q "Ready\|NotReady"; then
                    echo ""
                    print_error "Node did not register with API server after multiple attempts."
                    print_info "Troubleshooting steps:"
                    print_info "  1. Check kubelet logs: talosctl -n $TARGET_IP logs kubelet"
                    print_info "  2. Check API server logs: talosctl -n $TARGET_IP logs kube-apiserver"
                    print_info "  3. Verify network connectivity between node and VIP"
                    exit 1
                fi

                print_success "Cluster bootstrap completed!"
            fi
        fi
    done

    # Worker node setup
    echo ""
    print_header "Worker Node Setup (Optional)"

    WORKER_COUNT=1
    while true; do
        echo ""
        read -p "Set up a worker node? (y/N): " -r setup_worker
        if [[ $setup_worker =~ ^[Yy]$ ]]; then
            # Find next available worker number
            while wild-config --check "cluster.nodes.active.${HOSTNAME_PREFIX}worker-${WORKER_COUNT}.role" 2>/dev/null; do
                WORKER_COUNT=$((WORKER_COUNT + 1))
            done
            NODE_NAME="${HOSTNAME_PREFIX}worker-${WORKER_COUNT}"

            read -p "Enter IP address for worker node $NODE_NAME: " -r WORKER_IP
            if [ -z "$WORKER_IP" ]; then
                print_warning "No IP provided, skipping worker node"
                continue
            fi

            # Pre-configure worker node
            wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "worker"
            wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$WORKER_IP"
            wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".version" "$talos_version"
            wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".schematicId" "$schematic_id"

            # Run complete node setup
            if wild-node-setup "$NODE_NAME"; then
                print_success "Worker node $NODE_NAME setup completed"
                WORKER_COUNT=$((WORKER_COUNT + 1))
            else
                print_error "Failed to set up worker node $NODE_NAME"
                print_info "You can retry later with: wild-node-setup $NODE_NAME"
            fi
        else
            break
        fi
    done

    print_success "Node setup phase completed"
else
    print_info "Skipping node setup (--skip-hardware specified)"
fi

# =============================================================================
# COMPLETION
# =============================================================================

print_header "Wild Cloud Cluster Setup Complete!"

print_success "Cluster infrastructure setup completed!"
echo ""
print_info "Next steps:"
echo "  1. Run 'wild-setup-services' to install cluster services"
echo "  2. Verify nodes are ready: kubectl get nodes"
echo "  3. Check cluster health: wild-health"
echo ""
print_info "Individual node management:"
echo "  - Setup additional nodes: wild-node-setup "
echo "  - Re-detect hardware: wild-node-setup  --detect"
echo "  - Configuration only: wild-node-setup  --no-deploy"
echo ""
print_success "Wild Cloud cluster setup completed!"