#!/bin/bash set -e set -o pipefail # Parse arguments SKIP_INSTALLER=false SKIP_HARDWARE=false while [[ $# -gt 0 ]]; do case $1 in --skip-installer) SKIP_INSTALLER=true shift ;; --skip-hardware) SKIP_HARDWARE=true shift ;; -h|--help) echo "Usage: $0 [phase-options]" echo "" echo "Set up Kubernetes cluster infrastructure (Phases 1-3)." echo "" echo "Control Options:" echo " --skip-installer Skip Installer image generation" echo " --skip-hardware Skip Node hardware detection" echo " --skip-configs Skip Machine config generation" echo " -h, --help Show this help message" echo "" echo "Prerequisites:" echo " - Run 'wild-setup-scaffold' first to initialize the cloud" echo "" echo "After completion:" echo " - Run 'wild-setup-services' to install cluster services" exit 0 ;; -*) echo "Unknown option $1" echo "Usage: $0 [phase-options]" echo "Use --help for full usage information" exit 1 ;; *) echo "Unexpected argument: $1" echo "Usage: $0 [phase-options]" echo "Use --help for full usage information" exit 1 ;; esac done # Initialize Wild Cloud environment if [ -z "${WC_ROOT}" ]; then print "WC_ROOT is not set." exit 1 else source "${WC_ROOT}/scripts/common.sh" init_wild_env fi print_header "Wild Cloud Cluster Setup" print_info "Setting up cluster infrastructure" echo "" # Generate initial cluster configuration if ! wild-cluster-config-generate; then print_error "Failed to generate cluster configuration" exit 1 fi # Configure Talos cli with our new cluster context CLUSTER_NAME=$(wild-config "cluster.name") HAS_CONTEXT=$(talosctl config contexts | grep -c "$CLUSTER_NAME" || true) if [ "$HAS_CONTEXT" -eq 0 ]; then print_info "No Talos context found for cluster $CLUSTER_NAME, creating..." talosctl config merge ${WC_HOME}/setup/cluster-nodes/generated/talosconfig talosctl config use "$CLUSTER_NAME" print_success "Talos context for $CLUSTER_NAME created and set as current" fi # Talos asset download if [ "${SKIP_INSTALLER}" = false ]; then print_header "Installer Image Generation" print_info "Running wild-cluster-node-boot-assets-download..." wild-cluster-node-boot-assets-download print_success "Installer image generated" echo "" else print_info "Skipping: Installer Image Generation" fi # ============================================================================= # Configuration # ============================================================================= prompt_if_unset_config "operator.email" "Operator email address" # Configure hostname prefix for unique node names on LAN prompt_if_unset_config "cluster.hostnamePrefix" "Hostname prefix (optional, e.g. 'test-' for unique names on LAN)" "" HOSTNAME_PREFIX=$(wild-config "cluster.hostnamePrefix") # Configure network settings CURRENT_IP=$(ip route get 8.8.8.8 | awk '{print $7; exit}' 2>/dev/null || echo "192.168.1.100") GATEWAY_IP=$(ip route | grep default | awk '{print $3; exit}' 2>/dev/null || echo "192.168.1.1") SUBNET_PREFIX=$(echo "${CURRENT_IP}" | cut -d. -f1-3) prompt_if_unset_config "cloud.router.ip" "Router/Gateway IP" "${GATEWAY_IP}" prompt_if_unset_config "cloud.dns.ip" "DNS server IP (dnsmasq machine)" "${SUBNET_PREFIX}.50" prompt_if_unset_config "cloud.dhcpRange" "DHCP range for dnsmasq" "${SUBNET_PREFIX}.100,${SUBNET_PREFIX}.200" prompt_if_unset_config "cloud.dnsmasq.interface" "Network interface for dnsmasq" "eth0" prompt_if_unset_config "cloud.dns.externalResolver" "External DNS resolver" "1.1.1.1" # MetalLB IP address pool prompt_if_unset_config "cluster.ipAddressPool" "MetalLB IP address pool" "${SUBNET_PREFIX}.80-${SUBNET_PREFIX}.89" ip_pool=$(wild-config "cluster.ipAddressPool") # Load balancer IP (automatically set to first address in the pool if not set) current_lb_ip=$(wild-config "cluster.loadBalancerIp") if [ -z "$current_lb_ip" ] || [ "$current_lb_ip" = "null" ]; then lb_ip=$(echo "${ip_pool}" | cut -d'-' -f1) wild-config-set "cluster.loadBalancerIp" "${lb_ip}" print_info "Set load balancer IP to: ${lb_ip} (first IP in MetalLB pool)" fi # Talos version prompt_if_unset_config "cluster.nodes.talos.version" "Talos version" "v1.10.4" talos_version=$(wild-config "cluster.nodes.talos.version") # Talos schematic ID current_schematic_id=$(wild-config "cluster.nodes.talos.schematicId") if [ -z "$current_schematic_id" ] || [ "$current_schematic_id" = "null" ]; then echo "" print_info "Get your Talos schematic ID from: https://factory.talos.dev/" print_info "This customizes Talos with the drivers needed for your hardware." # Use current schematic ID from config as default default_schematic_id=$(wild-config "cluster.nodes.talos.schematicId") if [ -n "$default_schematic_id" ] && [ "$default_schematic_id" != "null" ]; then print_info "Using schematic ID from config for Talos $talos_version" else default_schematic_id="" fi schematic_id=$(prompt_with_default "Talos schematic ID" "${default_schematic_id}" "${current_schematic_id}") wild-config-set "cluster.nodes.talos.schematicId" "${schematic_id}" fi # External DNS cluster_name=$(wild-config "cluster.name") prompt_if_unset_config "cluster.externalDns.ownerId" "External DNS owner ID" "external-dns-${cluster_name}" # ============================================================================= # Node setup # ============================================================================= if [ "${SKIP_HARDWARE}" = false ]; then print_header "Control Plane Configuration" print_info "Configure control plane nodes (you need at least 3 for HA):" echo "" prompt_if_unset_config "cluster.nodes.control.vip" "Control plane virtual IP" "${SUBNET_PREFIX}.90" vip=$(wild-config "cluster.nodes.control.vip") # Automatically configure the first three IPs after VIP for control plane nodes vip_last_octet=$(echo "$vip" | cut -d. -f4) vip_prefix=$(echo "$vip" | cut -d. -f1-3) # Detect and register control plane nodes print_header "Control Plane Node Registration" # Process each control plane node for i in 1 2 3; do NODE_NAME="${HOSTNAME_PREFIX}control-${i}" TARGET_IP="${vip_prefix}.$(( vip_last_octet + i ))" echo "" print_info "Registering control plane node: $NODE_NAME (IP: $TARGET_IP)" # Initialize the node in cluster.nodes.active if not already present if [ -z "$(wild-config "cluster.nodes.active.\"${NODE_NAME}\".role")" ]; then wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "controlplane" wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$TARGET_IP" wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".currentIp" "$TARGET_IP" fi # Check if node is already configured existing_interface=$(wild-config "cluster.nodes.active.\"${NODE_NAME}\".interface") if [ -n "$existing_interface" ] && [ "$existing_interface" != "null" ]; then print_success "Node $NODE_NAME already configured" print_info " - Interface: $existing_interface" print_info " - Disk: $(wild-config "cluster.nodes.active.\"${NODE_NAME}\".disk")" # Generate machine config patch for this node if necessary. NODE_SETUP_DIR="${WC_HOME}/setup/cluster-nodes" CONFIG_FILE="${NODE_SETUP_DIR}/patch/${NODE_NAME}.yaml" if [ ! -f "$CONFIG_FILE" ]; then print_info "Generating missing machine configuration patch for $NODE_NAME..." if wild-cluster-node-patch-generate "$NODE_NAME"; then print_success "Machine configuration patch generated for $NODE_NAME" else print_warning "Failed to generate machine configuration patch for $NODE_NAME" fi else print_info " ✓ Machine configuration patch exists: $CONFIG_FILE" fi continue fi read -p "Do you want to bring up control plane node $NODE_NAME ($TARGET_IP) now? (y/N): " -r register_node if [[ ! $register_node =~ ^[Yy]$ ]]; then print_info "Skipping bringing up node $NODE_NAME registration" continue fi # Register node in config.yaml. # First try to detect at target IP. print_info "Attempting detection at target IP $TARGET_IP..." DETECTION_IP="$TARGET_IP" NODE_INFO="" if wild-node-detect "$TARGET_IP" >/dev/null 2>&1; then NODE_INFO=$(wild-node-detect "$TARGET_IP") print_success "Node detected at target IP $TARGET_IP" else # Fall back to maintenance IP print_warning "Node not accessible at target IP $TARGET_IP" read -p "Enter maintenance IP for this node: " -r MAINTENANCE_IP if [ -z "$MAINTENANCE_IP" ]; then print_warning "Skipping node $NODE_NAME registration" continue fi print_info "Attempting detection at maintenance IP $MAINTENANCE_IP..." if wild-node-detect "$MAINTENANCE_IP" >/dev/null 2>&1; then NODE_INFO=$(wild-node-detect "$MAINTENANCE_IP") DETECTION_IP="$MAINTENANCE_IP" # Store maintenance IP for reference wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".maintenanceIp" "$MAINTENANCE_IP" print_success "Node detected at maintenance IP $MAINTENANCE_IP" else print_error "Failed to detect node at $MAINTENANCE_IP" continue fi fi if [ -n "$NODE_INFO" ]; then # Parse JSON response INTERFACE=$(echo "$NODE_INFO" | jq -r '.interface') SELECTED_DISK=$(echo "$NODE_INFO" | jq -r '.selected_disk') AVAILABLE_DISKS=$(echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | paste -sd, -) print_success "Hardware detected:" print_info " - Interface: $INTERFACE" print_info " - Available disks: $AVAILABLE_DISKS" print_info " - Selected disk: $SELECTED_DISK" # Allow user to override disk selection echo "" read -p "Use selected disk '$SELECTED_DISK'? (Y/n): " -r use_disk if [[ $use_disk =~ ^[Nn]$ ]]; then echo "Available disks:" echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | nl -w2 -s') ' read -p "Enter disk number: " -r disk_num SELECTED_DISK=$(echo "$NODE_INFO" | jq -r ".disks[$((disk_num-1))].path") if [ "$SELECTED_DISK" = "null" ] || [ -z "$SELECTED_DISK" ]; then print_error "Invalid disk selection" continue fi print_info "Selected disk: $SELECTED_DISK" fi # Update config.yaml with hardware info. print_info "Updating configuration for $NODE_NAME..." wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".interface" "$INTERFACE" wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".disk" "$SELECTED_DISK" # Copy current Talos version and schematic ID to this node current_talos_version=$(wild-config "cluster.nodes.talos.version") current_schematic_id=$(wild-config "cluster.nodes.talos.schematicId") if [ -n "$current_talos_version" ] && [ "$current_talos_version" != "null" ]; then wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".version" "$current_talos_version" fi if [ -n "$current_schematic_id" ] && [ "$current_schematic_id" != "null" ]; then wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".schematicId" "$current_schematic_id" fi echo "" read -p "Bring node $NODE_NAME ($TARGET_IP) up now? (y/N): " -r apply_config if [[ $apply_config =~ ^[Yy]$ ]]; then if [ "$DETECTION_IP" != "$TARGET_IP" ]; then # Node is in maintenance mode, use insecure flag print_info "Applying configuration in insecure mode (maintenance mode)..." wild-cluster-node-up "$NODE_NAME" --insecure else # Node is already configured, use secure mode print_info "Applying configuration..." wild-cluster-node-up "$NODE_NAME" fi # Bootstrap the cluster after the first node is up. if [ "$i" -eq 1 ]; then read -p "The cluster should be bootstrapped after the first control node is ready. Is it ready?: " -r is_ready if [[ $is_ready =~ ^[Yy]$ ]]; then print_info "Bootstrapping control plane node $TARGET_IP..." talos config endpoint "$TARGET_IP" # Attempt to bootstrap the cluster if talosctl bootstrap --nodes "$TARGET_IP" 2>&1 | tee /tmp/bootstrap_output.log; then print_success "Control plane node $TARGET_IP bootstrapped successfully!" else # Check if the error is because it's already bootstrapped if grep -q "etcd data directory is not empty\|AlreadyExists" /tmp/bootstrap_output.log; then print_info "Cluster is already bootstrapped on $TARGET_IP" else print_error "Failed to bootstrap control plane node $TARGET_IP" print_info "Bootstrap output:" cat /tmp/bootstrap_output.log rm -f /tmp/bootstrap_output.log continue fi fi rm -f /tmp/bootstrap_output.log talosctl config endpoint "$vip" print_info "Talos endpoint set to control plane VIP: $vip" talosctl kubeconfig "$vip" print_success "Talos kubeconfig updated for control plane VIP: $vip" fi fi else print_info "Configuration not applied. You can apply it later with:" print_info " wild-cluster-node-up $NODE_NAME --insecure" fi fi done # Register worker nodes echo "" print_info "Configure worker nodes (optional):" WORKER_COUNT=1 while true; do echo "" read -p "Do you want to register a worker node? (y/N): " -r register_worker if [[ $register_worker =~ ^[Yy]$ ]]; then # Find first available worker number while [ -n "$(wild-config "cluster.nodes.active.\"${HOSTNAME_PREFIX}worker-${WORKER_COUNT}\".role" 2>/dev/null)" ] && [ "$(wild-config "cluster.nodes.active.\"${HOSTNAME_PREFIX}worker-${WORKER_COUNT}\".role" 2>/dev/null)" != "null" ]; do WORKER_COUNT=$((WORKER_COUNT + 1)) done NODE_NAME="${HOSTNAME_PREFIX}worker-${WORKER_COUNT}" read -p "Enter current IP for worker node $NODE_NAME: " -r WORKER_IP if [ -z "$WORKER_IP" ]; then print_warning "No IP provided, skipping worker node" continue fi print_info "Running wild-node-detect for worker node $NODE_NAME ($WORKER_IP)..." # Run detection and capture both output and stderr for debugging DETECTION_OUTPUT=$(mktemp) DETECTION_ERROR=$(mktemp) if wild-node-detect "$WORKER_IP" >"$DETECTION_OUTPUT" 2>"$DETECTION_ERROR"; then WORKER_INFO=$(cat "$DETECTION_OUTPUT") print_success "Worker node $NODE_NAME detected at IP $WORKER_IP" rm -f "$DETECTION_OUTPUT" "$DETECTION_ERROR" else print_error "Failed to detect hardware for worker node $NODE_NAME ($WORKER_IP)" print_info "Detection error output:" cat "$DETECTION_ERROR" >&2 print_info "Make sure the node is running in maintenance mode and accessible" rm -f "$DETECTION_OUTPUT" "$DETECTION_ERROR" continue fi if [ -n "$WORKER_INFO" ]; then # Parse JSON response INTERFACE=$(echo "$WORKER_INFO" | jq -r '.interface') SELECTED_DISK=$(echo "$WORKER_INFO" | jq -r '.selected_disk') AVAILABLE_DISKS=$(echo "$WORKER_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | paste -sd, -) print_success "Hardware detected for worker node $NODE_NAME:" print_info " - Interface: $INTERFACE" print_info " - Available disks: $AVAILABLE_DISKS" print_info " - Selected disk: $SELECTED_DISK" # Allow user to override disk selection echo "" read -p "Use selected disk '$SELECTED_DISK'? (Y/n): " -r use_disk if [[ $use_disk =~ ^[Nn]$ ]]; then echo "Available disks:" echo "$WORKER_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | nl -w2 -s') ' read -p "Enter disk number: " -r disk_num SELECTED_DISK=$(echo "$WORKER_INFO" | jq -r ".disks[$((disk_num-1))].path") if [ "$SELECTED_DISK" = "null" ] || [ -z "$SELECTED_DISK" ]; then print_error "Invalid disk selection" continue fi print_info "Selected disk: $SELECTED_DISK" fi # Update config.yaml with worker hardware info print_info "Updating config.yaml for worker node $NODE_NAME..." # Store under unified cluster.nodes.active. wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "worker" wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".currentIp" "$WORKER_IP" wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".interface" "$INTERFACE" wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".disk" "$SELECTED_DISK" # Copy current Talos version and schematic ID to this node current_talos_version=$(wild-config "cluster.nodes.talos.version") current_schematic_id=$(wild-config "cluster.nodes.talos.schematicId") if [ -n "$current_talos_version" ] && [ "$current_talos_version" != "null" ]; then wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".version" "$current_talos_version" fi if [ -n "$current_schematic_id" ] && [ "$current_schematic_id" != "null" ]; then wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".schematicId" "$current_schematic_id" fi print_success "Worker node $NODE_NAME registered successfully:" print_info " - Name: $NODE_NAME" print_info " - IP: $WORKER_IP" print_info " - Interface: $INTERFACE" print_info " - Disk: $SELECTED_DISK" # Generate machine config immediately print_info "Generating machine configuration for $NODE_NAME..." if wild-cluster-node-patch-generate "$NODE_NAME"; then print_success "Machine configuration generated for $NODE_NAME" # Ask if user wants to apply the configuration now echo "" read -p "Apply configuration to worker node $NODE_NAME now? (y/N): " -r apply_config if [[ $apply_config =~ ^[Yy]$ ]]; then # Worker nodes are typically in maintenance mode during setup print_info "Applying configuration in insecure mode (maintenance mode)..." wild-cluster-node-up "$NODE_NAME" --insecure else print_info "Configuration not applied. You can apply it later with:" print_info " wild-cluster-node-up $NODE_NAME --insecure" fi else print_warning "Failed to generate machine configuration for $NODE_NAME" fi else print_error "Failed to detect hardware for worker node $NODE_NAME" continue fi WORKER_COUNT=$((WORKER_COUNT + 1)) else break fi done print_success "Completed Node hardware detection" echo "" else print_info "Skipping Node Hardware Detection" fi # ============================================================================= # COMPLETION # ============================================================================= print_header "Wild Cloud Cluster Setup Complete!" print_success "Cluster infrastructure setup completed!" echo ""