#!/bin/bash

# Set up Kubernetes cluster infrastructure for a Wild-Cloud project.
#
# Phases performed by this script:
#   * Installer image generation (runs wild-cluster-node-image-create)
#   * Node hardware detection: prompts for network and cluster settings,
#     registers control plane and worker nodes via wild-node-detect, generates
#     their machine configs and optionally applies them.
#
# Each phase can be skipped with a --skip-* flag; run with --help for details.

set -e
set -o pipefail

# Source common utilities
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/wild-common.sh"

# Initialize Wild-Cloud environment
init_wild_env

# =============================================================================
# Helpers
# =============================================================================

# Succeeds when a config lookup result ($1) holds no usable value. This script
# treats both the empty string and the literal "null" as "not configured".
is_config_unset() {
    [ -z "$1" ] || [ "$1" = "null" ]
}

# Prints the source address the kernel would use to reach 8.8.8.8, or
# 192.168.1.100 when it cannot be determined. The address is located by the
# "src" keyword rather than a fixed column because the column shifts when the
# route has no "via" hop.
detect_local_ip() {
    local addr
    addr=$(ip route get 8.8.8.8 2>/dev/null |
        awk '{ for (i = 1; i < NF; i++) if ($i == "src") { print $(i + 1); exit } }') || true
    printf '%s\n' "${addr:-192.168.1.100}"
}

# Prints the gateway of the default route, or 192.168.1.1 when none is found.
detect_gateway_ip() {
    local gateway
    gateway=$(ip route 2>/dev/null | awk '$1 == "default" { print $3; exit }') || true
    printf '%s\n' "${gateway:-192.168.1.1}"
}

# Succeeds when $1 is a dotted-quad IPv4 address whose last octet leaves room
# for the three control plane node addresses derived from it (VIP+1 .. VIP+3).
is_usable_control_vip() {
    local octet
    [[ $1 =~ ^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$ ]] || return 1
    for octet in "${BASH_REMATCH[@]:1}"; do
        # 10# forces base 10 so an octet such as "090" is not parsed as octal.
        ((10#$octet <= 255)) || return 1
    done
    ((10#${BASH_REMATCH[4]} <= 252))
}

# Shows the hardware reported in the detection JSON ($1) under the heading
# ($2) and lets the user override the pre-selected disk.
#
# Sets the globals INTERFACE, SELECTED_DISK and AVAILABLE_DISKS, and sets
# HARDWARE_CONFIRMED to "true" only when a usable disk was chosen.
# Always returns 0 so that callers can invoke it as a plain command and keep
# `set -e` active for the jq calls inside.
confirm_node_hardware() {
    local node_json=$1 heading=$2
    local use_disk disk_num disk_index

    HARDWARE_CONFIRMED=false

    # Parse JSON response
    INTERFACE=$(jq -r '.interface' <<<"$node_json")
    SELECTED_DISK=$(jq -r '.selected_disk' <<<"$node_json")
    AVAILABLE_DISKS=$(jq -r '.disks | join(", ")' <<<"$node_json")

    print_success "$heading"
    print_info " - Interface: $INTERFACE"
    print_info " - Available disks: $AVAILABLE_DISKS"
    print_info " - Selected disk: $SELECTED_DISK"

    # Allow user to override disk selection
    echo ""
    read -p "Use selected disk '$SELECTED_DISK'? (Y/n): " -r use_disk
    if [[ $use_disk =~ ^[Nn]$ ]]; then
        echo "Available disks:"
        jq -r '.disks[]' <<<"$node_json" | nl -w2 -s') '
        read -p "Enter disk number: " -r disk_num

        # Only accept a plain positive number. Without this check an empty,
        # zero or non-numeric answer becomes index -1, which jq resolves to
        # the LAST disk instead of reporting an error.
        if [[ ! $disk_num =~ ^[0-9]{1,4}$ ]]; then
            print_error "Invalid disk selection"
            return 0
        fi
        disk_index=$((10#$disk_num - 1))
        if ((disk_index < 0)); then
            print_error "Invalid disk selection"
            return 0
        fi

        SELECTED_DISK=$(jq -r ".disks[${disk_index}]" <<<"$node_json")
        if [ "$SELECTED_DISK" = "null" ] || [ -z "$SELECTED_DISK" ]; then
            print_error "Invalid disk selection"
            return 0
        fi
        print_info "Selected disk: $SELECTED_DISK"
    fi

    HARDWARE_CONFIRMED=true
}

# =============================================================================
# Argument parsing
# =============================================================================

# Phase tracking variables
SKIP_INSTALLER=false
SKIP_HARDWARE=false
# NOTE: --skip-configs is still accepted for compatibility, but SKIP_CONFIGS is
# not consulted anywhere in this script as shown; machine configs are generated
# as part of node registration.
SKIP_CONFIGS=false

while [[ $# -gt 0 ]]; do
    case $1 in
        --skip-installer)
            SKIP_INSTALLER=true
            shift
            ;;
        --skip-hardware)
            SKIP_HARDWARE=true
            shift
            ;;
        --skip-configs)
            SKIP_CONFIGS=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [phase-options]"
            echo ""
            echo "Set up Kubernetes cluster infrastructure (Phases 1-3)."
            echo ""
            echo "Control Options:"
            echo " --skip-installer Skip Installer image generation"
            echo " --skip-hardware Skip Node hardware detection"
            echo " --skip-configs Skip Machine config generation"
            echo " -h, --help Show this help message"
            echo ""
            echo "Prerequisites:"
            echo " - Run 'wild-setup-scaffold' first to initialize the cloud"
            echo ""
            echo "After completion:"
            echo " - Run 'wild-setup-services' to install cluster services"
            exit 0
            ;;
        -*)
            echo "Unknown option $1"
            echo "Usage: $0 [phase-options]"
            echo "Use --help for full usage information"
            exit 1
            ;;
        *)
            echo "Unexpected argument: $1"
            echo "Usage: $0 [phase-options]"
            echo "Use --help for full usage information"
            exit 1
            ;;
    esac
done

# Check if we're in a wild-cloud directory
check_wild_directory

# Check basic configuration
check_basic_config

print_header "Wild-Cloud Cluster Setup"
print_info "Setting up cluster infrastructure"
echo ""

# =============================================================================
# Installer Image Generation
# =============================================================================

if [ "${SKIP_INSTALLER}" = false ]; then
    print_header "Installer Image Generation"
    print_info "Running wild-cluster-node-image-create..."
    wild-cluster-node-image-create
    print_success "Installer image generated"
    echo ""
else
    print_info "Skipping: Installer Image Generation"
fi

# =============================================================================
# Node Hardware Detection
# =============================================================================

if [ "${SKIP_HARDWARE}" = false ]; then

    print_header "Node Hardware Detection"

    # Detect the local network once; the first three octets seed the suggested
    # defaults in every prompt below.
    CURRENT_IP=$(detect_local_ip)
    SUBNET_PREFIX=${CURRENT_IP%.*}

    # Configure network settings
    if is_config_unset "$(get_current_config "cloud.router.ip")"; then
        print_header "Network Configuration"

        GATEWAY_IP=$(detect_gateway_ip)

        current_router_ip=$(get_current_config "cloud.router.ip")
        router_ip=$(prompt_with_default "Router/Gateway IP" "${GATEWAY_IP}" "${current_router_ip}")
        wild-config-set "cloud.router.ip" "${router_ip}"

        current_dns_ip=$(get_current_config "cloud.dns.ip")
        dns_ip=$(prompt_with_default "DNS server IP (dnsmasq machine)" "${SUBNET_PREFIX}.50" "${current_dns_ip}")
        wild-config-set "cloud.dns.ip" "${dns_ip}"

        current_dhcp_range=$(get_current_config "cloud.dhcpRange")
        dhcp_range=$(prompt_with_default "DHCP range for dnsmasq" "${SUBNET_PREFIX}.100,${SUBNET_PREFIX}.200" "${current_dhcp_range}")
        wild-config-set "cloud.dhcpRange" "${dhcp_range}"

        current_interface=$(get_current_config "cloud.dnsmasq.interface")
        interface=$(prompt_with_default "Network interface for dnsmasq" "eth0" "${current_interface}")
        wild-config-set "cloud.dnsmasq.interface" "${interface}"

        current_external_resolver=$(get_current_config "cloud.dns.externalResolver")
        external_resolver=$(prompt_with_default "External DNS resolver" "1.1.1.1" "${current_external_resolver}")
        wild-config-set "cloud.dns.externalResolver" "${external_resolver}"

        print_success "Network configuration completed"
        echo ""
    fi

    # Configure cluster settings
    print_header "Kubernetes Cluster Configuration"

    # Talos version
    current_talos_version=$(get_current_config "cluster.nodes.talos.version")
    if is_config_unset "$current_talos_version"; then
        talos_version=$(prompt_with_default "Talos version" "v1.10.4" "${current_talos_version}")
        wild-config-set "cluster.nodes.talos.version" "${talos_version}"
    else
        talos_version="$current_talos_version"
    fi

    # MetalLB IP address pool
    current_ip_pool=$(get_current_config "cluster.ipAddressPool")
    if is_config_unset "$current_ip_pool"; then
        ip_pool=$(prompt_with_default "MetalLB IP address pool" "${SUBNET_PREFIX}.80-${SUBNET_PREFIX}.89" "${current_ip_pool}")
        wild-config-set "cluster.ipAddressPool" "${ip_pool}"
    else
        ip_pool="$current_ip_pool"
    fi

    # Load balancer IP (automatically set to first address in the pool)
    current_lb_ip=$(get_current_config "cluster.loadBalancerIp")
    if is_config_unset "$current_lb_ip"; then
        lb_ip=${ip_pool%%-*}
        wild-config-set "cluster.loadBalancerIp" "${lb_ip}"
        print_info "Set load balancer IP to: ${lb_ip} (first IP in MetalLB pool)"
    fi

    # Talos schematic ID
    current_schematic_id=$(get_current_config "cluster.nodes.talos.schematicId")
    if is_config_unset "$current_schematic_id"; then
        echo ""
        print_info "Get your Talos schematic ID from: https://factory.talos.dev/"
        print_info "This customizes Talos with the drivers needed for your hardware."

        # Look up default schematic ID from talos-schemas.yaml
        default_schematic_id=""
        schemas_file="${WC_ROOT}/setup/cluster-nodes/talos-schemas.yaml"
        if [ -f "$schemas_file" ]; then
            default_schematic_id=$(yq eval ".talos-schemas.\"${talos_version}\"" "$schemas_file" 2>/dev/null)
            if is_config_unset "$default_schematic_id"; then
                default_schematic_id=""
            else
                print_info "Default schematic ID available for Talos $talos_version"
            fi
        fi

        schematic_id=$(prompt_with_default "Talos schematic ID" "${default_schematic_id}" "${current_schematic_id}")
        wild-config-set "cluster.nodes.talos.schematicId" "${schematic_id}"
    fi

    # External DNS
    current_owner_id=$(get_current_config "cluster.externalDns.ownerId")
    if is_config_unset "$current_owner_id"; then
        cluster_name=$(get_current_config "cluster.name")
        owner_id=$(prompt_with_default "External DNS owner ID" "external-dns-${cluster_name}" "${current_owner_id}")
        wild-config-set "cluster.externalDns.ownerId" "${owner_id}"
    fi

    print_success "Cluster configuration completed"
    echo ""

    print_info "This phase will help you register Talos nodes by discovering their hardware."
    print_info "You'll need nodes booted in maintenance mode and accessible via IP."
    echo ""

    # Configure control plane network topology first
    if is_config_unset "$(get_current_config "cluster.nodes.control.vip")"; then
        print_header "Control Plane Network Configuration"

        print_info "Configure control plane nodes (you need at least 3 for HA):"
        echo ""

        current_vip=$(get_current_config "cluster.nodes.control.vip")
        vip=$(prompt_with_default "Control plane virtual IP" "${SUBNET_PREFIX}.90" "${current_vip}")

        # Validate before persisting: the node addresses below are derived
        # from the VIP by arithmetic on its last octet.
        if ! is_usable_control_vip "$vip"; then
            print_error "Invalid control plane virtual IP '$vip': expected an IPv4 address with a last octet of 252 or less"
            exit 1
        fi
        wild-config-set "cluster.nodes.control.vip" "${vip}"

        # Automatically configure the first three IPs after VIP for control plane nodes
        vip_last_octet=${vip##*.}
        vip_prefix=${vip%.*}

        print_info "Configuring control plane nodes using consecutive IPs after VIP:"
        for i in 1 2 3; do
            node_ip="${vip_prefix}.$((10#$vip_last_octet + i))"
            print_info " Control plane node $i: $node_ip"

            # Initialize the node in cluster.nodes.active if not already present
            if is_config_unset "$(get_current_config "cluster.nodes.active.\"${node_ip}\".control")"; then
                wild-config-set "cluster.nodes.active.\"${node_ip}\".control" "true"
            fi
        done

        print_success "Control plane network configuration completed"
        echo ""
    fi

    # # Generate initial cluster configuration
    # print_header "Cluster Configuration Generation"
    # print_info "Generating base cluster configuration with talosctl gen config..."
    # wild-cluster-config-generate

    # Detect and register control plane nodes
    print_header "Control Plane Node Registration"

    # Get VIP to determine control plane IPs
    vip=$(get_current_config "cluster.nodes.control.vip")
    if is_config_unset "$vip"; then
        print_error "VIP not configured. Run control plane network configuration first."
        exit 1
    fi
    # The stored VIP may predate this script's validation, so check it again
    # before deriving node addresses from it.
    if ! is_usable_control_vip "$vip"; then
        print_error "Configured VIP '$vip' is not usable: expected an IPv4 address with a last octet of 252 or less"
        exit 1
    fi

    vip_last_octet=${vip##*.}
    vip_prefix=${vip%.*}

    # Process each control plane node IP
    for i in 1 2 3; do
        TARGET_IP="${vip_prefix}.$((10#$vip_last_octet + i))"
        echo ""
        print_info "Registering control plane node: $TARGET_IP"

        # Check if node is already configured
        existing_interface=$(get_current_config "cluster.nodes.active.\"${TARGET_IP}\".interface")
        if ! is_config_unset "$existing_interface"; then
            print_success "Node $TARGET_IP already configured"
            print_info " - Interface: $existing_interface"
            print_info " - Disk: $(get_current_config "cluster.nodes.active.\"${TARGET_IP}\".disk")"

            # Generate machine config for this node if necessary.
            NODE_SETUP_DIR="${WC_HOME}/setup/cluster-nodes"
            CONFIG_FILE="${NODE_SETUP_DIR}/final/${TARGET_IP}.yaml"

            if [ ! -f "$CONFIG_FILE" ]; then
                print_info "Generating missing machine configuration for $TARGET_IP..."
                if wild-cluster-node-machine-config-generate "$TARGET_IP"; then
                    print_success "Machine configuration generated for $TARGET_IP"
                else
                    print_warning "Failed to generate machine configuration for $TARGET_IP"
                fi
            else
                print_info " ✓ Machine config exists: $CONFIG_FILE"
            fi
            continue
        fi

        read -p "Do you want to register control plane node $TARGET_IP now? (y/N): " -r register_node
        if [[ ! $register_node =~ ^[Yy]$ ]]; then
            print_info "Skipping node $TARGET_IP registration"
            continue
        fi

        # Register node in config.yaml.
        # First try to detect at target IP. Detection runs once and its output
        # is captured directly, so a flaky second probe cannot abort the script.
        print_info "Attempting detection at target IP $TARGET_IP..."
        DETECTION_IP="$TARGET_IP"

        if NODE_INFO=$(wild-node-detect "$TARGET_IP" 2>/dev/null); then
            print_success "Node detected at target IP $TARGET_IP"
        else
            NODE_INFO=""

            # Fall back to maintenance IP
            print_warning "Node not accessible at target IP $TARGET_IP"
            read -p "Enter maintenance IP for this node: " -r MAINTENANCE_IP

            if [ -z "$MAINTENANCE_IP" ]; then
                print_warning "Skipping node $TARGET_IP registration"
                continue
            fi

            print_info "Attempting detection at maintenance IP $MAINTENANCE_IP..."
            if NODE_INFO=$(wild-node-detect "$MAINTENANCE_IP" 2>/dev/null); then
                DETECTION_IP="$MAINTENANCE_IP"

                # Store maintenance IP for reference
                wild-config-set "cluster.nodes.active.\"${TARGET_IP}\".maintenanceIp" "$MAINTENANCE_IP"
                print_success "Node detected at maintenance IP $MAINTENANCE_IP"
            else
                print_error "Failed to detect node at $MAINTENANCE_IP"
                continue
            fi
        fi

        if [ -n "$NODE_INFO" ]; then
            confirm_node_hardware "$NODE_INFO" "Hardware detected:"
            if [ "$HARDWARE_CONFIRMED" != true ]; then
                continue
            fi

            # Update config.yaml with hardware info.
            print_info "Updating configuration for $TARGET_IP..."
            wild-config-set "cluster.nodes.active.\"${TARGET_IP}\".interface" "$INTERFACE"
            wild-config-set "cluster.nodes.active.\"${TARGET_IP}\".disk" "$SELECTED_DISK"
            wild-config-set "cluster.nodes.active.\"${TARGET_IP}\".control" "true"

            print_success "Node $TARGET_IP registered successfully"

            # Generate machine config.
            print_info "Generating machine configuration for $TARGET_IP..."
            if wild-cluster-node-machine-config-generate "$TARGET_IP"; then
                print_success "Machine configuration generated for $TARGET_IP"

                # Ask if user wants to apply the configuration now
                echo ""
                read -p "Apply configuration to node $TARGET_IP now? (y/N): " -r apply_config
                if [[ $apply_config =~ ^[Yy]$ ]]; then
                    if [ "$DETECTION_IP" != "$TARGET_IP" ]; then
                        # Node is in maintenance mode, use insecure flag
                        print_info "Applying configuration in insecure mode (maintenance mode)..."
                        wild-cluster-node-up "$TARGET_IP" --insecure
                    else
                        # Node is already configured, use secure mode
                        print_info "Applying configuration..."
                        wild-cluster-node-up "$TARGET_IP"
                    fi
                else
                    print_info "Configuration not applied. You can apply it later with:"
                    print_info " wild-cluster-node-up $TARGET_IP --insecure"
                fi
            else
                print_warning "Failed to generate machine configuration for $TARGET_IP"
            fi
        fi
    done

    # Register worker nodes
    echo ""
    print_info "Configure worker nodes (optional):"
    while true; do
        echo ""
        read -p "Do you want to register a worker node? (y/N): " -r register_worker
        if [[ ! $register_worker =~ ^[Yy]$ ]]; then
            break
        fi

        read -p "Enter maintenance IP for worker node: " -r WORKER_IP
        if [ -z "$WORKER_IP" ]; then
            print_warning "No IP provided, skipping worker node"
            continue
        fi

        print_info "Running wild-node-detect for worker node $WORKER_IP..."

        # Run detection; stderr is kept in a temp file so it can be shown to
        # the user for debugging when detection fails.
        DETECTION_ERROR=$(mktemp)
        if WORKER_INFO=$(wild-node-detect "$WORKER_IP" 2>"$DETECTION_ERROR"); then
            print_success "Worker node detected at IP $WORKER_IP"
            rm -f "$DETECTION_ERROR"
        else
            print_error "Failed to detect hardware for worker node $WORKER_IP"
            print_info "Detection error output:"
            cat "$DETECTION_ERROR" >&2
            print_info "Make sure the node is running in maintenance mode and accessible"
            rm -f "$DETECTION_ERROR"
            continue
        fi

        if [ -z "$WORKER_INFO" ]; then
            print_error "Failed to detect hardware for worker node $WORKER_IP"
            continue
        fi

        confirm_node_hardware "$WORKER_INFO" "Hardware detected for worker node $WORKER_IP:"
        if [ "$HARDWARE_CONFIRMED" != true ]; then
            continue
        fi

        # Update config.yaml with worker hardware info
        print_info "Updating config.yaml for worker node $WORKER_IP..."

        # Store under unified cluster.nodes.active.
        wild-config-set "cluster.nodes.active.\"${WORKER_IP}\".interface" "$INTERFACE"
        wild-config-set "cluster.nodes.active.\"${WORKER_IP}\".disk" "$SELECTED_DISK"
        wild-config-set "cluster.nodes.active.\"${WORKER_IP}\".control" "false"

        print_success "Worker node $WORKER_IP registered successfully:"
        print_info " - IP: $WORKER_IP"
        print_info " - Interface: $INTERFACE"
        print_info " - Disk: $SELECTED_DISK"

        # Generate machine config immediately
        print_info "Generating machine configuration for $WORKER_IP..."
        if wild-cluster-node-machine-config-generate "$WORKER_IP"; then
            print_success "Machine configuration generated for $WORKER_IP"

            # Ask if user wants to apply the configuration now
            echo ""
            read -p "Apply configuration to worker node $WORKER_IP now? (y/N): " -r apply_config
            if [[ $apply_config =~ ^[Yy]$ ]]; then
                # Worker nodes are typically in maintenance mode during setup
                print_info "Applying configuration in insecure mode (maintenance mode)..."
                wild-cluster-node-up "$WORKER_IP" --insecure
            else
                print_info "Configuration not applied. You can apply it later with:"
                print_info " wild-cluster-node-up $WORKER_IP --insecure"
            fi
        else
            print_warning "Failed to generate machine configuration for $WORKER_IP"
        fi
    done

    print_success "Completed Node hardware detection"
    echo ""
else
    print_info "Skipping Node Hardware Detection"
fi

# =============================================================================
# COMPLETION
# =============================================================================

print_header "Wild-Cloud Cluster Setup Complete!"

print_success "Cluster infrastructure setup completed!"
echo ""