Simplifies cluster service setup.

This commit is contained in:
2025-09-28 15:25:00 -07:00
parent 912a877051
commit 838903e27d
9 changed files with 458 additions and 615 deletions

View File

@@ -61,12 +61,15 @@ else
init_wild_env
fi
print_header "Wild Cloud Cluster Setup"
# =============================================================================
# Configuration
# =============================================================================
print_header "Configuration"
prompt_if_unset_config "operator.email" "Operator email address"
prompt_if_unset_config "cluster.name" "Cluster name" "wild-cluster"
@@ -134,59 +137,39 @@ fi
if [ "${SKIP_HARDWARE}" = false ]; then
print_header "Control Plane Configuration"
print_header "Control node registration"
# Automatically configure the first three IPs after VIP for control plane nodes
vip_last_octet=$(echo "$vip" | cut -d. -f4)
vip_prefix=$(echo "$vip" | cut -d. -f1-3)
# Detect and register control plane nodes
print_header "Control Plane Node Registration"
# Process each control plane node
for i in 1 2 3; do
NODE_NAME="${HOSTNAME_PREFIX}control-${i}"
TARGET_IP="${vip_prefix}.$(( vip_last_octet + i ))"
print_info "Registering control plane node: $NODE_NAME (IP: $TARGET_IP)"
print_info "Checking for control plane node: $NODE_NAME (IP: $TARGET_IP)"
# Initialize the node in cluster.nodes.active if not already present
if [ -z "$(wild-config "cluster.nodes.active.\"${NODE_NAME}\".role")" ]; then
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "controlplane"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$TARGET_IP"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".currentIp" "$TARGET_IP"
fi
# Check if node is already configured
existing_interface=$(wild-config "cluster.nodes.active.\"${NODE_NAME}\".interface")
if [ -n "$existing_interface" ] && [ "$existing_interface" != "null" ]; then
print_success "Node $NODE_NAME already configured"
print_info " - Interface: $existing_interface"
print_info " - Disk: $(wild-config "cluster.nodes.active.\"${NODE_NAME}\".disk")"
# Generate machine config patch for this node if necessary.
NODE_SETUP_DIR="${WC_HOME}/setup/cluster-nodes"
CONFIG_FILE="${NODE_SETUP_DIR}/patch/${NODE_NAME}.yaml"
if [ ! -f "$CONFIG_FILE" ]; then
print_info "Generating missing machine configuration patch for $NODE_NAME..."
if wild-cluster-node-patch-generate "$NODE_NAME"; then
print_success "Machine configuration patch generated for $NODE_NAME"
else
print_warning "Failed to generate machine configuration patch for $NODE_NAME"
fi
else
print_info " ✓ Machine configuration patch exists: $CONFIG_FILE"
fi
if wild-config --check "cluster.nodes.active.${NODE_NAME}.interface"; then
print_success "Node $NODE_NAME already registered."
continue
fi
read -p "Do you want to bring up control plane node $NODE_NAME ($TARGET_IP) now? (y/N): " -r register_node
if [[ ! $register_node =~ ^[Yy]$ ]]; then
if ! wild-config --check "cluster.nodes.active.${NODE_NAME}.role"; then
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "controlplane"
fi
if ! wild-config --check "cluster.nodes.active.${NODE_NAME}.targetIp"; then
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$TARGET_IP"
fi
print_info "${NODE_NAME} not found. Please ensure the node is powered on and running Talos in maintenance mode."
read -p "Is $NODE_NAME in maintenance mode now? (Y/n): " -r register_node
if [[ $register_node =~ ^[Nn]$ ]]; then
print_info "Skipping bringing up node $NODE_NAME registration"
continue
fi
# Register node in config.yaml.
# First try to detect at target IP.
# Detect node hardware
print_info "Attempting detection at target IP $TARGET_IP..."
DETECTION_IP="$TARGET_IP"
NODE_INFO=""
@@ -195,115 +178,147 @@ if [ "${SKIP_HARDWARE}" = false ]; then
NODE_INFO=$(wild-node-detect "$TARGET_IP")
print_success "Node detected at target IP $TARGET_IP"
else
# Fall back to maintenance IP
# Fall back to current IP
print_warning "Node not accessible at target IP $TARGET_IP"
read -p "Enter maintenance IP for this node: " -r MAINTENANCE_IP
if [ -z "$MAINTENANCE_IP" ]; then
read -p "Enter current IP for this node: " -r CURRENT_IP
if [ -z "$CURRENT_IP" ]; then
print_warning "Skipping node $NODE_NAME registration"
continue
fi
print_info "Attempting detection at maintenance IP $MAINTENANCE_IP..."
if wild-node-detect "$MAINTENANCE_IP" >/dev/null 2>&1; then
NODE_INFO=$(wild-node-detect "$MAINTENANCE_IP")
DETECTION_IP="$MAINTENANCE_IP"
# Store maintenance IP for reference
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".maintenanceIp" "$MAINTENANCE_IP"
print_success "Node detected at maintenance IP $MAINTENANCE_IP"
print_info "Attempting detection at current IP $CURRENT_IP..."
if wild-node-detect "$CURRENT_IP" >/dev/null 2>&1; then
NODE_INFO=$(wild-node-detect "$CURRENT_IP")
DETECTION_IP="$CURRENT_IP"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".currentIp" "$CURRENT_IP"
print_success "Node detected at current IP $CURRENT_IP"
else
print_error "Failed to detect node at $MAINTENANCE_IP"
print_error "Failed to detect node at $CURRENT_IP"
continue
fi
fi
if [ -n "$NODE_INFO" ]; then
# Parse JSON response
INTERFACE=$(echo "$NODE_INFO" | jq -r '.interface')
SELECTED_DISK=$(echo "$NODE_INFO" | jq -r '.selected_disk')
AVAILABLE_DISKS=$(echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | paste -sd, -)
print_success "Hardware detected:"
print_info " - Interface: $INTERFACE"
print_info " - Available disks: $AVAILABLE_DISKS"
print_info " - Selected disk: $SELECTED_DISK"
# Allow user to override disk selection
echo ""
read -p "Use selected disk '$SELECTED_DISK'? (Y/n): " -r use_disk
if [[ $use_disk =~ ^[Nn]$ ]]; then
echo "Available disks:"
echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | nl -w2 -s') '
read -p "Enter disk number: " -r disk_num
SELECTED_DISK=$(echo "$NODE_INFO" | jq -r ".disks[$((disk_num-1))].path")
if [ "$SELECTED_DISK" = "null" ] || [ -z "$SELECTED_DISK" ]; then
print_error "Invalid disk selection"
continue
fi
print_info "Selected disk: $SELECTED_DISK"
fi
# Update config.yaml with hardware info.
print_info "Updating configuration for $NODE_NAME..."
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".interface" "$INTERFACE"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".disk" "$SELECTED_DISK"
# Copy current Talos version and schematic ID to this node
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".version" "$talos_version"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".schematicId" "$schematic_id"
echo ""
read -p "Bring node $NODE_NAME ($TARGET_IP) up now? (y/N): " -r apply_config
if [[ $apply_config =~ ^[Yy]$ ]]; then
if [ "$DETECTION_IP" != "$TARGET_IP" ]; then
# Node is in maintenance mode, use insecure flag
print_info "Applying configuration in insecure mode (maintenance mode)..."
wild-cluster-node-up "$NODE_NAME" --insecure
else
# Node is already configured, use secure mode
print_info "Applying configuration..."
wild-cluster-node-up "$NODE_NAME"
fi
# Bootstrap the cluster after the first node is up.
if [ "$i" -eq 1 ]; then
read -p "The cluster should be bootstrapped after the first control node is ready. Is it ready?: " -r is_ready
if [[ $is_ready =~ ^[Yy]$ ]]; then
print_info "Bootstrapping control plane node $TARGET_IP..."
talosctl config endpoint "$TARGET_IP"
# Attempt to bootstrap the cluster
if talosctl bootstrap --nodes "$TARGET_IP" 2>&1 | tee /tmp/bootstrap_output.log; then
print_success "Control plane node $TARGET_IP bootstrapped successfully!"
if ! [ -n "$NODE_INFO" ]; then
print_error "No hardware information received from node"
continue
fi
INTERFACE=$(echo "$NODE_INFO" | jq -r '.interface')
SELECTED_DISK=$(echo "$NODE_INFO" | jq -r '.selected_disk')
AVAILABLE_DISKS=$(echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | paste -sd, -)
print_success "Hardware detected:"
print_info " - Interface: $INTERFACE"
print_info " - Available disks: $AVAILABLE_DISKS"
print_info " - Selected disk: $SELECTED_DISK"
# Let the user confirm the auto-selected system disk or choose a different one
echo ""
read -p "Use selected disk '$SELECTED_DISK'? (Y/n): " -r use_disk
if [[ $use_disk =~ ^[Nn]$ ]]; then
echo "Available disks:"
echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | nl -w2 -s') '
read -p "Enter disk number: " -r disk_num
SELECTED_DISK=$(echo "$NODE_INFO" | jq -r ".disks[$((disk_num-1))].path")
if [ "$SELECTED_DISK" = "null" ] || [ -z "$SELECTED_DISK" ]; then
print_error "Invalid disk selection"
continue
fi
print_info "Selected disk: $SELECTED_DISK"
fi
# Update config.yaml with hardware info.
print_info "Updating configuration for $NODE_NAME..."
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".interface" "$INTERFACE"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".disk" "$SELECTED_DISK"
# Copy current Talos version and schematic ID to this node
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".version" "$talos_version"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".schematicId" "$schematic_id"
# The node is now configured. Bring it up.
echo ""
read -p "Bring node $NODE_NAME ($TARGET_IP) up now? (Y/n): " -r apply_config
if [[ ! $apply_config =~ ^[Nn]$ ]]; then
if [ "$DETECTION_IP" != "$TARGET_IP" ]; then
# Node was detected at an IP other than its target IP, so treat it as being in maintenance mode and use the insecure flag
print_info "Applying configuration in insecure mode (maintenance mode)..."
wild-cluster-node-up "$NODE_NAME" --insecure
else
# Node is already up, no insecure flag needed
print_info "Applying configuration..."
wild-cluster-node-up "$NODE_NAME" --force
fi
# Bootstrap the cluster after the first node is up.
if [ "$i" -eq 1 ]; then
read -p "The cluster should be bootstrapped after the first control node is ready. Is it ready? (Y/n): " -r is_ready
if [[ ! $is_ready =~ ^[Nn]$ ]]; then
print_info "Bootstrapping control plane node $TARGET_IP..."
talosctl config endpoint "$TARGET_IP"
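# Attempt to bootstrap the cluster, capturing the output so the failure branch can inspect it.
# NOTE(review): the `if` below sees tee's exit status unless `set -o pipefail` is in effect — confirm it is enabled earlier in the script.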
if talosctl bootstrap --nodes "$TARGET_IP" 2>&1 | tee /tmp/bootstrap_output.log; then
print_success "Control plane node $TARGET_IP bootstrapped successfully!"
else
# Check if the error is because it's already bootstrapped
if grep -q "etcd data directory is not empty\|AlreadyExists" /tmp/bootstrap_output.log; then
print_info "Cluster is already bootstrapped on $TARGET_IP"
else
# Check if the error is because it's already bootstrapped
if grep -q "etcd data directory is not empty\|AlreadyExists" /tmp/bootstrap_output.log; then
print_info "Cluster is already bootstrapped on $TARGET_IP"
else
print_error "Failed to bootstrap control plane node $TARGET_IP"
print_info "Bootstrap output:"
cat /tmp/bootstrap_output.log
rm -f /tmp/bootstrap_output.log
continue
print_error "Failed to bootstrap control plane node $TARGET_IP"
print_info "Bootstrap output:"
cat /tmp/bootstrap_output.log
rm -f /tmp/bootstrap_output.log
continue
fi
fi
rm -f /tmp/bootstrap_output.log
# Wait for VIP to become available before using it
print_info "Waiting for VIP $vip to become available..."
max_attempts=30
attempt=1
vip_ready=false
while [ $attempt -le $max_attempts ]; do
if ping -c 1 -W 2 "$vip" >/dev/null 2>&1; then
# VIP responds to ping, now test Talos API
if talosctl -e "$vip" -n "$vip" version >/dev/null 2>&1; then
print_success "VIP $vip is ready (attempt $attempt/$max_attempts)"
vip_ready=true
break
fi
fi
rm -f /tmp/bootstrap_output.log
print_info "VIP not ready, waiting... (attempt $attempt/$max_attempts)"
sleep 2
attempt=$((attempt + 1))
done
if [ "$vip_ready" = true ]; then
talosctl config endpoint "$vip"
print_info "Talos endpoint set to control plane VIP: $vip"
talosctl kubeconfig "$vip"
print_success "Talos kubeconfig updated for control plane VIP: $vip"
if talosctl kubeconfig "$vip"; then
print_success "Talos kubeconfig updated for control plane VIP: $vip"
else
print_error "Failed to get kubeconfig from VIP: $vip"
print_info "You can try again later with: talosctl kubeconfig $vip"
fi
else
print_error "VIP $vip did not become available after $max_attempts attempts"
print_warning "Falling back to direct node access"
print_info "Talos endpoint remains set to: $TARGET_IP"
print_info "You can try switching to VIP later with: talosctl config endpoint $vip"
fi
fi
else
print_info "Configuration not applied. You can apply it later with:"
print_info " wild-cluster-node-up $NODE_NAME --insecure"
fi
else
print_info "Configuration not applied. You can apply it later with:"
print_info " wild-cluster-node-up $NODE_NAME --insecure"
fi
done
# Register worker nodes
@@ -377,6 +392,7 @@ if [ "${SKIP_HARDWARE}" = false ]; then
# Store under unified cluster.nodes.active.<node-name>
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "worker"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".currentIp" "$WORKER_IP"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$WORKER_IP"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".interface" "$INTERFACE"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".disk" "$SELECTED_DISK"
@@ -397,8 +413,8 @@ if [ "${SKIP_HARDWARE}" = false ]; then
# Ask if user wants to apply the configuration now
echo ""
read -p "Apply configuration to worker node $NODE_NAME now? (y/N): " -r apply_config
if [[ $apply_config =~ ^[Yy]$ ]]; then
read -p "Apply configuration to worker node $NODE_NAME now? (Y/n): " -r apply_config
if [[ $apply_config =~ ^[Yy]$ ]] || [[ -z "$apply_config" ]]; then
# Worker nodes are typically in maintenance mode during setup
print_info "Applying configuration in insecure mode (maintenance mode)..."
wild-cluster-node-up "$NODE_NAME" --insecure