Simplifies cluster service setup.
@@ -61,12 +61,15 @@ else
     init_wild_env
 fi
 
 
 print_header "Wild Cloud Cluster Setup"
 
 # =============================================================================
 # Configuration
 # =============================================================================
 
+print_header "Configuration"
+
+prompt_if_unset_config "operator.email" "Operator email address"
 
 prompt_if_unset_config "cluster.name" "Cluster name" "wild-cluster"
@@ -134,59 +137,39 @@ fi
 
 if [ "${SKIP_HARDWARE}" = false ]; then
 
-    print_header "Control Plane Configuration"
+    print_header "Control node registration"
 
     # Automatically configure the first three IPs after VIP for control plane nodes
     vip_last_octet=$(echo "$vip" | cut -d. -f4)
    vip_prefix=$(echo "$vip" | cut -d. -f1-3)
 
-    # Detect and register control plane nodes
-    print_header "Control Plane Node Registration"
-
-    # Process each control plane node
     for i in 1 2 3; do
         NODE_NAME="${HOSTNAME_PREFIX}control-${i}"
         TARGET_IP="${vip_prefix}.$(( vip_last_octet + i ))"
-        print_info "Registering control plane node: $NODE_NAME (IP: $TARGET_IP)"
+        print_info "Checking for control plane node: $NODE_NAME (IP: $TARGET_IP)"
 
-        # Initialize the node in cluster.nodes.active if not already present
-        if [ -z "$(wild-config "cluster.nodes.active.\"${NODE_NAME}\".role")" ]; then
-            wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "controlplane"
-            wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$TARGET_IP"
-            wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".currentIp" "$TARGET_IP"
-        fi
-
-        # Check if node is already configured
-        existing_interface=$(wild-config "cluster.nodes.active.\"${NODE_NAME}\".interface")
-        if [ -n "$existing_interface" ] && [ "$existing_interface" != "null" ]; then
-            print_success "Node $NODE_NAME already configured"
-            print_info "  - Interface: $existing_interface"
-            print_info "  - Disk: $(wild-config "cluster.nodes.active.\"${NODE_NAME}\".disk")"
-
-            # Generate machine config patch for this node if necessary.
-            NODE_SETUP_DIR="${WC_HOME}/setup/cluster-nodes"
-            CONFIG_FILE="${NODE_SETUP_DIR}/patch/${NODE_NAME}.yaml"
-            if [ ! -f "$CONFIG_FILE" ]; then
-                print_info "Generating missing machine configuration patch for $NODE_NAME..."
-                if wild-cluster-node-patch-generate "$NODE_NAME"; then
-                    print_success "Machine configuration patch generated for $NODE_NAME"
-                else
-                    print_warning "Failed to generate machine configuration patch for $NODE_NAME"
-                fi
-            else
-                print_info "  ✓ Machine configuration patch exists: $CONFIG_FILE"
-            fi
+        if wild-config --check "cluster.nodes.active.${NODE_NAME}.interface"; then
+            print_success "Node $NODE_NAME already registered."
+            continue
+        fi
 
-        read -p "Do you want to bring up control plane node $NODE_NAME ($TARGET_IP) now? (y/N): " -r register_node
-        if [[ ! $register_node =~ ^[Yy]$ ]]; then
+        if ! wild-config --check "cluster.nodes.active.${NODE_NAME}.role"; then
+            wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "controlplane"
+        fi
+
+        if ! wild-config --check "cluster.nodes.active.${NODE_NAME}.targetIp"; then
+            wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$TARGET_IP"
+        fi
+
+        print_info "${NODE_NAME} not found. Please ensure the node is powered on and running Talos in maintenance mode."
+        read -p "Is $NODE_NAME in maintenance mode now? (Y/n): " -r register_node
+        if [[ $register_node =~ ^[Nn]$ ]]; then
             print_info "Skipping bringing up node $NODE_NAME registration"
             continue
         fi
 
-        # Register node in config.yaml.
-        # First try to detect at target IP.
+        # Detect node hardware
         print_info "Attempting detection at target IP $TARGET_IP..."
         DETECTION_IP="$TARGET_IP"
         NODE_INFO=""
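A worked example of the VIP arithmetic in the hunk above may help; the address is hypothetical, not taken from the commit. The loop hands the three addresses immediately following the VIP to the control nodes:

    vip="192.168.1.30"                            # hypothetical VIP
    vip_last_octet=$(echo "$vip" | cut -d. -f4)   # -> 30
    vip_prefix=$(echo "$vip" | cut -d. -f1-3)     # -> 192.168.1
    for i in 1 2 3; do
        echo "control-${i} -> ${vip_prefix}.$(( vip_last_octet + i ))"
    done
    # control-1 -> 192.168.1.31
    # control-2 -> 192.168.1.32
    # control-3 -> 192.168.1.33
    # Note: nothing guards against octet overflow, so a VIP whose last
    # octet is 253 or higher would compute octets past 255.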
@@ -195,115 +178,147 @@ if [ "${SKIP_HARDWARE}" = false ]; then
|
||||
NODE_INFO=$(wild-node-detect "$TARGET_IP")
|
||||
print_success "Node detected at target IP $TARGET_IP"
|
||||
else
|
||||
# Fall back to maintenance IP
|
||||
# Fall back to current IP
|
||||
print_warning "Node not accessible at target IP $TARGET_IP"
|
||||
read -p "Enter maintenance IP for this node: " -r MAINTENANCE_IP
|
||||
|
||||
if [ -z "$MAINTENANCE_IP" ]; then
|
||||
read -p "Enter current IP for this node: " -r CURRENT_IP
|
||||
|
||||
if [ -z "$CURRENT_IP" ]; then
|
||||
print_warning "Skipping node $NODE_NAME registration"
|
||||
continue
|
||||
fi
|
||||
|
||||
print_info "Attempting detection at maintenance IP $MAINTENANCE_IP..."
|
||||
if wild-node-detect "$MAINTENANCE_IP" >/dev/null 2>&1; then
|
||||
NODE_INFO=$(wild-node-detect "$MAINTENANCE_IP")
|
||||
DETECTION_IP="$MAINTENANCE_IP"
|
||||
|
||||
# Store maintenance IP for reference
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".maintenanceIp" "$MAINTENANCE_IP"
|
||||
print_success "Node detected at maintenance IP $MAINTENANCE_IP"
|
||||
|
||||
print_info "Attempting detection at current IP $CURRENT_IP..."
|
||||
if wild-node-detect "$CURRENT_IP" >/dev/null 2>&1; then
|
||||
NODE_INFO=$(wild-node-detect "$CURRENT_IP")
|
||||
DETECTION_IP="$CURRENT_IP"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".currentIp" "$CURRENT_IP"
|
||||
print_success "Node detected at current IP $CURRENT_IP"
|
||||
else
|
||||
print_error "Failed to detect node at $MAINTENANCE_IP"
|
||||
print_error "Failed to detect node at $CURRENT_IP"
|
||||
continue
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ -n "$NODE_INFO" ]; then
|
||||
# Parse JSON response
|
||||
INTERFACE=$(echo "$NODE_INFO" | jq -r '.interface')
|
||||
SELECTED_DISK=$(echo "$NODE_INFO" | jq -r '.selected_disk')
|
||||
AVAILABLE_DISKS=$(echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | paste -sd, -)
|
||||
|
||||
print_success "Hardware detected:"
|
||||
print_info " - Interface: $INTERFACE"
|
||||
print_info " - Available disks: $AVAILABLE_DISKS"
|
||||
print_info " - Selected disk: $SELECTED_DISK"
|
||||
|
||||
# Allow user to override disk selection
|
||||
echo ""
|
||||
read -p "Use selected disk '$SELECTED_DISK'? (Y/n): " -r use_disk
|
||||
if [[ $use_disk =~ ^[Nn]$ ]]; then
|
||||
echo "Available disks:"
|
||||
echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | nl -w2 -s') '
|
||||
read -p "Enter disk number: " -r disk_num
|
||||
SELECTED_DISK=$(echo "$NODE_INFO" | jq -r ".disks[$((disk_num-1))].path")
|
||||
if [ "$SELECTED_DISK" = "null" ] || [ -z "$SELECTED_DISK" ]; then
|
||||
print_error "Invalid disk selection"
|
||||
continue
|
||||
fi
|
||||
print_info "Selected disk: $SELECTED_DISK"
|
||||
fi
|
||||
|
||||
# Update config.yaml with hardware info.
|
||||
print_info "Updating configuration for $NODE_NAME..."
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".interface" "$INTERFACE"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".disk" "$SELECTED_DISK"
|
||||
|
||||
# Copy current Talos version and schematic ID to this node
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".version" "$talos_version"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".schematicId" "$schematic_id"
|
||||
|
||||
echo ""
|
||||
read -p "Bring node $NODE_NAME ($TARGET_IP) up now? (y/N): " -r apply_config
|
||||
if [[ $apply_config =~ ^[Yy]$ ]]; then
|
||||
if [ "$DETECTION_IP" != "$TARGET_IP" ]; then
|
||||
# Node is in maintenance mode, use insecure flag
|
||||
print_info "Applying configuration in insecure mode (maintenance mode)..."
|
||||
wild-cluster-node-up "$NODE_NAME" --insecure
|
||||
else
|
||||
# Node is already configured, use secure mode
|
||||
print_info "Applying configuration..."
|
||||
wild-cluster-node-up "$NODE_NAME"
|
||||
fi
|
||||
|
||||
# Bootstrap the cluster after the first node is up.
|
||||
if [ "$i" -eq 1 ]; then
|
||||
read -p "The cluster should be bootstrapped after the first control node is ready. Is it ready?: " -r is_ready
|
||||
if [[ $is_ready =~ ^[Yy]$ ]]; then
|
||||
print_info "Bootstrapping control plane node $TARGET_IP..."
|
||||
talosctl config endpoint "$TARGET_IP"
|
||||
|
||||
# Attempt to bootstrap the cluster
|
||||
if talosctl bootstrap --nodes "$TARGET_IP" 2>&1 | tee /tmp/bootstrap_output.log; then
|
||||
print_success "Control plane node $TARGET_IP bootstrapped successfully!"
|
||||
if ! [ -n "$NODE_INFO" ]; then
|
||||
print_error "No hardware information received from node"
|
||||
continue
|
||||
fi
|
||||
|
||||
INTERFACE=$(echo "$NODE_INFO" | jq -r '.interface')
|
||||
SELECTED_DISK=$(echo "$NODE_INFO" | jq -r '.selected_disk')
|
||||
AVAILABLE_DISKS=$(echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | paste -sd, -)
|
||||
|
||||
print_success "Hardware detected:"
|
||||
print_info " - Interface: $INTERFACE"
|
||||
print_info " - Available disks: $AVAILABLE_DISKS"
|
||||
print_info " - Selected disk: $SELECTED_DISK"
|
||||
|
||||
# User system disk selection
|
||||
echo ""
|
||||
read -p "Use selected disk '$SELECTED_DISK'? (Y/n): " -r use_disk
|
||||
if [[ $use_disk =~ ^[Nn]$ ]]; then
|
||||
echo "Available disks:"
|
||||
echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | nl -w2 -s') '
|
||||
read -p "Enter disk number: " -r disk_num
|
||||
SELECTED_DISK=$(echo "$NODE_INFO" | jq -r ".disks[$((disk_num-1))].path")
|
||||
if [ "$SELECTED_DISK" = "null" ] || [ -z "$SELECTED_DISK" ]; then
|
||||
print_error "Invalid disk selection"
|
||||
continue
|
||||
fi
|
||||
print_info "Selected disk: $SELECTED_DISK"
|
||||
fi
|
||||
|
||||
# Update config.yaml with hardware info.
|
||||
print_info "Updating configuration for $NODE_NAME..."
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".interface" "$INTERFACE"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".disk" "$SELECTED_DISK"
|
||||
|
||||
# Copy current Talos version and schematic ID to this node
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".version" "$talos_version"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".schematicId" "$schematic_id"
|
||||
|
||||
# The node is now configured. Bring it up.
|
||||
echo ""
|
||||
read -p "Bring node $NODE_NAME ($TARGET_IP) up now? (Y/n): " -r apply_config
|
||||
if [[ ! $apply_config =~ ^[Nn]$ ]]; then
|
||||
if [ "$DETECTION_IP" != "$TARGET_IP" ]; then
|
||||
# Node is in maintenance mode, use insecure flag
|
||||
print_info "Applying configuration in insecure mode (maintenance mode)..."
|
||||
wild-cluster-node-up "$NODE_NAME" --insecure
|
||||
else
|
||||
# Node is already up, no insecure flag needed
|
||||
print_info "Applying configuration..."
|
||||
wild-cluster-node-up "$NODE_NAME" --force
|
||||
fi
|
||||
|
||||
# Bootstrap the cluster after the first node is up.
|
||||
if [ "$i" -eq 1 ]; then
|
||||
read -p "The cluster should be bootstrapped after the first control node is ready. Is it ready? (Y/n): " -r is_ready
|
||||
if [[ ! $is_ready =~ ^[Nn]$ ]]; then
|
||||
print_info "Bootstrapping control plane node $TARGET_IP..."
|
||||
talosctl config endpoint "$TARGET_IP"
|
||||
|
||||
# Attempt to bootstrap the cluster
|
||||
if talosctl bootstrap --nodes "$TARGET_IP" 2>&1 | tee /tmp/bootstrap_output.log; then
|
||||
print_success "Control plane node $TARGET_IP bootstrapped successfully!"
|
||||
else
|
||||
# Check if the error is because it's already bootstrapped
|
||||
if grep -q "etcd data directory is not empty\|AlreadyExists" /tmp/bootstrap_output.log; then
|
||||
print_info "Cluster is already bootstrapped on $TARGET_IP"
|
||||
else
|
||||
# Check if the error is because it's already bootstrapped
|
||||
if grep -q "etcd data directory is not empty\|AlreadyExists" /tmp/bootstrap_output.log; then
|
||||
print_info "Cluster is already bootstrapped on $TARGET_IP"
|
||||
else
|
||||
print_error "Failed to bootstrap control plane node $TARGET_IP"
|
||||
print_info "Bootstrap output:"
|
||||
cat /tmp/bootstrap_output.log
|
||||
rm -f /tmp/bootstrap_output.log
|
||||
continue
|
||||
print_error "Failed to bootstrap control plane node $TARGET_IP"
|
||||
print_info "Bootstrap output:"
|
||||
cat /tmp/bootstrap_output.log
|
||||
rm -f /tmp/bootstrap_output.log
|
||||
continue
|
||||
fi
|
||||
fi
|
||||
rm -f /tmp/bootstrap_output.log
|
||||
|
||||
# Wait for VIP to become available before using it
|
||||
print_info "Waiting for VIP $vip to become available..."
|
||||
max_attempts=30
|
||||
attempt=1
|
||||
vip_ready=false
|
||||
|
||||
while [ $attempt -le $max_attempts ]; do
|
||||
if ping -c 1 -W 2 "$vip" >/dev/null 2>&1; then
|
||||
# VIP responds to ping, now test Talos API
|
||||
if talosctl -e "$vip" -n "$vip" version >/dev/null 2>&1; then
|
||||
print_success "VIP $vip is ready (attempt $attempt/$max_attempts)"
|
||||
vip_ready=true
|
||||
break
|
||||
fi
|
||||
fi
|
||||
rm -f /tmp/bootstrap_output.log
|
||||
print_info "VIP not ready, waiting... (attempt $attempt/$max_attempts)"
|
||||
sleep 2
|
||||
attempt=$((attempt + 1))
|
||||
done
|
||||
|
||||
if [ "$vip_ready" = true ]; then
|
||||
talosctl config endpoint "$vip"
|
||||
print_info "Talos endpoint set to control plane VIP: $vip"
|
||||
|
||||
talosctl kubeconfig "$vip"
|
||||
print_success "Talos kubeconfig updated for control plane VIP: $vip"
|
||||
if talosctl kubeconfig "$vip"; then
|
||||
print_success "Talos kubeconfig updated for control plane VIP: $vip"
|
||||
else
|
||||
print_error "Failed to get kubeconfig from VIP: $vip"
|
||||
print_info "You can try again later with: talosctl kubeconfig $vip"
|
||||
fi
|
||||
else
|
||||
print_error "VIP $vip did not become available after $max_attempts attempts"
|
||||
print_warning "Falling back to direct node access"
|
||||
print_info "Talos endpoint remains set to: $TARGET_IP"
|
||||
print_info "You can try switching to VIP later with: talosctl config endpoint $vip"
|
||||
fi
|
||||
fi
|
||||
|
||||
else
|
||||
print_info "Configuration not applied. You can apply it later with:"
|
||||
print_info " wild-cluster-node-up $NODE_NAME --insecure"
|
||||
fi
|
||||
|
||||
|
||||
else
|
||||
print_info "Configuration not applied. You can apply it later with:"
|
||||
print_info " wild-cluster-node-up $NODE_NAME --insecure"
|
||||
fi
|
||||
|
||||
done
|
||||
|
||||
# Register worker nodes
|
||||
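For reference, the jq pipelines in the hunk above assume wild-node-detect emits JSON shaped roughly like the sketch below; the field names (.interface, .selected_disk, .disks[].path, .disks[].size) come from the filters in the script, while the sample values are invented:

    NODE_INFO='{"interface":"eth0","selected_disk":"/dev/sda",
                "disks":[{"path":"/dev/sda","size":512110190592},
                         {"path":"/dev/nvme0n1","size":1024209543168}]}'
    echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | paste -sd, -
    # -> /dev/sda (512GB),/dev/nvme0n1 (1024GB)
    # Sizes are divided by 10^9, so the labels are decimal GB, not GiB.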
@@ -377,6 +392,7 @@ if [ "${SKIP_HARDWARE}" = false ]; then
|
||||
# Store under unified cluster.nodes.active.<node-name>
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "worker"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".currentIp" "$WORKER_IP"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$WORKER_IP"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".interface" "$INTERFACE"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".disk" "$SELECTED_DISK"
|
||||
|
||||
@@ -397,8 +413,8 @@ if [ "${SKIP_HARDWARE}" = false ]; then
|
||||
|
||||
# Ask if user wants to apply the configuration now
|
||||
echo ""
|
||||
read -p "Apply configuration to worker node $NODE_NAME now? (y/N): " -r apply_config
|
||||
if [[ $apply_config =~ ^[Yy]$ ]]; then
|
||||
read -p "Apply configuration to worker node $NODE_NAME now? (Y/n): " -r apply_config
|
||||
if [[ $apply_config =~ ^[Yy]$ ]] || [[ -z "$apply_config" ]]; then
|
||||
# Worker nodes are typically in maintenance mode during setup
|
||||
print_info "Applying configuration in insecure mode (maintenance mode)..."
|
||||
wild-cluster-node-up "$NODE_NAME" --insecure
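One note on the two default-yes prompt idioms this commit converges on (generic bash, not project-specific code): the control plane prompts treat anything except an explicit n/N as consent, while this worker prompt accepts only an explicit y/Y or an empty reply, so a stray keystroke counts as "no":

    read -p "Proceed? (Y/n): " -r answer
    # Control plane style: any reply except n/N proceeds.
    if [[ ! $answer =~ ^[Nn]$ ]]; then echo "proceeding"; fi
    # Worker style: only y/Y or plain Enter proceeds.
    if [[ $answer =~ ^[Yy]$ ]] || [[ -z "$answer" ]]; then echo "proceeding"; fi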