#!/bin/bash

# wild-setup-cluster: set up Kubernetes cluster infrastructure (Phases 1-3).
# Prerequisite: run 'wild-setup-scaffold' first. Follow-up: 'wild-setup-services'.

# Abort on any unhandled command failure, including failures inside pipelines.
set -e
set -o pipefail

# Parse arguments
SKIP_HARDWARE=false

while [[ $# -gt 0 ]]; do
    # Quote the selector so an argument containing spaces or glob characters
    # cannot be word-split or expanded before matching.
    case "$1" in
        --skip-hardware)
            SKIP_HARDWARE=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [phase-options]"
            echo ""
            echo "Set up Kubernetes cluster infrastructure (Phases 1-3)."
            echo ""
            echo "Control Options:"
            echo " --skip-installer Skip Installer image generation"
            echo " --skip-hardware Skip Node hardware detection"
            echo " -h, --help Show this help message"
            echo ""
            echo "Prerequisites:"
            echo " - Run 'wild-setup-scaffold' first to initialize the cloud"
            echo ""
            echo "After completion:"
            echo " - Run 'wild-setup-services' to install cluster services"
            exit 0
            ;;
        -*)
            # Diagnostics go to stderr so stdout stays clean when captured.
            echo "Unknown option $1" >&2
            echo "Usage: $0 [phase-options]" >&2
            echo "Use --help for full usage information" >&2
            exit 1
            ;;
        *)
            echo "Unexpected argument: $1" >&2
            echo "Usage: $0 [phase-options]" >&2
            echo "Use --help for full usage information" >&2
            exit 1
            ;;
    esac
done
|
# Initialize Wild Cloud environment
# WC_ROOT must point at the Wild Cloud installation; common.sh provides the
# print_* / prompt_* helpers and init_wild_env used throughout this script.
if [ -z "${WC_ROOT:-}" ]; then
    echo "ERROR: WC_ROOT is not set." >&2
    exit 1
elif [ ! -f "${WC_ROOT}/scripts/common.sh" ]; then
    # Fail with a clear message instead of the cryptic error 'source' would
    # emit (and which set -e would turn into a silent exit).
    echo "ERROR: ${WC_ROOT}/scripts/common.sh not found." >&2
    exit 1
else
    source "${WC_ROOT}/scripts/common.sh"
    init_wild_env
fi

print_header "Wild Cloud Cluster Setup"
|
# =============================================================================
# Configuration
# =============================================================================

print_header "Configuration"

# prompt_if_unset_config <key> <prompt> [default] and wild-config <key> come
# from the Wild Cloud tooling sourced above; the former presumably prompts only
# when the key has no stored value, making re-runs non-interactive — confirm
# against scripts/common.sh.

prompt_if_unset_config "operator.email" "Operator email address"

prompt_if_unset_config "cluster.name" "Cluster name" "wild-cluster"
CLUSTER_NAME=$(wild-config "cluster.name")

# Configure hostname prefix for unique node names on LAN
# The prefix is prepended to generated node names below, e.g. "test-control-1".
prompt_if_unset_config "cluster.hostnamePrefix" "Hostname prefix (optional, e.g. 'test-' for unique names on LAN)" ""
HOSTNAME_PREFIX=$(wild-config "cluster.hostnamePrefix")
|
# Configure network settings
# Auto-detect this machine's LAN address and default gateway to seed sensible
# prompt defaults; fall back to common home-network values when detection
# fails (no route, missing 'ip' tool) or yields empty output. The stderr
# redirect is on 'ip' itself so its errors don't leak to the terminal.
CURRENT_IP=$(ip route get 8.8.8.8 2>/dev/null | awk '{for (i = 1; i < NF; i++) if ($i == "src") {print $(i + 1); exit}}' || true)
CURRENT_IP=${CURRENT_IP:-192.168.1.100}
GATEWAY_IP=$(ip route 2>/dev/null | awk '/^default/ {print $3; exit}' || true)
GATEWAY_IP=${GATEWAY_IP:-192.168.1.1}
# First three octets of the detected address, e.g. "192.168.1".
SUBNET_PREFIX=${CURRENT_IP%.*}

prompt_if_unset_config "cloud.router.ip" "Router/Gateway IP" "${GATEWAY_IP}"
prompt_if_unset_config "cloud.dns.ip" "DNS server IP (dnsmasq machine)" "${SUBNET_PREFIX}.50"
prompt_if_unset_config "cloud.dhcpRange" "DHCP range for dnsmasq" "${SUBNET_PREFIX}.100,${SUBNET_PREFIX}.200"
prompt_if_unset_config "cloud.dnsmasq.interface" "Network interface for dnsmasq" "eth0"
prompt_if_unset_config "cloud.dns.externalResolver" "External DNS resolver" "1.1.1.1"
|
# MetalLB IP address pool
prompt_if_unset_config "cluster.ipAddressPool" "MetalLB IP address pool" "${SUBNET_PREFIX}.80-${SUBNET_PREFIX}.89"
ip_pool=$(wild-config "cluster.ipAddressPool")

# Load balancer IP (automatically set to first address in the pool if not set)
# The pool is a "<start>-<end>" range; everything before the first '-' is the
# pool's first address and becomes the suggested default.
default_lb_ip=$(echo "${ip_pool}" | cut -d'-' -f1)
prompt_if_unset_config "cluster.loadBalancerIp" "Load balancer IP" "${default_lb_ip}"
|
# Talos version
# Stored per node below (cluster.nodes.active.*.version) when pre-configuring.
prompt_if_unset_config "cluster.nodes.talos.version" "Talos version" "v1.11.0"
talos_version=$(wild-config "cluster.nodes.talos.version")

# Talos schematic ID
# NOTE(review): the default is a specific schematic hash (presumably a Talos
# Image Factory schematic selecting installer extensions) — confirm it matches
# the intended extension set for this Talos version.
prompt_if_unset_config "cluster.nodes.talos.schematicId" "Talos schematic ID" "56774e0894c8a3a3a9834a2aea65f24163cacf9506abbcbdc3ba135eaca4953f"
schematic_id=$(wild-config "cluster.nodes.talos.schematicId")

# External DNS
# Owner ID distinguishes records managed for this cluster; defaults to a
# cluster-name-derived value so two clusters don't fight over records.
prompt_if_unset_config "cluster.externalDns.ownerId" "External DNS owner ID" "external-dns-${CLUSTER_NAME}"
|
# =============================================================================
# TALOS CLUSTER CONFIGURATION
# =============================================================================

# Virtual IP shared by the control plane; per-node target IPs are derived from
# it below (VIP+1, VIP+2, VIP+3).
prompt_if_unset_config "cluster.nodes.control.vip" "Control plane virtual IP" "${SUBNET_PREFIX}.90"
vip=$(wild-config "cluster.nodes.control.vip")

# Generate initial cluster configuration
# wild-cluster-config-generate presumably produces the Talos machine configs
# and the talosconfig merged below — confirm against its implementation.
if ! wild-cluster-config-generate; then
    print_error "Failed to generate cluster configuration"
    exit 1
fi
|
# Configure Talos cli with our new cluster context
# Count existing talosctl contexts mentioning this cluster. -F treats the name
# as a literal string (not a regex) and -- protects a leading dash; '|| true'
# keeps grep's non-zero "no match" status from tripping set -e (grep -c still
# prints 0 in that case). NOTE(review): this is a substring match — a context
# named "${CLUSTER_NAME}-old" would also count; confirm acceptable.
HAS_CONTEXT=$(talosctl config contexts | grep -Fc -- "$CLUSTER_NAME" || true)
if [ "$HAS_CONTEXT" -eq 0 ]; then
    print_info "No Talos context found for cluster $CLUSTER_NAME, creating..."
    # Quote the path: WC_HOME may contain spaces.
    talosctl config merge "${WC_HOME}/setup/cluster-nodes/generated/talosconfig"
    talosctl config context "$CLUSTER_NAME"
    print_success "Talos context for $CLUSTER_NAME created and set as current"
fi
|
# =============================================================================
# Node setup
# =============================================================================

if [ "${SKIP_HARDWARE}" = false ]; then
    print_header "Control Plane Node Setup"

    # Automatically configure the first three IPs after VIP for control plane nodes
    # NOTE(review): no guard against the last octet exceeding 255 when the VIP
    # sits near the top of the subnet (e.g. .253) — confirm acceptable.
    vip_last_octet=$(echo "$vip" | cut -d. -f4)
    vip_prefix=$(echo "$vip" | cut -d. -f1-3)

    # Set up control plane nodes (fixed at three: control-1..control-3)
    for i in 1 2 3; do
        NODE_NAME="${HOSTNAME_PREFIX}control-${i}"
        TARGET_IP="${vip_prefix}.$(( vip_last_octet + i ))"

        print_info "Setting up control plane node: $NODE_NAME (IP: $TARGET_IP)"

        # Pre-configure node role and target IP
        # The node name is wrapped in escaped quotes inside the key path,
        # presumably so dots/dashes in the name don't split the config key.
        wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "controlplane"
        wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$TARGET_IP"
        wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".version" "$talos_version"
        wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".schematicId" "$schematic_id"
|
        # Check if node is already configured
        # A stored .interface key is used as the marker that hardware detection
        # previously completed for this node. NOTE(review): keys are written
        # above with escaped quotes around the node name but checked here
        # without them — confirm wild-config resolves both forms to the same key.
        if wild-config --check "cluster.nodes.active.${NODE_NAME}.interface"; then
            print_success "Node $NODE_NAME already configured"
            echo ""
            read -p "Re-deploy node $NODE_NAME? (y/N): " -r redeploy_node
            if [[ $redeploy_node =~ ^[Yy]$ ]]; then
                if ! wild-node-setup "$NODE_NAME"; then
                    print_error "Failed to set up node $NODE_NAME"
                    # Keep going: later nodes (or a manual retry) may still succeed.
                    continue
                fi
            else
                continue
            fi
        else
            # Node needs initial setup
            print_info "Node $NODE_NAME requires hardware detection and setup"
            echo ""
            # Default is yes: anything except an explicit N/n proceeds.
            read -p "Set up node $NODE_NAME now? (Y/n): " -r setup_node
            if [[ $setup_node =~ ^[Nn]$ ]]; then
                print_info "Skipping node $NODE_NAME setup"
                continue
            fi

            # Run complete node setup
            if ! wild-node-setup "$NODE_NAME"; then
                print_error "Failed to set up node $NODE_NAME"
                print_info "You can retry later with: wild-node-setup $NODE_NAME"
                continue
            fi
        fi
|
        # Bootstrap the cluster after the first node is up
        if [ "$i" -eq 1 ]; then
            echo ""
            read -p "Bootstrap the cluster on $NODE_NAME? (Y/n): " -r bootstrap_cluster
            if [[ ! $bootstrap_cluster =~ ^[Nn]$ ]]; then
                print_header "Bootstrapping Cluster: $NODE_NAME"
                # Point talosctl directly at the first node; the endpoint is
                # switched to the VIP later (Step 5) once it answers.
                talosctl config endpoint "$TARGET_IP"

                # tee captures the output so the failure branch can tell
                # "already bootstrapped" apart from a real error.
                # NOTE(review): fixed /tmp filename — collides if two setups
                # run concurrently; consider mktemp.
                if talosctl bootstrap --nodes "$TARGET_IP" 2>&1 | tee /tmp/bootstrap_output.log; then
                    print_success "Cluster bootstrap initiated successfully."
                else
                    # Bootstrapping a node that was already bootstrapped fails
                    # with one of these messages; treat that case as success.
                    if grep -q "etcd data directory is not empty\|AlreadyExists" /tmp/bootstrap_output.log; then
                        print_info "Cluster is already bootstrapped."
                    else
                        print_error "Failed to bootstrap cluster."
                        print_info "Bootstrap output:"
                        cat /tmp/bootstrap_output.log
                        rm -f /tmp/bootstrap_output.log
                        # Skip the health-wait steps; move on to the next node.
                        continue
                    fi
                fi
                # Keep the successful log around for post-mortem inspection.
                mv -f /tmp/bootstrap_output.log /tmp/bootstrap_output_success.log
|
# Step 1: Verify etcd cluster health
|
|
print_info -n "Step 1/6: Verifying etcd cluster health."
|
|
max_attempts=30
|
|
for attempt in $(seq 1 $max_attempts); do
|
|
if talosctl -n "$TARGET_IP" etcd status >/dev/null 2>&1; then
|
|
echo ""
|
|
print_success "etcd cluster is healthy."
|
|
break
|
|
fi
|
|
if [ $attempt -eq $max_attempts ]; then
|
|
echo ""
|
|
print_error "etcd cluster not healthy after $max_attempts attempts."
|
|
print_info "Troubleshooting steps:"
|
|
print_info " 1. Check etcd service: talosctl -n $TARGET_IP service etcd"
|
|
print_info " 2. Check etcd logs: talosctl -n $TARGET_IP logs etcd"
|
|
print_info " 3. Check etcd status details: talosctl -n $TARGET_IP etcd status"
|
|
print_info " 4. Verify bootstrap completed: talosctl -n $TARGET_IP get members"
|
|
exit 1
|
|
fi
|
|
printf "."
|
|
sleep 10
|
|
done
|
|
|
|
# Step 2: Wait for VIP to be assigned to interface
|
|
print_info -n "Step 2/6: Waiting for VIP $vip to be assigned to interface."
|
|
max_attempts=90
|
|
for attempt in $(seq 1 $max_attempts); do
|
|
if talosctl -n "$TARGET_IP" get addresses | grep -q "$vip/32"; then
|
|
echo ""
|
|
print_success "VIP $vip assigned to interface."
|
|
break
|
|
fi
|
|
if [ $attempt -eq $max_attempts ]; then
|
|
echo ""
|
|
print_error "VIP $vip was not assigned to interface after $max_attempts attempts"
|
|
print_info "Troubleshooting steps:"
|
|
print_info " 1. Check VIP controller logs: talosctl -n $TARGET_IP logs controller-runtime | grep vip"
|
|
print_info " 2. Check network configuration: talosctl -n $TARGET_IP get addresses"
|
|
print_info " 3. Verify VIP is within node's network range"
|
|
exit 1
|
|
fi
|
|
printf "."
|
|
sleep 10
|
|
done
|
|
|
|
# Step 3: Wait for control plane components to start
|
|
print_info -n "Step 3/6: Waiting for control plane components to start."
|
|
max_attempts=60
|
|
for attempt in $(seq 1 $max_attempts); do
|
|
# Check if all three control plane components are running
|
|
apiserver_running=$(talosctl -n "$TARGET_IP" containers -k | grep -c "kube-apiserver.*CONTAINER_RUNNING" || true)
|
|
controller_running=$(talosctl -n "$TARGET_IP" containers -k | grep -c "kube-controller-manager.*CONTAINER_RUNNING" || true)
|
|
scheduler_running=$(talosctl -n "$TARGET_IP" containers -k | grep -c "kube-scheduler.*CONTAINER_RUNNING" || true)
|
|
|
|
if [ "$apiserver_running" -gt 0 ] && [ "$controller_running" -gt 0 ] && [ "$scheduler_running" -gt 0 ]; then
|
|
echo ""
|
|
print_success "All control plane components are running (attempt $attempt)."
|
|
break
|
|
fi
|
|
if [ $attempt -eq $max_attempts ]; then
|
|
echo ""
|
|
print_error "Control plane components not all running after $max_attempts attempts."
|
|
print_info "Troubleshooting steps:"
|
|
print_info " 1. Check kubelet logs: talosctl -n $TARGET_IP logs kubelet"
|
|
print_info " 2. Check static pod status: talosctl -n $TARGET_IP containers -k | grep kube-"
|
|
print_info " 3. Restart kubelet if needed: talosctl -n $TARGET_IP service kubelet restart"
|
|
print_info "Current status:"
|
|
print_info " API Server running: $apiserver_running"
|
|
print_info " Controller Manager running: $controller_running"
|
|
print_info " Scheduler running: $scheduler_running"
|
|
exit 1
|
|
fi
|
|
# Restart kubelet every 40 attempts to refresh static pod creation
|
|
if [ $((attempt % 40)) -eq 0 ]; then
|
|
echo ""
|
|
print_info "Restarting kubelet to refresh static pod creation (attempt $attempt)..."
|
|
talosctl -n "$TARGET_IP" service kubelet restart > /dev/null 2>&1
|
|
print_info -n "Waiting for control plane components after kubelet restart."
|
|
sleep 30 # Give kubelet time to restart and create pods
|
|
fi
|
|
printf "."
|
|
sleep 10
|
|
done
|
|
|
|
# Step 4: Wait for API server to respond on VIP
|
|
print_info -n "Step 4/6: Waiting for API server to respond on VIP $vip."
|
|
max_attempts=60
|
|
for attempt in $(seq 1 $max_attempts); do
|
|
if curl -k -s --max-time 5 "https://$vip:6443/healthz" >/dev/null 2>&1; then
|
|
echo ""
|
|
print_success "API server responding on VIP."
|
|
break
|
|
fi
|
|
if [ $attempt -eq $max_attempts ]; then
|
|
echo ""
|
|
print_error "API server not responding on VIP $vip after $max_attempts attempts."
|
|
print_info "Troubleshooting steps:"
|
|
print_info " 1. Check API server logs: talosctl -n $TARGET_IP logs kubelet | grep apiserver"
|
|
print_info " 2. Check if API server is running: talosctl -n $TARGET_IP containers -k | grep apiserver"
|
|
print_info " 3. Test API server on node IP: curl -k https://$TARGET_IP:6443/healthz"
|
|
exit 1
|
|
fi
|
|
# Attempt kubelet restart every 15 attempts to refresh certificates
|
|
if [ $((attempt % 15)) -eq 0 ]; then
|
|
echo ""
|
|
print_info "Restarting kubelet to refresh API container setup (attempt $attempt)..."
|
|
talosctl -n "$TARGET_IP" service kubelet restart > /dev/null 2>&1
|
|
print_info -n "Waiting for API server to respond after kubelet restart."
|
|
sleep 30 # Give kubelet time to restart
|
|
fi
|
|
printf "."
|
|
sleep 10
|
|
done
|
|
|
|
# Step 5: Configure talosctl endpoint and get kubeconfig
|
|
print_info "Step 5/6: Configuring cluster access..."
|
|
talosctl config endpoint "$vip"
|
|
|
|
if ! talosctl kubeconfig --nodes "$vip"; then
|
|
print_error "Failed to get kubeconfig via VIP."
|
|
print_info "Troubleshooting steps:"
|
|
print_info " 1. Check API server logs: talosctl -n $TARGET_IP logs kube-apiserver"
|
|
print_info " 2. Test API server on node IP: curl -k https://$TARGET_IP:6443/healthz"
|
|
print_info " 3. Verify network connectivity to VIP"
|
|
exit 1
|
|
else
|
|
print_success "Kubeconfig retrieved via VIP."
|
|
fi
|
|
|
|
|
|
                # Step 6: Verify node registration
                # Best-effort poll (10 tries, 10s apart); unlike the earlier
                # steps this loop has no failing final attempt — the
                # authoritative check follows after the loop.
                print_info -n "Step 6/6: Verifying node registration."
                for reg_attempt in $(seq 1 10); do
                    if kubectl get nodes 2>/dev/null | grep -q "Ready\|NotReady"; then
                        echo ""
                        print_success "Node registered with API server."
                        break
                    fi
                    echo -n "."
                    sleep 10
                done

                # Final check: any node listed (Ready or NotReady) counts as
                # registered; otherwise abort with troubleshooting hints.
                if ! kubectl get nodes 2>/dev/null | grep -q "Ready\|NotReady"; then
                    echo ""
                    print_error "Node did not register with API server after multiple attempts."
                    print_info "Troubleshooting steps:"
                    print_info " 1. Check kubelet logs: talosctl -n $TARGET_IP logs kubelet"
                    print_info " 2. Check API server logs: talosctl -n $TARGET_IP logs kube-apiserver"
                    print_info " 3. Verify network connectivity between node and VIP"
                    exit 1
                fi

                print_success "Cluster bootstrap completed!"
            fi # end bootstrap confirmation
        fi # end first-node (i == 1) bootstrap
    done # end control plane node loop
|
    # Worker node setup
    echo ""
    print_header "Worker Node Setup (Optional)"
    WORKER_COUNT=1
    # Loop until the operator declines to add another worker.
    while true; do
        echo ""
        read -p "Set up a worker node? (y/N): " -r setup_worker

        if [[ $setup_worker =~ ^[Yy]$ ]]; then
            # Find next available worker number
            # Skip past worker-N entries that already have a .role key stored.
            while wild-config --check "cluster.nodes.active.${HOSTNAME_PREFIX}worker-${WORKER_COUNT}.role" 2>/dev/null; do
                WORKER_COUNT=$((WORKER_COUNT + 1))
            done

            NODE_NAME="${HOSTNAME_PREFIX}worker-${WORKER_COUNT}"
            read -p "Enter IP address for worker node $NODE_NAME: " -r WORKER_IP

            # No validation beyond non-empty; a blank answer skips this worker.
            if [ -z "$WORKER_IP" ]; then
                print_warning "No IP provided, skipping worker node"
                continue
            fi

            # Pre-configure worker node
            # Same key layout as the control plane nodes above.
            wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "worker"
            wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$WORKER_IP"
            wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".version" "$talos_version"
            wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".schematicId" "$schematic_id"

            # Run complete node setup
            if wild-node-setup "$NODE_NAME"; then
                print_success "Worker node $NODE_NAME setup completed"
                WORKER_COUNT=$((WORKER_COUNT + 1))
            else
                print_error "Failed to set up worker node $NODE_NAME"
                print_info "You can retry later with: wild-node-setup $NODE_NAME"
                # NOTE(review): WORKER_COUNT is not advanced here, but the
                # .role key was already written above, so the scan at the top
                # of the next iteration may skip this name — confirm intended.
            fi
        else
            break
        fi
    done

    print_success "Node setup phase completed"
else
    print_info "Skipping node setup (--skip-hardware specified)"
fi
|
# =============================================================================
# COMPLETION
# =============================================================================

print_header "Wild Cloud Cluster Setup Complete!"

print_success "Cluster infrastructure setup completed!"
echo ""
print_info "Next steps:"
# Quoted heredoc: literal text, no expansions; trailing blank line preserved.
cat <<'EOF'
 1. Run 'wild-setup-services' to install cluster services
 2. Verify nodes are ready: kubectl get nodes
 3. Check cluster health: wild-health

EOF
print_info "Individual node management:"
cat <<'EOF'
 - Setup additional nodes: wild-node-setup <node-name>
 - Re-detect hardware: wild-node-setup <node-name> --detect
 - Configuration only: wild-node-setup <node-name> --no-deploy

EOF

print_success "Wild Cloud cluster setup completed!"