From 838903e27d6ab3bdf3162f20c537df31fcca5c13 Mon Sep 17 00:00:00 2001 From: Paul Payne Date: Sun, 28 Sep 2025 15:25:00 -0700 Subject: [PATCH] Simplifies cluster service setup. --- bin/wild-cluster-services-configure | 124 ---------- bin/wild-cluster-services-fetch | 148 ----------- bin/wild-cluster-services-up | 180 -------------- bin/wild-service-setup | 201 +++++++++++++++ bin/wild-setup-cluster | 270 +++++++++++---------- bin/wild-setup-services | 64 +++-- scripts/install-wild-cloud-dependencies.sh | 9 + setup/README.md | 8 +- setup/cluster-services/README.md | 69 +++++- 9 files changed, 458 insertions(+), 615 deletions(-) delete mode 100755 bin/wild-cluster-services-configure delete mode 100755 bin/wild-cluster-services-fetch delete mode 100755 bin/wild-cluster-services-up create mode 100755 bin/wild-service-setup diff --git a/bin/wild-cluster-services-configure b/bin/wild-cluster-services-configure deleted file mode 100755 index c850eb7..0000000 --- a/bin/wild-cluster-services-configure +++ /dev/null @@ -1,124 +0,0 @@ -#\!/bin/bash - -set -e -set -o pipefail - -# Usage function -usage() { - echo "Usage: wild-cluster-services-configure [options] [service...]" - echo "" - echo "Compile service templates with configuration" - echo "" - echo "Arguments:" - echo " service Specific service(s) to compile (optional)" - echo "" - echo "Options:" - echo " -h, --help Show this help message" - echo "" - echo "Examples:" - echo " wild-cluster-services-configure # Compile all services" - echo " wild-cluster-services-configure metallb traefik # Compile specific services" - echo "" - echo "Available services:" - echo " metallb, longhorn, traefik, coredns, cert-manager," - echo " externaldns, kubernetes-dashboard, nfs, docker-registry" -} - -# Parse arguments -DRY_RUN=false -LIST_SERVICES=false -SPECIFIC_SERVICES=() - -while [[ $# -gt 0 ]]; do - case $1 in - -h|--help) - usage - exit 0 - ;; - --dry-run) - DRY_RUN=true - shift - ;; - -*) - echo "Unknown option $1" - usage - 
exit 1 - ;; - *) - SPECIFIC_SERVICES+=("$1") - shift - ;; - esac -done - -# Initialize Wild Cloud environment -if [ -z "${WC_ROOT}" ]; then - print "WC_ROOT is not set." - exit 1 -else - source "${WC_ROOT}/scripts/common.sh" - init_wild_env -fi - -CLUSTER_SETUP_DIR="${WC_HOME}/setup/cluster-services" - -# Check if cluster setup directory exists -if [ ! -d "$CLUSTER_SETUP_DIR" ]; then - print_error "Cluster services setup directory not found: $CLUSTER_SETUP_DIR" - print_info "Run 'wild-cluster-services-generate' first to generate setup files" - exit 1 -fi - -# ============================================================================= -# CLUSTER SERVICES TEMPLATE COMPILATION -# ============================================================================= - -print_header "Cluster services template compilation" - -# Get list of services to compile -if [ ${#SPECIFIC_SERVICES[@]} -gt 0 ]; then - SERVICES_TO_INSTALL=("${SPECIFIC_SERVICES[@]}") - print_info "Compiling specific services: ${SERVICES_TO_INSTALL[*]}" -else - # Compile all available services in a specific order for dependencies - SERVICES_TO_INSTALL=( - "metallb" - "longhorn" - "traefik" - "coredns" - "cert-manager" - "externaldns" - "kubernetes-dashboard" - "nfs" - "docker-registry" - ) - print_info "Installing all available services" -fi - -print_info "Services to compile: ${SERVICES_TO_INSTALL[*]}" - -# Compile services -cd "$CLUSTER_SETUP_DIR" -INSTALLED_COUNT=0 -FAILED_COUNT=0 - -for service in "${SERVICES_TO_INSTALL[@]}"; do - print_info "Compiling $service" - - service_dir="$CLUSTER_SETUP_DIR/$service" - source_service_dir="$service_dir/kustomize.template" - dest_service_dir="$service_dir/kustomize" - - # Run configuration to make sure we have the template values we need. 
- config_script="$service_dir/configure.sh" - if [ -f "$config_script" ]; then - source "$config_script" - fi - - wild-compile-template-dir --clean "$source_service_dir" "$dest_service_dir" - echo "" -done - -cd - >/dev/null - -print_success "Successfully compiled: $INSTALLED_COUNT services" diff --git a/bin/wild-cluster-services-fetch b/bin/wild-cluster-services-fetch deleted file mode 100755 index 95a1d44..0000000 --- a/bin/wild-cluster-services-fetch +++ /dev/null @@ -1,148 +0,0 @@ -#\!/bin/bash - -set -e -set -o pipefail - -# Usage function -usage() { - echo "Usage: wild-cluster-services-fetch [options]" - echo "" - echo "Fetch cluster services setup files from the repository." - echo "" - echo "Arguments:" - echo " service Specific service(s) to install (optional)" - echo "" - echo "Options:" - echo " -h, --help Show this help message" - echo " --force Force fetching even if files exist" - echo "" - echo "Examples:" - echo " wild-cluster-services-fetch # Fetch all services" - echo " wild-cluster-services-fetch metallb traefik # Fetch specific services" - echo "" - echo "Available services:" - echo " metallb, longhorn, traefik, coredns, cert-manager," - echo " externaldns, kubernetes-dashboard, nfs, docker-registry" -} - -# Parse arguments -FORCE=false -while [[ $# -gt 0 ]]; do - case $1 in - -h|--help) - usage - exit 0 - ;; - --force) - FORCE=true - shift - ;; - -*) - echo "Unknown option $1" - usage - exit 1 - ;; - *) - echo "Unexpected argument: $1" - usage - exit 1 - ;; - esac -done - -# Initialize Wild Cloud environment -if [ -z "${WC_ROOT}" ]; then - print "WC_ROOT is not set." - exit 1 -else - source "${WC_ROOT}/scripts/common.sh" - init_wild_env -fi - -print_header "Fetching cluster services templates" - -SOURCE_DIR="${WC_ROOT}/setup/cluster-services" -DEST_DIR="${WC_HOME}/setup/cluster-services" - -# Check if source directory exists -if [ ! 
-d "$SOURCE_DIR" ]; then - print_error "Cluster setup source directory not found: $SOURCE_DIR" - print_info "Make sure the wild-cloud repository is properly set up" - exit 1 -fi - -# Check if destination already exists -if [ -d "$DEST_DIR" ] && [ "$FORCE" = false ]; then - print_warning "Cluster setup directory already exists: $DEST_DIR" - read -p "Overwrite existing files? (y/N): " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - FORCE=true - fi -else - mkdir -p "$DEST_DIR" -fi - -# Copy README -if [ ! -f "${WC_HOME}/setup/README.md" ]; then - cp "${WC_ROOT}/setup/README.md" "${WC_HOME}/setup/README.md" -fi - -# Get list of services to install -if [ ${#SPECIFIC_SERVICES[@]} -gt 0 ]; then - SERVICES_TO_INSTALL=("${SPECIFIC_SERVICES[@]}") - print_info "Fetching specific services: ${SERVICES_TO_INSTALL[*]}" -else - # Install all available services in a specific order for dependencies - SERVICES_TO_INSTALL=( - "metallb" - "longhorn" - "traefik" - "coredns" - "cert-manager" - "externaldns" - "kubernetes-dashboard" - "nfs" - "docker-registry" - ) - print_info "Fetching all available services." -fi - -for service in "${SERVICES_TO_INSTALL[@]}"; do - - SERVICE_SOURCE_DIR="$SOURCE_DIR/$service" - SERVICE_DEST_DIR="$DEST_DIR/$service" - TEMPLATE_SOURCE_DIR="$SERVICE_SOURCE_DIR/kustomize.template" - TEMPLATE_DEST_DIR="$SERVICE_DEST_DIR/kustomize.template" - - if [ ! -d "$TEMPLATE_SOURCE_DIR" ]; then - print_error "Source directory not found: $TEMPLATE_SOURCE_DIR" - continue - fi - - if $FORCE && [ -d "$TEMPLATE_DEST_DIR" ]; then - print_info "Removing existing $service templates in: $TEMPLATE_DEST_DIR" - rm -rf "$TEMPLATE_DEST_DIR" - elif [ -d "$TEMPLATE_DEST_DIR" ]; then - print_info "Files already exist for $service, skipping (use --force to overwrite)." 
- continue - fi - - mkdir -p "$SERVICE_DEST_DIR" - mkdir -p "$TEMPLATE_DEST_DIR" - cp -f "$SERVICE_SOURCE_DIR/README.md" "$SERVICE_DEST_DIR/" - - if [ -f "$SERVICE_SOURCE_DIR/configure.sh" ]; then - cp -f "$SERVICE_SOURCE_DIR/configure.sh" "$SERVICE_DEST_DIR/" - fi - - if [ -f "$SERVICE_SOURCE_DIR/install.sh" ]; then - cp -f "$SERVICE_SOURCE_DIR/install.sh" "$SERVICE_DEST_DIR/" - fi - - if [ -d "$TEMPLATE_SOURCE_DIR" ]; then - cp -r "$TEMPLATE_SOURCE_DIR/"* "$TEMPLATE_DEST_DIR/" - fi - - print_success "Fetched $service templates." -done diff --git a/bin/wild-cluster-services-up b/bin/wild-cluster-services-up deleted file mode 100755 index 419f627..0000000 --- a/bin/wild-cluster-services-up +++ /dev/null @@ -1,180 +0,0 @@ -#\!/bin/bash - -set -e -set -o pipefail - -# Usage function -usage() { - echo "Usage: wild-cluster-services-up [options] [service...]" - echo "" - echo "Install cluster services from generated setup files." - echo "" - echo "Arguments:" - echo " service Specific service(s) to install (optional)" - echo "" - echo "Options:" - echo " -h, --help Show this help message" - echo " --dry-run Show what would be installed without running" - echo "" - echo "Examples:" - echo " wild-cluster-services-up # Install all services" - echo " wild-cluster-services-up metallb traefik # Install specific services" - echo "" - echo "Available services:" - echo " metallb, longhorn, traefik, coredns, cert-manager," - echo " externaldns, kubernetes-dashboard, nfs, docker-registry" -} - -# Parse arguments -DRY_RUN=false -LIST_SERVICES=false -SPECIFIC_SERVICES=() - -while [[ $# -gt 0 ]]; do - case $1 in - -h|--help) - usage - exit 0 - ;; - --dry-run) - DRY_RUN=true - shift - ;; - -*) - echo "Unknown option $1" - usage - exit 1 - ;; - *) - SPECIFIC_SERVICES+=("$1") - shift - ;; - esac -done - -# Initialize Wild Cloud environment -if [ -z "${WC_ROOT}" ]; then - print "WC_ROOT is not set." 
- exit 1 -else - source "${WC_ROOT}/scripts/common.sh" - init_wild_env -fi - -CLUSTER_SETUP_DIR="${WC_HOME}/setup/cluster-services" - -# Check if cluster setup directory exists -if [ ! -d "$CLUSTER_SETUP_DIR" ]; then - print_error "Cluster services setup directory not found: $CLUSTER_SETUP_DIR" - print_info "Run 'wild-cluster-services-generate' first to generate setup files" - exit 1 -fi - -# ============================================================================= -# CLUSTER SERVICES INSTALLATION -# ============================================================================= - -print_header "Cluster services installation" - -# Check kubectl connectivity -if [ "$DRY_RUN" = false ]; then - print_info "Checking Kubernetes cluster connectivity..." - if ! kubectl cluster-info >/dev/null 2>&1; then - print_error "kubectl is not configured or cluster is not accessible" - print_info "Make sure your cluster is running and kubeconfig is set up" - print_info "You can get kubeconfig with: talosctl kubeconfig" - exit 1 - fi - print_success "Cluster is accessible" -fi - -# Get list of services to install -if [ ${#SPECIFIC_SERVICES[@]} -gt 0 ]; then - SERVICES_TO_INSTALL=("${SPECIFIC_SERVICES[@]}") - print_info "Installing specific services: ${SERVICES_TO_INSTALL[*]}" -else - # Install all available services in a specific order for dependencies - SERVICES_TO_INSTALL=( - "metallb" - "longhorn" - "traefik" - "coredns" - "cert-manager" - "externaldns" - "kubernetes-dashboard" - "nfs" - "docker-registry" - ) - print_info "Installing all available services" -fi - -print_info "Services to install: ${SERVICES_TO_INSTALL[*]}" - -if [ "$DRY_RUN" = true ]; then - print_info "DRY RUN - would install the following services:" - for service in "${SERVICES_TO_INSTALL[@]}"; do - print_info " - $service: $CLUSTER_SETUP_DIR/$service/install.sh" - done - exit 0 -fi - -# Install services -cd "$CLUSTER_SETUP_DIR" -INSTALLED_COUNT=0 -FAILED_COUNT=0 - 
-SOURCE_DIR="${WC_ROOT}/setup/cluster-services" - -for service in "${SERVICES_TO_INSTALL[@]}"; do - echo "" - print_header "Installing $service" - - if [ -f "./$service/install.sh" ]; then - print_info "Running $service installation..." - if ./"$service"/install.sh; then - print_success "$service installed successfully" - INSTALLED_COUNT=$((INSTALLED_COUNT + 1)) - else - print_error "$service installation failed" - FAILED_COUNT=$((FAILED_COUNT + 1)) - fi - else - print_warning "$service install script not found" - FAILED_COUNT=$((FAILED_COUNT + 1)) - fi -done - -cd - >/dev/null - -# Summary -echo "" -print_header "Installation summary" -print_success "Successfully installed: $INSTALLED_COUNT services" -if [ $FAILED_COUNT -gt 0 ]; then - print_warning "Failed to install: $FAILED_COUNT services" -fi - -if [ $INSTALLED_COUNT -gt 0 ]; then - echo "" - print_info "Next steps:" - echo " 1. Verify installations with: kubectl get pods --all-namespaces" - echo " 2. Check service status with: kubectl get services --all-namespaces" - - # Service-specific next steps - if [[ " ${SERVICES_TO_INSTALL[*]} " =~ " kubernetes-dashboard " ]]; then - INTERNAL_DOMAIN=$(wild-config cloud.internalDomain 2>/dev/null || echo "your-internal-domain") - echo " 3. Access dashboard at: https://dashboard.${INTERNAL_DOMAIN}" - echo " 4. Get dashboard token with: ${WC_ROOT}/bin/dashboard-token" - fi - - if [[ " ${SERVICES_TO_INSTALL[*]} " =~ " cert-manager " ]]; then - echo " 3. Check cert-manager: kubectl get clusterissuers" - fi -fi - -if [ $FAILED_COUNT -eq 0 ]; then - print_success "All cluster services installed successfully!" -else - print_warning "Some services failed to install. Check the output above for details." 
- exit 1 -fi \ No newline at end of file diff --git a/bin/wild-service-setup b/bin/wild-service-setup new file mode 100755 index 0000000..3ef4e56 --- /dev/null +++ b/bin/wild-service-setup @@ -0,0 +1,201 @@ +#!/bin/bash + +set -e +set -o pipefail + +# Usage function +usage() { + echo "Usage: wild-service-setup [options]" + echo "" + echo "Set up a single cluster service with complete lifecycle management." + echo "" + echo "Arguments:" + echo " service Service name to set up" + echo "" + echo "Options:" + echo " --fetch Fetch fresh templates from repository before setup" + echo " --no-deploy Configure only, skip deployment to cluster" + echo " -h, --help Show this help message" + echo "" + echo "Examples:" + echo " wild-service-setup cert-manager # Configure and deploy (most common)" + echo " wild-service-setup cert-manager --fetch # Fetch fresh templates, configure, and deploy" + echo " wild-service-setup cert-manager --no-deploy # Configure only, skip deployment" + echo " wild-service-setup cert-manager --fetch --no-deploy # Fetch and configure, but don't deploy" + echo "" + echo "Available services:" + echo " metallb, longhorn, traefik, coredns, cert-manager," + echo " externaldns, kubernetes-dashboard, nfs, docker-registry" +} + +# Parse arguments +FETCH=false +NO_DEPLOY=false +SERVICE="" + +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage + exit 0 + ;; + --fetch) + FETCH=true + shift + ;; + --no-deploy) + NO_DEPLOY=true + shift + ;; + -*) + echo "Unknown option $1" + usage + exit 1 + ;; + *) + if [ -z "$SERVICE" ]; then + SERVICE="$1" + else + echo "Unexpected argument: $1" + usage + exit 1 + fi + shift + ;; + esac +done + +# Validate required service argument +if [ -z "$SERVICE" ]; then + echo "Error: Service name is required" + usage + exit 1 +fi + +# Initialize Wild Cloud environment +if [ -z "${WC_ROOT}" ]; then + echo "WC_ROOT is not set." 
+ exit 1 +else + source "${WC_ROOT}/scripts/common.sh" + init_wild_env +fi + +print_header "Setting up service: $SERVICE" + +# ============================================================================= +# FETCH FUNCTION +# ============================================================================= + +fetch_service_templates() { + local reason="$1" + print_info "$reason" + + local source_dir="${WC_ROOT}/setup/cluster-services" + local dest_dir="${WC_HOME}/setup/cluster-services" + local service_source_dir="$source_dir/$SERVICE" + local service_dest_dir="$dest_dir/$SERVICE" + local template_source_dir="$service_source_dir/kustomize.template" + local template_dest_dir="$service_dest_dir/kustomize.template" + + # Check if source service exists + if [ ! -d "$service_source_dir" ]; then + print_error "Service '$SERVICE' not found in repository: $service_source_dir" + print_info "Available services:" + ls -1 "$source_dir" | grep -v README | tr '\n' ' ' + echo + exit 1 + fi + + # Create destination directories + mkdir -p "$service_dest_dir" + mkdir -p "$template_dest_dir" + + # Copy service files + cp -f "$service_source_dir/README.md" "$service_dest_dir/" 2>/dev/null || true + + if [ -f "$service_source_dir/configure.sh" ]; then + cp -f "$service_source_dir/configure.sh" "$service_dest_dir/" + fi + + if [ -f "$service_source_dir/install.sh" ]; then + cp -f "$service_source_dir/install.sh" "$service_dest_dir/" + fi + + if [ -d "$template_source_dir" ]; then + cp -r "$template_source_dir/"* "$template_dest_dir/" + fi + + print_success "Fetched templates for $SERVICE" +} + +# ============================================================================= +# FETCH PHASE (Optional) +# ============================================================================= + +if [ "$FETCH" = true ]; then + fetch_service_templates "Fetching fresh templates for $SERVICE..." 
+fi + +# ============================================================================= +# CONFIGURE PHASE (Always runs) +# ============================================================================= + +print_info "Configuring $SERVICE..." + +CLUSTER_SETUP_DIR="${WC_HOME}/setup/cluster-services" +SERVICE_DIR="$CLUSTER_SETUP_DIR/$SERVICE" + +# Check if service directory exists, fetch if missing +if [ ! -d "$SERVICE_DIR" ]; then + fetch_service_templates "Service directory not found, fetching templates automatically..." +fi + +# Run service configuration script +CONFIG_SCRIPT="$SERVICE_DIR/configure.sh" +if [ -f "$CONFIG_SCRIPT" ]; then + print_info "Running configuration for $SERVICE..." + source "$CONFIG_SCRIPT" +else + print_info "No configuration script found for $SERVICE, skipping configuration prompts" +fi + +# Compile templates +SOURCE_TEMPLATE_DIR="$SERVICE_DIR/kustomize.template" +DEST_TEMPLATE_DIR="$SERVICE_DIR/kustomize" + +if [ -d "$SOURCE_TEMPLATE_DIR" ]; then + print_info "Compiling templates for $SERVICE..." + wild-compile-template-dir --clean "$SOURCE_TEMPLATE_DIR" "$DEST_TEMPLATE_DIR" + print_success "Templates compiled for $SERVICE" +else + print_warning "No templates found for $SERVICE at $SOURCE_TEMPLATE_DIR" +fi + +# ============================================================================= +# DEPLOY PHASE (Optional) +# ============================================================================= + +if [ "$NO_DEPLOY" = true ]; then + print_info "Skipping deployment for $SERVICE (--no-deploy specified)" + print_success "Configuration complete for $SERVICE" + print_info "To deploy later, run: wild-service-setup $SERVICE" +else + print_info "Deploying $SERVICE to cluster..." 
+ + # Run service installation script + INSTALL_SCRIPT="$SERVICE_DIR/install.sh" + if [ -f "$INSTALL_SCRIPT" ]; then + if "$INSTALL_SCRIPT"; then + print_success "$SERVICE deployed successfully" + else + print_error "$SERVICE deployment failed" + exit 1 + fi + else + print_error "No installation script found for $SERVICE at $INSTALL_SCRIPT" + exit 1 + fi +fi + +print_success "Service setup complete: $SERVICE" \ No newline at end of file diff --git a/bin/wild-setup-cluster b/bin/wild-setup-cluster index 1c4cb4a..f5f6854 100755 --- a/bin/wild-setup-cluster +++ b/bin/wild-setup-cluster @@ -61,12 +61,15 @@ else init_wild_env fi + print_header "Wild Cloud Cluster Setup" # ============================================================================= # Configuration # ============================================================================= +print_header "Configuration" + prompt_if_unset_config "operator.email" "Operator email address" prompt_if_unset_config "cluster.name" "Cluster name" "wild-cluster" @@ -134,59 +137,39 @@ fi if [ "${SKIP_HARDWARE}" = false ]; then - print_header "Control Plane Configuration" + print_header "Control node registration" # Automatically configure the first three IPs after VIP for control plane nodes vip_last_octet=$(echo "$vip" | cut -d. -f4) vip_prefix=$(echo "$vip" | cut -d. 
-f1-3) - # Detect and register control plane nodes - print_header "Control Plane Node Registration" - # Process each control plane node for i in 1 2 3; do NODE_NAME="${HOSTNAME_PREFIX}control-${i}" TARGET_IP="${vip_prefix}.$(( vip_last_octet + i ))" - print_info "Registering control plane node: $NODE_NAME (IP: $TARGET_IP)" + print_info "Checking for control plane node: $NODE_NAME (IP: $TARGET_IP)" - # Initialize the node in cluster.nodes.active if not already present - if [ -z "$(wild-config "cluster.nodes.active.\"${NODE_NAME}\".role")" ]; then - wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "controlplane" - wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$TARGET_IP" - wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".currentIp" "$TARGET_IP" - fi - - # Check if node is already configured - existing_interface=$(wild-config "cluster.nodes.active.\"${NODE_NAME}\".interface") - if [ -n "$existing_interface" ] && [ "$existing_interface" != "null" ]; then - print_success "Node $NODE_NAME already configured" - print_info " - Interface: $existing_interface" - print_info " - Disk: $(wild-config "cluster.nodes.active.\"${NODE_NAME}\".disk")" - - # Generate machine config patch for this node if necessary. - NODE_SETUP_DIR="${WC_HOME}/setup/cluster-nodes" - CONFIG_FILE="${NODE_SETUP_DIR}/patch/${NODE_NAME}.yaml" - if [ ! -f "$CONFIG_FILE" ]; then - print_info "Generating missing machine configuration patch for $NODE_NAME..." - if wild-cluster-node-patch-generate "$NODE_NAME"; then - print_success "Machine configuration patch generated for $NODE_NAME" - else - print_warning "Failed to generate machine configuration patch for $NODE_NAME" - fi - else - print_info " ✓ Machine configuration patch exists: $CONFIG_FILE" - fi + if wild-config --check "cluster.nodes.active.${NODE_NAME}.interface"; then + print_success "Node $NODE_NAME already registered." 
continue fi - read -p "Do you want to bring up control plane node $NODE_NAME ($TARGET_IP) now? (y/N): " -r register_node - if [[ ! $register_node =~ ^[Yy]$ ]]; then + if ! wild-config --check "cluster.nodes.active.${NODE_NAME}.role"; then + wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "controlplane" + fi + + if ! wild-config --check "cluster.nodes.active.${NODE_NAME}.targetIp"; then + wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$TARGET_IP" + fi + + print_info "${NODE_NAME} not found. Please ensure the node is powered on and running Talos in maintenance mode." + read -p "Is $NODE_NAME in maintenance mode now? (Y/n): " -r register_node + if [[ $register_node =~ ^[Nn]$ ]]; then print_info "Skipping bringing up node $NODE_NAME registration" continue fi - # Register node in config.yaml. - # First try to detect at target IP. + # Detect node hardware print_info "Attempting detection at target IP $TARGET_IP..." DETECTION_IP="$TARGET_IP" NODE_INFO="" @@ -195,115 +178,147 @@ if [ "${SKIP_HARDWARE}" = false ]; then NODE_INFO=$(wild-node-detect "$TARGET_IP") print_success "Node detected at target IP $TARGET_IP" else - # Fall back to maintenance IP + # Fall back to current IP print_warning "Node not accessible at target IP $TARGET_IP" - read -p "Enter maintenance IP for this node: " -r MAINTENANCE_IP - - if [ -z "$MAINTENANCE_IP" ]; then + read -p "Enter current IP for this node: " -r CURRENT_IP + + if [ -z "$CURRENT_IP" ]; then print_warning "Skipping node $NODE_NAME registration" continue fi - - print_info "Attempting detection at maintenance IP $MAINTENANCE_IP..." 
- if wild-node-detect "$MAINTENANCE_IP" >/dev/null 2>&1; then - NODE_INFO=$(wild-node-detect "$MAINTENANCE_IP") - DETECTION_IP="$MAINTENANCE_IP" - - # Store maintenance IP for reference - wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".maintenanceIp" "$MAINTENANCE_IP" - print_success "Node detected at maintenance IP $MAINTENANCE_IP" + + print_info "Attempting detection at current IP $CURRENT_IP..." + if wild-node-detect "$CURRENT_IP" >/dev/null 2>&1; then + NODE_INFO=$(wild-node-detect "$CURRENT_IP") + DETECTION_IP="$CURRENT_IP" + wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".currentIp" "$CURRENT_IP" + print_success "Node detected at current IP $CURRENT_IP" else - print_error "Failed to detect node at $MAINTENANCE_IP" + print_error "Failed to detect node at $CURRENT_IP" continue fi fi - - if [ -n "$NODE_INFO" ]; then - # Parse JSON response - INTERFACE=$(echo "$NODE_INFO" | jq -r '.interface') - SELECTED_DISK=$(echo "$NODE_INFO" | jq -r '.selected_disk') - AVAILABLE_DISKS=$(echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | paste -sd, -) - - print_success "Hardware detected:" - print_info " - Interface: $INTERFACE" - print_info " - Available disks: $AVAILABLE_DISKS" - print_info " - Selected disk: $SELECTED_DISK" - - # Allow user to override disk selection - echo "" - read -p "Use selected disk '$SELECTED_DISK'? (Y/n): " -r use_disk - if [[ $use_disk =~ ^[Nn]$ ]]; then - echo "Available disks:" - echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | nl -w2 -s') ' - read -p "Enter disk number: " -r disk_num - SELECTED_DISK=$(echo "$NODE_INFO" | jq -r ".disks[$((disk_num-1))].path") - if [ "$SELECTED_DISK" = "null" ] || [ -z "$SELECTED_DISK" ]; then - print_error "Invalid disk selection" - continue - fi - print_info "Selected disk: $SELECTED_DISK" - fi - - # Update config.yaml with hardware info. - print_info "Updating configuration for $NODE_NAME..." 
- wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".interface" "$INTERFACE" - wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".disk" "$SELECTED_DISK" - - # Copy current Talos version and schematic ID to this node - wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".version" "$talos_version" - wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".schematicId" "$schematic_id" - - echo "" - read -p "Bring node $NODE_NAME ($TARGET_IP) up now? (y/N): " -r apply_config - if [[ $apply_config =~ ^[Yy]$ ]]; then - if [ "$DETECTION_IP" != "$TARGET_IP" ]; then - # Node is in maintenance mode, use insecure flag - print_info "Applying configuration in insecure mode (maintenance mode)..." - wild-cluster-node-up "$NODE_NAME" --insecure - else - # Node is already configured, use secure mode - print_info "Applying configuration..." - wild-cluster-node-up "$NODE_NAME" - fi - # Bootstrap the cluster after the first node is up. - if [ "$i" -eq 1 ]; then - read -p "The cluster should be bootstrapped after the first control node is ready. Is it ready?: " -r is_ready - if [[ $is_ready =~ ^[Yy]$ ]]; then - print_info "Bootstrapping control plane node $TARGET_IP..." - talosctl config endpoint "$TARGET_IP" - - # Attempt to bootstrap the cluster - if talosctl bootstrap --nodes "$TARGET_IP" 2>&1 | tee /tmp/bootstrap_output.log; then - print_success "Control plane node $TARGET_IP bootstrapped successfully!" + if ! 
[ -n "$NODE_INFO" ]; then + print_error "No hardware information received from node" + continue + fi + + INTERFACE=$(echo "$NODE_INFO" | jq -r '.interface') + SELECTED_DISK=$(echo "$NODE_INFO" | jq -r '.selected_disk') + AVAILABLE_DISKS=$(echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | paste -sd, -) + + print_success "Hardware detected:" + print_info " - Interface: $INTERFACE" + print_info " - Available disks: $AVAILABLE_DISKS" + print_info " - Selected disk: $SELECTED_DISK" + + # User system disk selection + echo "" + read -p "Use selected disk '$SELECTED_DISK'? (Y/n): " -r use_disk + if [[ $use_disk =~ ^[Nn]$ ]]; then + echo "Available disks:" + echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | nl -w2 -s') ' + read -p "Enter disk number: " -r disk_num + SELECTED_DISK=$(echo "$NODE_INFO" | jq -r ".disks[$((disk_num-1))].path") + if [ "$SELECTED_DISK" = "null" ] || [ -z "$SELECTED_DISK" ]; then + print_error "Invalid disk selection" + continue + fi + print_info "Selected disk: $SELECTED_DISK" + fi + + # Update config.yaml with hardware info. + print_info "Updating configuration for $NODE_NAME..." + wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".interface" "$INTERFACE" + wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".disk" "$SELECTED_DISK" + + # Copy current Talos version and schematic ID to this node + wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".version" "$talos_version" + wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".schematicId" "$schematic_id" + + # The node is now configured. Bring it up. + echo "" + read -p "Bring node $NODE_NAME ($TARGET_IP) up now? (Y/n): " -r apply_config + if [[ ! $apply_config =~ ^[Nn]$ ]]; then + if [ "$DETECTION_IP" != "$TARGET_IP" ]; then + # Node is in maintenance mode, use insecure flag + print_info "Applying configuration in insecure mode (maintenance mode)..." 
+ wild-cluster-node-up "$NODE_NAME" --insecure + else + # Node is already up, no insecure flag needed + print_info "Applying configuration..." + wild-cluster-node-up "$NODE_NAME" --force + fi + + # Bootstrap the cluster after the first node is up. + if [ "$i" -eq 1 ]; then + read -p "The cluster should be bootstrapped after the first control node is ready. Is it ready? (Y/n): " -r is_ready + if [[ ! $is_ready =~ ^[Nn]$ ]]; then + print_info "Bootstrapping control plane node $TARGET_IP..." + talosctl config endpoint "$TARGET_IP" + + # Attempt to bootstrap the cluster + if talosctl bootstrap --nodes "$TARGET_IP" 2>&1 | tee /tmp/bootstrap_output.log; then + print_success "Control plane node $TARGET_IP bootstrapped successfully!" + else + # Check if the error is because it's already bootstrapped + if grep -q "etcd data directory is not empty\|AlreadyExists" /tmp/bootstrap_output.log; then + print_info "Cluster is already bootstrapped on $TARGET_IP" else - # Check if the error is because it's already bootstrapped - if grep -q "etcd data directory is not empty\|AlreadyExists" /tmp/bootstrap_output.log; then - print_info "Cluster is already bootstrapped on $TARGET_IP" - else - print_error "Failed to bootstrap control plane node $TARGET_IP" - print_info "Bootstrap output:" - cat /tmp/bootstrap_output.log - rm -f /tmp/bootstrap_output.log - continue + print_error "Failed to bootstrap control plane node $TARGET_IP" + print_info "Bootstrap output:" + cat /tmp/bootstrap_output.log + rm -f /tmp/bootstrap_output.log + continue + fi + fi + rm -f /tmp/bootstrap_output.log + + # Wait for VIP to become available before using it + print_info "Waiting for VIP $vip to become available..." 
+ max_attempts=30 + attempt=1 + vip_ready=false + + while [ $attempt -le $max_attempts ]; do + if ping -c 1 -W 2 "$vip" >/dev/null 2>&1; then + # VIP responds to ping, now test Talos API + if talosctl -e "$vip" -n "$vip" version >/dev/null 2>&1; then + print_success "VIP $vip is ready (attempt $attempt/$max_attempts)" + vip_ready=true + break fi fi - rm -f /tmp/bootstrap_output.log + print_info "VIP not ready, waiting... (attempt $attempt/$max_attempts)" + sleep 2 + attempt=$((attempt + 1)) + done + if [ "$vip_ready" = true ]; then talosctl config endpoint "$vip" print_info "Talos endpoint set to control plane VIP: $vip" - talosctl kubeconfig "$vip" - print_success "Talos kubeconfig updated for control plane VIP: $vip" + if talosctl kubeconfig "$vip"; then + print_success "Talos kubeconfig updated for control plane VIP: $vip" + else + print_error "Failed to get kubeconfig from VIP: $vip" + print_info "You can try again later with: talosctl kubeconfig $vip" + fi + else + print_error "VIP $vip did not become available after $max_attempts attempts" + print_warning "Falling back to direct node access" + print_info "Talos endpoint remains set to: $TARGET_IP" + print_info "You can try switching to VIP later with: talosctl config endpoint $vip" fi fi - - else - print_info "Configuration not applied. You can apply it later with:" - print_info " wild-cluster-node-up $NODE_NAME --insecure" fi - + + else + print_info "Configuration not applied. You can apply it later with:" + print_info " wild-cluster-node-up $NODE_NAME --insecure" fi + done # Register worker nodes @@ -377,6 +392,7 @@ if [ "${SKIP_HARDWARE}" = false ]; then # Store under unified cluster.nodes.active. 
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "worker" wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".currentIp" "$WORKER_IP" + wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$WORKER_IP" wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".interface" "$INTERFACE" wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".disk" "$SELECTED_DISK" @@ -397,8 +413,8 @@ if [ "${SKIP_HARDWARE}" = false ]; then # Ask if user wants to apply the configuration now echo "" - read -p "Apply configuration to worker node $NODE_NAME now? (y/N): " -r apply_config - if [[ $apply_config =~ ^[Yy]$ ]]; then + read -p "Apply configuration to worker node $NODE_NAME now? (Y/n): " -r apply_config + if [[ $apply_config =~ ^[Yy]$ ]] || [[ -z "$apply_config" ]]; then # Worker nodes are typically in maintenance mode during setup print_info "Applying configuration in insecure mode (maintenance mode)..." wild-cluster-node-up "$NODE_NAME" --insecure diff --git a/bin/wild-setup-services b/bin/wild-setup-services index dd6a1ef..7e1775e 100755 --- a/bin/wild-setup-services +++ b/bin/wild-setup-services @@ -28,7 +28,6 @@ while [[ $# -gt 0 ]]; do echo " - Each service will prompt for its required configuration" echo "" echo "Prerequisites:" - echo " - Run 'wild-setup-scaffold' to initialize the cloud" echo " - Run 'wild-setup-cluster' to set up cluster infrastructure" echo " - Kubernetes cluster must be running and kubectl configured" exit 0 @@ -67,34 +66,57 @@ fi print_header "Wild Cloud services setup" -if ! command -v kubectl >/dev/null 2>&1; then - print_error "kubectl is not installed or not in PATH" - print_info "Please install kubectl and configure it to connect to your cluster" - exit 1 -fi +# Define services in dependency order +SERVICES_TO_INSTALL=( + "metallb" + "longhorn" + "traefik" + "coredns" + "cert-manager" + "externaldns" + "kubernetes-dashboard" + "nfs" + "docker-registry" +) -if ! 
kubectl cluster-info >/dev/null 2>&1; then - print_error "kubectl is not configured to connect to your cluster" - print_info "Please configure kubectl to connect to your Kubernetes cluster" - exit 1 -fi - -# Generate cluster services setup files -wild-cluster-services-fetch -wild-cluster-services-generate - -# Apply cluster services to cluster +# Set up services one by one +INSTALLED_COUNT=0 +FAILED_COUNT=0 if [ "${SKIP_INSTALL}" = false ]; then - wild-cluster-services-up - SERVICES_INSTALLED=true + for service in "${SERVICES_TO_INSTALL[@]}"; do + echo "" + print_header "Setting up service: $service" + + if wild-service-setup "$service" --fetch; then + print_success "$service setup completed" + INSTALLED_COUNT=$((INSTALLED_COUNT + 1)) + else + print_error "$service setup failed" + FAILED_COUNT=$((FAILED_COUNT + 1)) + # Stop on first failure for easier debugging + break + fi + done + + if [ $FAILED_COUNT -eq 0 ]; then + SERVICES_INSTALLED=true + print_success "All $INSTALLED_COUNT services set up successfully!" + else + print_error "Service setup stopped after $service failure" + print_info "Fix the issue and resume with: wild-service-setup $service --fetch" + print_info "Then continue with remaining services or re-run wild-setup-services" + exit 1 + fi else print_info "Skipping cluster services installation (--skip-install specified)" - print_info "You can install them later with: wild-cluster-services-up" + print_info "You can install them later with:" + for service in "${SERVICES_TO_INSTALL[@]}"; do + print_info " wild-service-setup $service --fetch" + done fi # Summary output - print_header "Wild Cloud Services Setup Complete!" echo "" diff --git a/scripts/install-wild-cloud-dependencies.sh b/scripts/install-wild-cloud-dependencies.sh index 890d166..37b877e 100755 --- a/scripts/install-wild-cloud-dependencies.sh +++ b/scripts/install-wild-cloud-dependencies.sh @@ -48,3 +48,12 @@ else sudo apt-get install -y restic echo "restic installed successfully." 
fi
+
+## Install direnv
+if command -v direnv &> /dev/null; then
+  echo "direnv is already installed."
+else
+  sudo apt-get update
+  sudo apt-get install -y direnv
+  echo "direnv installed successfully. Add 'eval \"\$(direnv hook bash)\"' to your shell configuration file if not already present."
+fi
diff --git a/setup/README.md b/setup/README.md
index 998df52..490ad02 100644
--- a/setup/README.md
+++ b/setup/README.md
@@ -6,13 +6,7 @@ Follow the instructions to [set up a dnsmasq machine](./dnsmasq/README.md).
 
 Follow the instructions to [set up cluster nodes](./cluster-nodes/README.md).
 
-Set up cluster services:
-
-```bash
-wild-cluster-services-fetch
-wild-cluster-services-configure
-wild-cluster-services-up
-```
+Follow the instructions to set up [cluster services](./cluster-services/README.md).
 
 Now make sure everything works:
 
diff --git a/setup/cluster-services/README.md b/setup/cluster-services/README.md
index 2d0f6a7..e40b44e 100644
--- a/setup/cluster-services/README.md
+++ b/setup/cluster-services/README.md
@@ -1,4 +1,4 @@
-# Infrastructure setup scripts
+# Wild Cloud Cluster Services
 
 Creates a fully functional personal cloud infrastructure on a bare metal Kubernetes cluster that provides:
 
@@ -7,6 +7,20 @@ Creates a fully functional personal cloud infrastructure on a bare metal Kuberne
 3. **Secure traffic routing** with automatic TLS
 4. 
**Reliable networking** with proper load balancing
 
+## Service Management
+
+Wild Cloud uses a streamlined per-service setup approach:
+
+**Primary Command**: `wild-service-setup <service> [options]`
+- **Default**: Configure and deploy service using existing templates
+- **`--fetch`**: Fetch fresh templates before setup (for updates)
+- **`--no-deploy`**: Configure only, skip deployment (for planning)
+
+**Master Orchestrator**: `wild-setup-services`
+- Sets up all services in proper dependency order
+- Each service validates its prerequisites before deployment
+- Fail-fast approach with clear recovery instructions
+
 ## Architecture
 
 ```
@@ -30,14 +44,53 @@ Internet → External DNS → MetalLB LoadBalancer → Traefik → Kubernetes Se
 
 - **[Docker Registry](docker-registry/README.md)** - Private container registry for custom images
 - **[Utils](utils/README.md)** - Cluster utilities and debugging tools
 
+## Common Usage Patterns
+
+### Complete Infrastructure Setup
+```bash
+# All services with fresh templates (recommended for first-time setup)
+wild-setup-services
+```
+
+### Individual Service Management
+```bash
+# Most common - reconfigure and deploy existing service
+wild-service-setup cert-manager
+
+# Get fresh templates and deploy (for updates)
+wild-service-setup cert-manager --fetch
+
+# Configure only, don't deploy (for planning)
+wild-service-setup cert-manager --no-deploy
+
+# Fresh templates + configure, without deploying
+wild-service-setup cert-manager --fetch --no-deploy
+```
+
+### Service Dependencies
+Services are automatically deployed in dependency order:
+1. **metallb** → Load balancing foundation
+2. **traefik** → Ingress (requires metallb)
+3. **cert-manager** → TLS certificates (requires traefik)
+4. **externaldns** → DNS automation (requires cert-manager)
+5. **kubernetes-dashboard** → Admin UI (requires cert-manager)
+
+Each service validates its dependencies before deployment.
+ ## Idempotent Design -All setup scripts are designed to be idempotent: +All setup is designed to be idempotent and reliable: -- Scripts can be run multiple times without causing harm -- Each script checks for existing resources before creating new ones -- Configuration updates are applied cleanly without duplication -- Failed or interrupted setups can be safely retried -- Changes to configuration will be properly applied on subsequent runs +- **Atomic Operations**: Each service handles its complete lifecycle +- **Dependency Validation**: Services check prerequisites before deployment +- **Error Recovery**: Failed services can be individually fixed and re-run +- **Safe Retries**: Operations can be repeated without harm +- **Incremental Updates**: Configuration changes applied cleanly -This idempotent approach ensures consistent, reliable infrastructure setup and allows for incremental changes without requiring a complete teardown and rebuild. +Example recovery from cert-manager failure: +```bash +# Fix the issue, then resume +wild-service-setup cert-manager --fetch +# Continue with remaining services +wild-service-setup externaldns --fetch +```