Files
wild-cloud/setup/cluster-nodes/detect-node-hardware.sh
Paul Payne f1fe4f9cc2 Settle on v1 setup method. Test run completed successfully from bootstrap to service setup.
- Refactor dnsmasq configuration and scripts for improved variable handling and clarity
- Updated dnsmasq configuration files to use direct variable references instead of data source functions for better readability.
- Modified setup scripts to ensure they are run from the correct environment and directory, checking for the WC_HOME variable.
- Changed paths in README and scripts to reflect the new directory structure.
- Enhanced error handling in setup scripts to provide clearer guidance on required configurations.
- Adjusted kernel and initramfs URLs in boot.ipxe to use the updated variable references.
2025-06-24 15:12:53 -07:00

163 lines
5.3 KiB
Bash
Executable File

#!/bin/bash
#
# Node registration for Talos cluster setup.
#
# Queries a node that is booted in maintenance mode for its hardware
# configuration and records the per-node hardware settings in config.yaml.

set -euo pipefail

# Refuse to run without WC_HOME; the message tells the user how to get it set.
# (Single quotes keep the backticks literal in the printed text.)
[[ -n "${WC_HOME:-}" ]] || {
  printf '%s\n' 'Error: WC_HOME environment variable not set. Run `source ./env.sh`.'
  exit 1
}
#######################################
# Print the help text for this script.
# The script name shown in the usage line and in the examples is derived
# from $0, so the text stays correct whatever the file is called on disk.
# Globals:   none
# Arguments: none
# Outputs:   help text on stdout
# Returns:   0
#######################################
usage() {
  local script_name="${0##*/}"

  cat <<EOF
Usage: ${script_name} <node-ip> <node-number>

Register a Talos node by discovering its hardware configuration.
The node must be booted in maintenance mode and accessible via IP.

Arguments:
 node-ip Current IP of the node in maintenance mode
 node-number Node number (1, 2, or 3) for control plane nodes

Examples:
 ./${script_name} 192.168.8.168 1
 ./${script_name} 192.168.8.169 2

This script will:
 - Query the node for available network interfaces
 - Query the node for available disks
 - Update config.yaml with the per-node hardware settings
 - Update patch templates to use per-node hardware
EOF
}
# Exactly two positional arguments are expected: <node-ip> <node-number>.
# Anything else prints the help text and stops.
if (( $# != 2 )); then
  usage
  exit 1
fi

NODE_IP="$1"
NODE_NUMBER="$2"

# Only the single digits 1, 2 and 3 are accepted as a node number.
case "$NODE_NUMBER" in
  1 | 2 | 3)
    ;;
  *)
    echo "Error: Node number must be 1, 2, or 3"
    exit 1
    ;;
esac
printf '%s\n' "Registering Talos control plane node $NODE_NUMBER at $NODE_IP..."

# Reachability probe: list the node's links over the insecure API and look
# only at the exit status; all output of the probe is discarded.
printf '%s\n' "Testing connectivity to node..."
talosctl -n "$NODE_IP" get links --insecure &>/dev/null || {
  printf '%s\n' \
    "Error: Cannot connect to node at $NODE_IP" \
    "Make sure the node is booted in maintenance mode and accessible."
  exit 1
}
printf '%s\n' "✅ Node is accessible"
# Discover network interfaces
#
# Chooses the interface recorded for this node (ACTIVE_INTERFACE):
#   1. the outgoing link of a default route that has a gateway, if one exists;
#   2. otherwise the first ethernet link whose operational state is "up".
# If neither is found, the node's links and routes are printed and the script
# exits with status 1.
#
# Each query ends in `|| VAR=""` because the script runs under
# `set -euo pipefail`: without it, a failing talosctl/jq stage would abort the
# script silently (stderr is discarded) before the fallback or the error
# report could run. The first match is taken inside jq rather than with
# `head -1`, so no stage can be killed by SIGPIPE, and a missing field yields
# an empty result instead of the literal string "null".
echo "Discovering network interfaces..."

# First, try to find the interface that's actually carrying traffic (has the default route)
# NOTE(review): this filter assumes route resources expose the default route
# as .spec.destination == "0.0.0.0/0" — confirm against real talosctl output.
CONNECTED_INTERFACE=$(
  talosctl -n "$NODE_IP" get routes --insecure -o json 2>/dev/null \
    | jq -s -r '
        [ .[]
          | select(.spec.destination == "0.0.0.0/0" and .spec.gateway != null)
          | .spec.outLinkName // empty
        ] | first // empty'
) || CONNECTED_INTERFACE=""

if [ -n "$CONNECTED_INTERFACE" ]; then
  ACTIVE_INTERFACE="$CONNECTED_INTERFACE"
  echo "✅ Discovered connected interface (with default route): $ACTIVE_INTERFACE"
else
  # Fallback: find any active ethernet interface
  echo "No default route found, checking for active ethernet interfaces..."
  ACTIVE_INTERFACE=$(
    talosctl -n "$NODE_IP" get links --insecure -o json 2>/dev/null \
      | jq -s -r '
          [ .[]
            | select(.spec.operationalState == "up" and .spec.type == "ether" and .metadata.id != "lo")
            | .metadata.id // empty
          ] | first // empty'
  ) || ACTIVE_INTERFACE=""

  if [ -z "$ACTIVE_INTERFACE" ]; then
    echo "Error: No active ethernet interface found"
    echo "Available interfaces:"
    # Best-effort diagnostics: a failing dump must not hide the second one.
    talosctl -n "$NODE_IP" get links --insecure || true
    echo ""
    echo "Available routes:"
    talosctl -n "$NODE_IP" get routes --insecure || true
    exit 1
  fi
  echo "✅ Discovered active interface: $ACTIVE_INTERFACE"
fi
# Discover available disks
#
# Lists up to five disks larger than 10 GB (10000000000 bytes) reported by the
# node, lets the user pick one interactively, and stores the choice in
# INSTALL_DISK with a /dev/ prefix.
#
# The query ends in `|| AVAILABLE_DISKS=""` so that, under `set -euo pipefail`,
# a failing talosctl/jq stage reaches the error report below instead of
# aborting silently. The five-item limit is applied inside jq rather than with
# `head -5`, so no stage can be killed by SIGPIPE.
echo "Discovering available disks..."
AVAILABLE_DISKS=$(
  talosctl -n "$NODE_IP" get disks --insecure -o json 2>/dev/null \
    | jq -s -r '
        [ .[] | select(.spec.size > 10000000000) | .metadata.id // empty ]
        | .[:5] | .[]'
) || AVAILABLE_DISKS=""

if [ -z "$AVAILABLE_DISKS" ]; then
  echo "Error: No suitable disks found (must be >10GB)"
  echo "Available disks:"
  # Best-effort diagnostics; the script exits with status 1 either way.
  talosctl -n "$NODE_IP" get disks --insecure || true
  exit 1
fi

echo "Available disks (>10GB):"
echo "$AVAILABLE_DISKS"
echo ""

# Build the menu from one disk id per line, so the list handed to `select`
# is never subject to word-splitting or glob expansion.
DISK_CHOICES=()
while IFS= read -r DISK_ID; do
  if [ -n "$DISK_ID" ]; then
    DISK_CHOICES+=("$DISK_ID")
  fi
done <<<"$AVAILABLE_DISKS"

# Let user choose disk
echo "Select installation disk for node $NODE_NUMBER:"
INSTALL_DISK=""
# `select` returns non-zero when stdin hits EOF; `|| true` keeps `set -e` from
# killing the script there so the explicit check below can report it.
select INSTALL_DISK in "${DISK_CHOICES[@]}"; do
  if [ -n "${INSTALL_DISK:-}" ]; then
    break
  fi
  echo "Invalid selection. Please try again."
done || true

if [ -z "${INSTALL_DISK:-}" ]; then
  echo "Error: No installation disk selected"
  exit 1
fi

# Add /dev/ prefix if not present
if [[ "$INSTALL_DISK" != /dev/* ]]; then
  INSTALL_DISK="/dev/$INSTALL_DISK"
fi
echo "✅ Selected disk: $INSTALL_DISK"
# Update config.yaml with per-node configuration
#
# Reads the node's target IP from ${WC_HOME}/config.yaml, then stores the
# detected interface and the chosen install disk under
# cluster.nodes.control.node<N>, and prints a summary plus the next steps.
# Stops with status 1 if config.yaml is missing or has no IP for this node;
# otherwise the string "null" would be written back as the node's IP and
# reported as a successful registration.
echo "Updating config.yaml with node $NODE_NUMBER configuration..."
CONFIG_FILE="${WC_HOME}/config.yaml"

if [ ! -f "$CONFIG_FILE" ]; then
  echo "Error: Config file not found: $CONFIG_FILE"
  exit 1
fi

NODE_KEY=".cluster.nodes.control.node${NODE_NUMBER}"

# Get the target IP for this node from the existing config
TARGET_IP=$(yq eval "${NODE_KEY}.ip" "$CONFIG_FILE")
# yq prints "null" for a key that is absent or has no value.
if [ -z "$TARGET_IP" ] || [ "$TARGET_IP" = "null" ]; then
  echo "Error: ${NODE_KEY#.}.ip is not set in $CONFIG_FILE"
  echo "Set the target IP for node $NODE_NUMBER in config.yaml and run this script again."
  exit 1
fi

# Use yq to update the per-node configuration
# (the IP is written back unchanged alongside the detected hardware).
yq eval "${NODE_KEY}.ip = \"$TARGET_IP\"" -i "$CONFIG_FILE"
yq eval "${NODE_KEY}.interface = \"$ACTIVE_INTERFACE\"" -i "$CONFIG_FILE"
yq eval "${NODE_KEY}.disk = \"$INSTALL_DISK\"" -i "$CONFIG_FILE"

echo "✅ Updated config.yaml for node $NODE_NUMBER:"
echo " - Target IP: $TARGET_IP"
echo " - Network interface: $ACTIVE_INTERFACE"
echo " - Installation disk: $INSTALL_DISK"
echo ""
echo "🎉 Node $NODE_NUMBER registration complete!"
echo ""
echo "Node configuration saved:"
echo " - Target IP: $TARGET_IP"
echo " - Interface: $ACTIVE_INTERFACE"
echo " - Disk: $INSTALL_DISK"
echo ""
echo "Next steps:"
echo "1. Regenerate machine configurations:"
echo " ./generate-machine-configs.sh"
echo ""
echo "2. Apply configuration to this node:"
echo " talosctl apply-config --insecure -n $NODE_IP --file final/controlplane-node-${NODE_NUMBER}.yaml"
echo ""
echo "3. Wait for reboot and verify static IP connectivity"
echo "4. Repeat registration for additional control plane nodes"