- Updated NodeDiscover to accept an optional subnet parameter, with auto-detection of local networks if none is provided. - Removed support for IP list format in NodeDiscover request body. - Implemented discovery cancellation functionality with NodeDiscoveryCancel endpoint. - Improved error handling and response messages for better clarity. feat(cluster): Add operation tracking for cluster bootstrap process - Integrated operations manager into cluster manager for tracking bootstrap progress. - Refactored Bootstrap method to run asynchronously with detailed progress updates. - Added methods to wait for various bootstrap steps (etcd health, VIP assignment, control plane readiness, etc.). fix(discovery): Optimize node discovery process and improve maintenance mode detection - Enhanced node discovery to run in parallel with a semaphore to limit concurrent scans. - Updated probeNode to detect maintenance mode more reliably. - Added functions to expand CIDR notation into individual IP addresses and retrieve local network interfaces. refactor(node): Update node manager to handle instance-specific configurations - Modified NewManager to accept instanceName for tailored talosconfig usage. - Improved hardware detection logic to handle maintenance mode scenarios. feat(operations): Implement detailed bootstrap progress tracking - Introduced BootstrapProgress struct to track and report the status of bootstrap operations. - Updated operation management to include bootstrap-specific details. fix(tools): Improve talosctl command execution with context and error handling - Added context with timeout to talosctl commands to prevent hanging on unreachable nodes. - Enhanced error handling for version retrieval in maintenance mode.
384 lines
9.4 KiB
Go
384 lines
9.4 KiB
Go
package discovery
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"net"
|
|
"os"
|
|
"path/filepath"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/wild-cloud/wild-central/daemon/internal/node"
|
|
"github.com/wild-cloud/wild-central/daemon/internal/storage"
|
|
"github.com/wild-cloud/wild-central/daemon/internal/tools"
|
|
)
|
|
|
|
// Manager handles node discovery operations
|
|
type Manager struct {
|
|
dataDir string
|
|
nodeMgr *node.Manager
|
|
talosctl *tools.Talosctl
|
|
discoveryMu sync.Mutex
|
|
}
|
|
|
|
// NewManager creates a new discovery manager
|
|
func NewManager(dataDir string, instanceName string) *Manager {
|
|
// Get talosconfig path for the instance
|
|
talosconfigPath := tools.GetTalosconfigPath(dataDir, instanceName)
|
|
|
|
return &Manager{
|
|
dataDir: dataDir,
|
|
nodeMgr: node.NewManager(dataDir, instanceName),
|
|
talosctl: tools.NewTalosconfigWithConfig(talosconfigPath),
|
|
}
|
|
}
|
|
|
|
// DiscoveredNode describes a single node found during discovery. Only nodes
// that answer in maintenance mode are reported.
type DiscoveredNode struct {
	// IP is the address that was probed.
	IP string `json:"ip"`
	// Hostname is optional and omitted from JSON when empty.
	Hostname string `json:"hostname,omitempty"`
	// MaintenanceMode reports whether the node answered in maintenance mode.
	MaintenanceMode bool `json:"maintenance_mode"`
	// Version is the version string reported by the node, omitted from JSON when empty.
	Version string `json:"version,omitempty"`
}
|
|
|
|
// DiscoveryStatus represents the current state of discovery
|
|
type DiscoveryStatus struct {
|
|
Active bool `json:"active"`
|
|
StartedAt time.Time `json:"started_at,omitempty"`
|
|
NodesFound []DiscoveredNode `json:"nodes_found"`
|
|
Error string `json:"error,omitempty"`
|
|
}
|
|
|
|
// GetDiscoveryDir returns the discovery directory for an instance
|
|
func (m *Manager) GetDiscoveryDir(instanceName string) string {
|
|
return tools.GetInstanceDiscoveryPath(m.dataDir, instanceName)
|
|
}
|
|
|
|
// GetDiscoveryStatusPath returns the path to discovery status file
|
|
func (m *Manager) GetDiscoveryStatusPath(instanceName string) string {
|
|
return filepath.Join(m.GetDiscoveryDir(instanceName), "status.json")
|
|
}
|
|
|
|
// GetDiscoveryStatus returns current discovery operation status
|
|
func (m *Manager) GetDiscoveryStatus(instanceName string) (*DiscoveryStatus, error) {
|
|
statusPath := m.GetDiscoveryStatusPath(instanceName)
|
|
|
|
if !storage.FileExists(statusPath) {
|
|
// No discovery has been run yet
|
|
return &DiscoveryStatus{
|
|
Active: false,
|
|
NodesFound: []DiscoveredNode{},
|
|
}, nil
|
|
}
|
|
|
|
data, err := os.ReadFile(statusPath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read discovery status: %w", err)
|
|
}
|
|
|
|
var status DiscoveryStatus
|
|
if err := json.Unmarshal(data, &status); err != nil {
|
|
return nil, fmt.Errorf("failed to parse discovery status: %w", err)
|
|
}
|
|
|
|
return &status, nil
|
|
}
|
|
|
|
// StartDiscovery initiates an async discovery operation
|
|
func (m *Manager) StartDiscovery(instanceName string, ipList []string) error {
|
|
m.discoveryMu.Lock()
|
|
defer m.discoveryMu.Unlock()
|
|
|
|
// Check if discovery is already running
|
|
status, err := m.GetDiscoveryStatus(instanceName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if status.Active {
|
|
return fmt.Errorf("discovery already in progress")
|
|
}
|
|
|
|
// Initialize discovery status
|
|
newStatus := &DiscoveryStatus{
|
|
Active: true,
|
|
StartedAt: time.Now(),
|
|
NodesFound: []DiscoveredNode{},
|
|
}
|
|
|
|
if err := m.writeDiscoveryStatus(instanceName, newStatus); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Start discovery in background
|
|
go m.runDiscovery(instanceName, ipList)
|
|
|
|
return nil
|
|
}
|
|
|
|
// runDiscovery performs the actual discovery operation
|
|
func (m *Manager) runDiscovery(instanceName string, ipList []string) {
|
|
defer func() {
|
|
// Mark discovery as complete
|
|
m.discoveryMu.Lock()
|
|
defer m.discoveryMu.Unlock()
|
|
|
|
status, _ := m.GetDiscoveryStatus(instanceName)
|
|
status.Active = false
|
|
_ = m.writeDiscoveryStatus(instanceName, status)
|
|
}()
|
|
|
|
// Discover nodes by probing each IP in parallel
|
|
var wg sync.WaitGroup
|
|
resultsChan := make(chan DiscoveredNode, len(ipList))
|
|
|
|
// Limit concurrent scans to avoid overwhelming the network
|
|
semaphore := make(chan struct{}, 50)
|
|
|
|
for _, ip := range ipList {
|
|
wg.Add(1)
|
|
go func(ip string) {
|
|
defer wg.Done()
|
|
|
|
// Acquire semaphore
|
|
semaphore <- struct{}{}
|
|
defer func() { <-semaphore }()
|
|
|
|
node, err := m.probeNode(ip)
|
|
if err != nil {
|
|
// Node not reachable or not a Talos node
|
|
return
|
|
}
|
|
|
|
resultsChan <- *node
|
|
}(ip)
|
|
}
|
|
|
|
// Close results channel when all goroutines complete
|
|
go func() {
|
|
wg.Wait()
|
|
close(resultsChan)
|
|
}()
|
|
|
|
// Collect results and update status incrementally
|
|
discoveredNodes := []DiscoveredNode{}
|
|
for node := range resultsChan {
|
|
discoveredNodes = append(discoveredNodes, node)
|
|
|
|
// Update status incrementally
|
|
m.discoveryMu.Lock()
|
|
status, _ := m.GetDiscoveryStatus(instanceName)
|
|
status.NodesFound = discoveredNodes
|
|
_ = m.writeDiscoveryStatus(instanceName, status)
|
|
m.discoveryMu.Unlock()
|
|
}
|
|
}
|
|
|
|
// probeNode attempts to detect if a node is running Talos in maintenance mode
|
|
func (m *Manager) probeNode(ip string) (*DiscoveredNode, error) {
|
|
// Try insecure connection first (maintenance mode)
|
|
version, err := m.talosctl.GetVersion(ip, true)
|
|
if err != nil {
|
|
// Not in maintenance mode or not reachable
|
|
return nil, err
|
|
}
|
|
|
|
// If insecure connection works, node is in maintenance mode
|
|
return &DiscoveredNode{
|
|
IP: ip,
|
|
MaintenanceMode: true,
|
|
Version: version,
|
|
}, nil
|
|
}
|
|
|
|
// DiscoverNodes performs synchronous discovery (for simple cases)
|
|
func (m *Manager) DiscoverNodes(instanceName string, ipList []string) ([]DiscoveredNode, error) {
|
|
nodes := []DiscoveredNode{}
|
|
|
|
for _, ip := range ipList {
|
|
node, err := m.probeNode(ip)
|
|
if err != nil {
|
|
// Skip unreachable nodes
|
|
continue
|
|
}
|
|
nodes = append(nodes, *node)
|
|
}
|
|
|
|
// Save results
|
|
status := &DiscoveryStatus{
|
|
Active: false,
|
|
StartedAt: time.Now(),
|
|
NodesFound: nodes,
|
|
}
|
|
|
|
if err := m.writeDiscoveryStatus(instanceName, status); err != nil {
|
|
return nodes, err // Return nodes even if we can't save status
|
|
}
|
|
|
|
return nodes, nil
|
|
}
|
|
|
|
// ClearDiscoveryStatus removes discovery status file
|
|
func (m *Manager) ClearDiscoveryStatus(instanceName string) error {
|
|
statusPath := m.GetDiscoveryStatusPath(instanceName)
|
|
|
|
if !storage.FileExists(statusPath) {
|
|
return nil // Already cleared, idempotent
|
|
}
|
|
|
|
return os.Remove(statusPath)
|
|
}
|
|
|
|
// writeDiscoveryStatus writes discovery status to disk
|
|
func (m *Manager) writeDiscoveryStatus(instanceName string, status *DiscoveryStatus) error {
|
|
discoveryDir := m.GetDiscoveryDir(instanceName)
|
|
|
|
// Ensure directory exists
|
|
if err := storage.EnsureDir(discoveryDir, 0755); err != nil {
|
|
return err
|
|
}
|
|
|
|
statusPath := m.GetDiscoveryStatusPath(instanceName)
|
|
|
|
data, err := json.MarshalIndent(status, "", " ")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to marshal discovery status: %w", err)
|
|
}
|
|
|
|
if err := storage.WriteFile(statusPath, data, 0644); err != nil {
|
|
return fmt.Errorf("failed to write discovery status: %w", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// CancelDiscovery cancels an in-progress discovery operation
|
|
func (m *Manager) CancelDiscovery(instanceName string) error {
|
|
m.discoveryMu.Lock()
|
|
defer m.discoveryMu.Unlock()
|
|
|
|
// Get current status
|
|
status, err := m.GetDiscoveryStatus(instanceName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if !status.Active {
|
|
return fmt.Errorf("no discovery in progress")
|
|
}
|
|
|
|
// Mark discovery as cancelled
|
|
status.Active = false
|
|
status.Error = "Discovery cancelled by user"
|
|
|
|
if err := m.writeDiscoveryStatus(instanceName, status); err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// GetLocalNetworks enumerates the host's network interfaces and returns the
// IPv4 addresses assigned to them in CIDR form (as produced by
// net.IPNet.String). Interfaces that are loopback or not up are ignored, as
// are link-local addresses and anything that is not IPv4. Interfaces whose
// addresses cannot be listed are skipped silently.
func GetLocalNetworks() ([]string, error) {
	ifaces, err := net.Interfaces()
	if err != nil {
		return nil, fmt.Errorf("failed to get network interfaces: %w", err)
	}

	var cidrs []string
	for i := range ifaces {
		flags := ifaces[i].Flags
		isLoopback := flags&net.FlagLoopback != 0
		isUp := flags&net.FlagUp != 0
		if isLoopback || !isUp {
			continue
		}

		assigned, addrErr := ifaces[i].Addrs()
		if addrErr != nil {
			continue
		}

		for _, a := range assigned {
			// Keep only IPv4 networks that are not link-local (169.254.0.0/16).
			if n, isNet := a.(*net.IPNet); isNet && n.IP.To4() != nil && !n.IP.IsLinkLocalUnicast() {
				cidrs = append(cidrs, n.String())
			}
		}
	}

	return cidrs, nil
}
|
|
|
|
// ExpandSubnet expands a CIDR notation subnet into individual IP addresses.
// Example: "192.168.8.0/24" → ["192.168.8.1", "192.168.8.2", ..., "192.168.8.254"]
//
// Rules:
//   - A plain IP address (no prefix) is returned unchanged as a single entry.
//   - A single-host prefix (/32 for IPv4, /128 for IPv6) yields the given address.
//   - A two-address prefix (/31, /127) yields both addresses, since such
//     point-to-point links have no network or broadcast address.
//   - Otherwise the first (network) and last (broadcast) addresses are skipped.
//   - Subnets with more than 16 host bits (larger than an IPv4 /16) are
//     rejected with an error instead of being expanded: the result would be
//     impractically large, and a prefix covering the whole address space
//     would otherwise never terminate.
//
// An error is returned when subnet is neither a valid IP nor a valid CIDR.
func ExpandSubnet(subnet string) ([]string, error) {
	// maxHostBits bounds the expansion to at most 65,536 addresses.
	const maxHostBits = 16

	// Check if it's a CIDR notation
	ip, ipnet, err := net.ParseCIDR(subnet)
	if err != nil {
		// Not a CIDR, might be single IP
		if net.ParseIP(subnet) != nil {
			return []string{subnet}, nil
		}
		return nil, fmt.Errorf("invalid IP or CIDR: %s", subnet)
	}

	// Use the host-bit count rather than comparing the prefix to 32 so the
	// logic is correct for both address families.
	ones, bits := ipnet.Mask.Size()
	hostBits := bits - ones

	// Special case: single host - just return the IP
	if hostBits == 0 {
		return []string{ip.String()}, nil
	}

	if hostBits > maxHostBits {
		return nil, fmt.Errorf("subnet %s is too large to expand (prefix must be /%d or longer)", subnet, bits-maxHostBits)
	}

	pointToPoint := hostBits == 1
	ips := make([]string, 0, 1<<uint(hostBits))

	// Iterate through all IPs in the subnet, starting at the network address.
	for cur := ip.Mask(ipnet.Mask); ipnet.Contains(cur); incIP(cur) {
		first := cur.Equal(ipnet.IP)
		last := isLastIP(cur, ipnet)

		if pointToPoint || (!first && !last) {
			ips = append(ips, cur.String())
		}

		// Stop explicitly at the last address so termination never depends
		// on incIP stepping outside the subnet (it wraps at the top of the
		// address space).
		if last {
			break
		}
	}

	return ips, nil
}

// incIP increments an IP address in place, treating it as a big-endian
// integer. The all-ones address wraps around to all zeros.
func incIP(ip net.IP) {
	for j := len(ip) - 1; j >= 0; j-- {
		ip[j]++
		// A non-zero byte means there was no carry into the next byte.
		if ip[j] != 0 {
			break
		}
	}
}

// isLastIP checks if an IP is the last IP in a subnet (broadcast address),
// i.e. every host bit of ip under ipnet's mask is set. It returns false when
// the address and mask lengths cannot be reconciled.
func isLastIP(ip net.IP, ipnet *net.IPNet) bool {
	mask := ipnet.Mask

	// Accept a 16-byte IPv4-mapped address against a 4-byte IPv4 mask.
	if v4 := ip.To4(); v4 != nil && len(mask) == net.IPv4len {
		ip = v4
	}
	if len(ip) == 0 || len(ip) != len(mask) {
		return false
	}

	for i := range ip {
		// ^mask[i] selects the host bits of this byte; all must be set in ip.
		if ip[i]&^mask[i] != ^mask[i] {
			return false
		}
	}
	return true
}
|