feat(api): Enhance NodeDiscover with subnet auto-detection and discovery cancellation
- Updated NodeDiscover to accept an optional subnet parameter, with auto-detection of local networks if none is provided.
- Removed support for IP list format in NodeDiscover request body.
- Implemented discovery cancellation functionality with NodeDiscoveryCancel endpoint.
- Improved error handling and response messages for better clarity.

feat(cluster): Add operation tracking for cluster bootstrap process
- Integrated operations manager into cluster manager for tracking bootstrap progress.
- Refactored Bootstrap method to run asynchronously with detailed progress updates.
- Added methods to wait for various bootstrap steps (etcd health, VIP assignment, control plane readiness, etc.).

fix(discovery): Optimize node discovery process and improve maintenance mode detection
- Enhanced node discovery to run in parallel with a semaphore to limit concurrent scans.
- Updated probeNode to detect maintenance mode more reliably.
- Added functions to expand CIDR notation into individual IP addresses and retrieve local network interfaces.

refactor(node): Update node manager to handle instance-specific configurations
- Modified NewManager to accept instanceName for tailored talosconfig usage.
- Improved hardware detection logic to handle maintenance mode scenarios.

feat(operations): Implement detailed bootstrap progress tracking
- Introduced BootstrapProgress struct to track and report the status of bootstrap operations.
- Updated operation management to include bootstrap-specific details.

fix(tools): Improve talosctl command execution with context and error handling
- Added context with timeout to talosctl commands to prevent hanging on unreachable nodes.
- Enhanced error handling for version retrieval in maintenance mode.
This commit is contained in:
@@ -3,6 +3,7 @@ package discovery
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
@@ -28,19 +29,17 @@ func NewManager(dataDir string, instanceName string) *Manager {
|
||||
|
||||
return &Manager{
|
||||
dataDir: dataDir,
|
||||
nodeMgr: node.NewManager(dataDir),
|
||||
nodeMgr: node.NewManager(dataDir, instanceName),
|
||||
talosctl: tools.NewTalosconfigWithConfig(talosconfigPath),
|
||||
}
|
||||
}
|
||||
|
||||
// DiscoveredNode represents a discovered node on the network
|
||||
// DiscoveredNode represents a discovered node on the network (maintenance mode only)
|
||||
type DiscoveredNode struct {
|
||||
IP string `json:"ip"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
MaintenanceMode bool `json:"maintenance_mode"`
|
||||
Version string `json:"version,omitempty"`
|
||||
Interface string `json:"interface,omitempty"`
|
||||
Disks []string `json:"disks,omitempty"`
|
||||
IP string `json:"ip"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
MaintenanceMode bool `json:"maintenance_mode"`
|
||||
Version string `json:"version,omitempty"`
|
||||
}
|
||||
|
||||
// DiscoveryStatus represents the current state of discovery
|
||||
@@ -130,17 +129,42 @@ func (m *Manager) runDiscovery(instanceName string, ipList []string) {
|
||||
_ = m.writeDiscoveryStatus(instanceName, status)
|
||||
}()
|
||||
|
||||
// Discover nodes by probing each IP
|
||||
discoveredNodes := []DiscoveredNode{}
|
||||
// Discover nodes by probing each IP in parallel
|
||||
var wg sync.WaitGroup
|
||||
resultsChan := make(chan DiscoveredNode, len(ipList))
|
||||
|
||||
// Limit concurrent scans to avoid overwhelming the network
|
||||
semaphore := make(chan struct{}, 50)
|
||||
|
||||
for _, ip := range ipList {
|
||||
node, err := m.probeNode(ip)
|
||||
if err != nil {
|
||||
// Node not reachable or not a Talos node
|
||||
continue
|
||||
}
|
||||
wg.Add(1)
|
||||
go func(ip string) {
|
||||
defer wg.Done()
|
||||
|
||||
discoveredNodes = append(discoveredNodes, *node)
|
||||
// Acquire semaphore
|
||||
semaphore <- struct{}{}
|
||||
defer func() { <-semaphore }()
|
||||
|
||||
node, err := m.probeNode(ip)
|
||||
if err != nil {
|
||||
// Node not reachable or not a Talos node
|
||||
return
|
||||
}
|
||||
|
||||
resultsChan <- *node
|
||||
}(ip)
|
||||
}
|
||||
|
||||
// Close results channel when all goroutines complete
|
||||
go func() {
|
||||
wg.Wait()
|
||||
close(resultsChan)
|
||||
}()
|
||||
|
||||
// Collect results and update status incrementally
|
||||
discoveredNodes := []DiscoveredNode{}
|
||||
for node := range resultsChan {
|
||||
discoveredNodes = append(discoveredNodes, node)
|
||||
|
||||
// Update status incrementally
|
||||
m.discoveryMu.Lock()
|
||||
@@ -151,37 +175,20 @@ func (m *Manager) runDiscovery(instanceName string, ipList []string) {
|
||||
}
|
||||
}
|
||||
|
||||
// probeNode attempts to detect if a node is running Talos
|
||||
// probeNode attempts to detect if a node is running Talos in maintenance mode
|
||||
func (m *Manager) probeNode(ip string) (*DiscoveredNode, error) {
|
||||
// Attempt to get version (quick connectivity test)
|
||||
version, err := m.talosctl.GetVersion(ip, false)
|
||||
// Try insecure connection first (maintenance mode)
|
||||
version, err := m.talosctl.GetVersion(ip, true)
|
||||
if err != nil {
|
||||
// Not in maintenance mode or not reachable
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Node is reachable, get hardware info
|
||||
hwInfo, err := m.nodeMgr.DetectHardware(ip)
|
||||
if err != nil {
|
||||
// Still count it as discovered even if we can't get full hardware
|
||||
return &DiscoveredNode{
|
||||
IP: ip,
|
||||
MaintenanceMode: false,
|
||||
Version: version,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Extract just the disk paths for discovery output
|
||||
diskPaths := make([]string, len(hwInfo.Disks))
|
||||
for i, disk := range hwInfo.Disks {
|
||||
diskPaths[i] = disk.Path
|
||||
}
|
||||
|
||||
// If insecure connection works, node is in maintenance mode
|
||||
return &DiscoveredNode{
|
||||
IP: ip,
|
||||
MaintenanceMode: hwInfo.MaintenanceMode,
|
||||
MaintenanceMode: true,
|
||||
Version: version,
|
||||
Interface: hwInfo.Interface,
|
||||
Disks: diskPaths,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -245,3 +252,132 @@ func (m *Manager) writeDiscoveryStatus(instanceName string, status *DiscoverySta
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// CancelDiscovery cancels an in-progress discovery operation
|
||||
func (m *Manager) CancelDiscovery(instanceName string) error {
|
||||
m.discoveryMu.Lock()
|
||||
defer m.discoveryMu.Unlock()
|
||||
|
||||
// Get current status
|
||||
status, err := m.GetDiscoveryStatus(instanceName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if !status.Active {
|
||||
return fmt.Errorf("no discovery in progress")
|
||||
}
|
||||
|
||||
// Mark discovery as cancelled
|
||||
status.Active = false
|
||||
status.Error = "Discovery cancelled by user"
|
||||
|
||||
if err := m.writeDiscoveryStatus(instanceName, status); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetLocalNetworks lists the IPv4 addresses, in CIDR notation, that are
// assigned to this machine's usable network interfaces.
//
// Interfaces that are down or are loopback devices are ignored, as are
// IPv6 addresses and link-local (169.254.0.0/16) addresses. Interfaces whose
// addresses cannot be read are silently skipped. Each entry is the
// interface's own address with its prefix length (the result of
// net.IPNet.String), not the masked network address.
//
// An error is returned only when the interface list itself cannot be read.
func GetLocalNetworks() ([]string, error) {
	ifaces, err := net.Interfaces()
	if err != nil {
		return nil, fmt.Errorf("failed to get network interfaces: %w", err)
	}

	var cidrs []string
	for i := range ifaces {
		nic := &ifaces[i]

		// Only interfaces that are up and are not loopback devices qualify.
		usable := nic.Flags&net.FlagUp != 0 && nic.Flags&net.FlagLoopback == 0
		if !usable {
			continue
		}

		addrs, addrErr := nic.Addrs()
		if addrErr != nil {
			continue
		}

		for _, a := range addrs {
			// Keep IPv4, non-link-local network addresses only.
			if n, isNet := a.(*net.IPNet); isNet && n.IP.To4() != nil && !n.IP.IsLinkLocalUnicast() {
				cidrs = append(cidrs, n.String())
			}
		}
	}

	return cidrs, nil
}
|
||||
|
||||
// ExpandSubnet expands a subnet in CIDR notation into the individual host
// addresses it contains.
// Example: "192.168.8.0/24" → ["192.168.8.1", "192.168.8.2", ..., "192.168.8.254"]
//
// The address part is masked before expansion, so "192.168.8.57/24" yields
// the same list as "192.168.8.0/24". A bare IP address (no prefix) is
// returned unchanged as a one-element list.
//
// Special cases:
//   - A full-length prefix (/32 for IPv4, /128 for IPv6) yields the single
//     address given.
//   - A prefix with one host bit (/31, /127) yields both addresses; such
//     point-to-point links have no network/broadcast address (RFC 3021).
//   - For every other prefix the first (network) and last (broadcast)
//     addresses are omitted.
//
// An error is returned when subnet is neither an IP nor a CIDR, or when the
// prefix leaves more than 24 host bits (more than an IPv4 /8): expanding
// such a range would not finish in any useful time or memory budget.
func ExpandSubnet(subnet string) ([]string, error) {
	// Largest expansion allowed: 2^24 addresses.
	const maxHostBits = 24

	ip, ipnet, err := net.ParseCIDR(subnet)
	if err != nil {
		// Not a CIDR, might be a single IP.
		if net.ParseIP(subnet) != nil {
			return []string{subnet}, nil
		}
		return nil, fmt.Errorf("invalid IP or CIDR: %s", subnet)
	}

	// Compare the prefix against the address width rather than a literal 32,
	// so that an IPv6 /32 is not mistaken for a single host.
	ones, bits := ipnet.Mask.Size()
	hostBits := bits - ones
	if hostBits == 0 {
		return []string{ip.String()}, nil
	}
	if hostBits > maxHostBits {
		return nil, fmt.Errorf("subnet %s is too large to expand (%d host bits, maximum %d)", subnet, hostBits, maxHostBits)
	}

	// With a single host bit there is no network/broadcast pair to drop.
	skipEdges := hostBits > 1
	total := 1 << hostBits
	if skipEdges {
		total -= 2
	}
	ips := make([]string, 0, total)

	// ip.Mask returns a fresh slice, so incrementing addr in place does not
	// disturb ipnet.IP.
	for addr := ip.Mask(ipnet.Mask); ipnet.Contains(addr); incIP(addr) {
		if skipEdges && (addr.Equal(ipnet.IP) || isLastIP(addr, ipnet)) {
			continue
		}
		ips = append(ips, addr.String())
	}

	return ips, nil
}

// incIP increments ip in place by one, treating it as a big-endian unsigned
// integer. An all-ones address wraps around to all zeros.
func incIP(ip net.IP) {
	for j := len(ip) - 1; j >= 0; j-- {
		ip[j]++
		// A non-zero byte means no carry into the next more significant byte.
		if ip[j] != 0 {
			return
		}
	}
}

// isLastIP reports whether ip is the last address of ipnet, i.e. every host
// bit is set (the broadcast address for IPv4).
//
// ip may be in either 4-byte or 16-byte form; it is converted to the width of
// the mask before comparing. It returns false when ipnet is nil or when ip
// cannot be represented at the mask's width (for example an IPv6 address
// checked against an IPv4 mask).
func isLastIP(ip net.IP, ipnet *net.IPNet) bool {
	if ipnet == nil {
		return false
	}
	mask := ipnet.Mask

	// net.ParseIP returns IPv4 addresses in 16-byte form while net.ParseCIDR
	// produces 4-byte masks; normalise so both are indexed identically.
	switch len(mask) {
	case net.IPv4len:
		ip = ip.To4()
	case net.IPv6len:
		ip = ip.To16()
	}
	if len(ip) == 0 || len(ip) != len(mask) {
		return false
	}

	// All host bits are set exactly when OR-ing in the mask gives all ones.
	for i, b := range ip {
		if b|mask[i] != 0xff {
			return false
		}
	}
	return true
}
|
||||
|
||||
Reference in New Issue
Block a user