feat(api): Enhance NodeDiscover with subnet auto-detection and discovery cancellation

- Updated NodeDiscover to accept an optional subnet parameter, with auto-detection of local networks if none is provided.
- Removed support for IP list format in NodeDiscover request body.
- Implemented discovery cancellation functionality with NodeDiscoveryCancel endpoint.
- Improved error handling and response messages for better clarity.

feat(cluster): Add operation tracking for cluster bootstrap process

- Integrated operations manager into cluster manager for tracking bootstrap progress.
- Refactored Bootstrap method to run asynchronously with detailed progress updates.
- Added methods to wait for various bootstrap steps (etcd health, VIP assignment, control plane readiness, etc.).

fix(discovery): Optimize node discovery process and improve maintenance mode detection

- Changed node discovery to probe IPs in parallel, using a semaphore to cap concurrency at 50 simultaneous scans.
- Updated probeNode to connect insecurely and report a node only when it responds in maintenance mode.
- Added functions to expand CIDR notation into individual IP addresses and retrieve local network interfaces.

refactor(node): Update node manager to handle instance-specific configurations

- Modified NewManager to accept instanceName for tailored talosconfig usage.
- Improved hardware detection logic to handle maintenance mode scenarios.

feat(operations): Implement detailed bootstrap progress tracking

- Introduced BootstrapProgress struct to track and report the status of bootstrap operations.
- Updated operation management to include bootstrap-specific details.

fix(tools): Improve talosctl command execution with context and error handling

- Added context with timeout to talosctl commands to prevent hanging on unreachable nodes.
- Enhanced error handling for version retrieval in maintenance mode.
This commit is contained in:
2025-11-04 17:16:16 +00:00
parent 005dc30aa5
commit 7cd434aabf
9 changed files with 623 additions and 148 deletions

View File

@@ -3,6 +3,7 @@ package discovery
import (
"encoding/json"
"fmt"
"net"
"os"
"path/filepath"
"sync"
@@ -28,19 +29,17 @@ func NewManager(dataDir string, instanceName string) *Manager {
return &Manager{
dataDir: dataDir,
nodeMgr: node.NewManager(dataDir),
nodeMgr: node.NewManager(dataDir, instanceName),
talosctl: tools.NewTalosconfigWithConfig(talosconfigPath),
}
}
// DiscoveredNode represents a discovered node on the network
// DiscoveredNode represents a discovered node on the network (maintenance mode only)
type DiscoveredNode struct {
IP string `json:"ip"`
Hostname string `json:"hostname,omitempty"`
MaintenanceMode bool `json:"maintenance_mode"`
Version string `json:"version,omitempty"`
Interface string `json:"interface,omitempty"`
Disks []string `json:"disks,omitempty"`
IP string `json:"ip"`
Hostname string `json:"hostname,omitempty"`
MaintenanceMode bool `json:"maintenance_mode"`
Version string `json:"version,omitempty"`
}
// DiscoveryStatus represents the current state of discovery
@@ -130,17 +129,42 @@ func (m *Manager) runDiscovery(instanceName string, ipList []string) {
_ = m.writeDiscoveryStatus(instanceName, status)
}()
// Discover nodes by probing each IP
discoveredNodes := []DiscoveredNode{}
// Discover nodes by probing each IP in parallel
var wg sync.WaitGroup
resultsChan := make(chan DiscoveredNode, len(ipList))
// Limit concurrent scans to avoid overwhelming the network
semaphore := make(chan struct{}, 50)
for _, ip := range ipList {
node, err := m.probeNode(ip)
if err != nil {
// Node not reachable or not a Talos node
continue
}
wg.Add(1)
go func(ip string) {
defer wg.Done()
discoveredNodes = append(discoveredNodes, *node)
// Acquire semaphore
semaphore <- struct{}{}
defer func() { <-semaphore }()
node, err := m.probeNode(ip)
if err != nil {
// Node not reachable or not a Talos node
return
}
resultsChan <- *node
}(ip)
}
// Close results channel when all goroutines complete
go func() {
wg.Wait()
close(resultsChan)
}()
// Collect results and update status incrementally
discoveredNodes := []DiscoveredNode{}
for node := range resultsChan {
discoveredNodes = append(discoveredNodes, node)
// Update status incrementally
m.discoveryMu.Lock()
@@ -151,37 +175,20 @@ func (m *Manager) runDiscovery(instanceName string, ipList []string) {
}
}
// probeNode attempts to detect if a node is running Talos
// probeNode attempts to detect if a node is running Talos in maintenance mode
func (m *Manager) probeNode(ip string) (*DiscoveredNode, error) {
// Attempt to get version (quick connectivity test)
version, err := m.talosctl.GetVersion(ip, false)
// Try insecure connection first (maintenance mode)
version, err := m.talosctl.GetVersion(ip, true)
if err != nil {
// Not in maintenance mode or not reachable
return nil, err
}
// Node is reachable, get hardware info
hwInfo, err := m.nodeMgr.DetectHardware(ip)
if err != nil {
// Still count it as discovered even if we can't get full hardware
return &DiscoveredNode{
IP: ip,
MaintenanceMode: false,
Version: version,
}, nil
}
// Extract just the disk paths for discovery output
diskPaths := make([]string, len(hwInfo.Disks))
for i, disk := range hwInfo.Disks {
diskPaths[i] = disk.Path
}
// If insecure connection works, node is in maintenance mode
return &DiscoveredNode{
IP: ip,
MaintenanceMode: hwInfo.MaintenanceMode,
MaintenanceMode: true,
Version: version,
Interface: hwInfo.Interface,
Disks: diskPaths,
}, nil
}
@@ -245,3 +252,132 @@ func (m *Manager) writeDiscoveryStatus(instanceName string, status *DiscoverySta
return nil
}
// CancelDiscovery marks the discovery run recorded for instanceName as
// cancelled.
//
// It reads the stored status while holding discoveryMu, and if that status is
// active it clears the Active flag, records a cancellation message in Error,
// and writes the status back.
//
// It returns the error from GetDiscoveryStatus or writeDiscoveryStatus if
// either fails, and a "no discovery in progress" error when the stored status
// is not active.
//
// NOTE(review): this function only rewrites the stored status; nothing here
// signals running probes to stop. Confirm the discovery loop observes the
// status change.
// NOTE(review): GetDiscoveryStatus is called with discoveryMu held; verify it
// does not try to take the same lock.
func (m *Manager) CancelDiscovery(instanceName string) error {
	m.discoveryMu.Lock()
	defer m.discoveryMu.Unlock()

	current, err := m.GetDiscoveryStatus(instanceName)
	switch {
	case err != nil:
		return err
	case !current.Active:
		return fmt.Errorf("no discovery in progress")
	}

	// Flip the stored state to "stopped" and explain why.
	current.Active = false
	current.Error = "Discovery cancelled by user"

	return m.writeDiscoveryStatus(instanceName, current)
}
// GetLocalNetworks lists the IPv4 addresses, in CIDR form, configured on the
// host's network interfaces.
//
// Interfaces that are loopback or not up are ignored, as are interfaces whose
// addresses cannot be read. Within each remaining interface, only *net.IPNet
// addresses that are IPv4 and not link-local unicast are kept. Each entry is
// the address formatted by IPNet.String.
//
// The returned slice is nil when nothing qualifies. An error is returned only
// when the interface list itself cannot be obtained.
func GetLocalNetworks() ([]string, error) {
	ifaces, err := net.Interfaces()
	if err != nil {
		return nil, fmt.Errorf("failed to get network interfaces: %w", err)
	}

	var cidrs []string
	for _, nic := range ifaces {
		isLoopback := nic.Flags&net.FlagLoopback != 0
		isUp := nic.Flags&net.FlagUp != 0
		if isLoopback || !isUp {
			continue
		}

		nicAddrs, addrErr := nic.Addrs()
		if addrErr != nil {
			// Best effort: an unreadable interface is skipped, not fatal.
			continue
		}

		for _, a := range nicAddrs {
			// Keep only IPv4 networks that are not link-local.
			if n, isNet := a.(*net.IPNet); isNet && n.IP.To4() != nil && !n.IP.IsLinkLocalUnicast() {
				cidrs = append(cidrs, n.String())
			}
		}
	}

	return cidrs, nil
}
// ExpandSubnet expands a subnet in CIDR notation into its individual host
// addresses.
//
// Example: "192.168.8.0/24" → ["192.168.8.1", ..., "192.168.8.254"].
//
// Accepted inputs:
//   - A bare IP address (no prefix length): returned unchanged as the only
//     element.
//   - A single-host prefix (/32 for IPv4, /128 for IPv6): the given address is
//     returned as the only element.
//   - A two-address prefix (/31 for IPv4, /127 for IPv6): both addresses are
//     returned, since such point-to-point links have no network or broadcast
//     address (RFC 3021).
//   - Any other prefix: every address except the first (network) and the last
//     (broadcast) is returned, in ascending order. The host part of the input
//     is ignored, so "192.168.8.77/24" expands like "192.168.8.0/24".
//
// An error is returned when the input is neither a valid IP nor a valid CIDR,
// or when the subnet has more than 24 host bits (larger than an IPv4 /8),
// which would otherwise exhaust memory or never terminate.
func ExpandSubnet(subnet string) ([]string, error) {
	hostIP, ipnet, err := net.ParseCIDR(subnet)
	if err != nil {
		// Not a CIDR, might be a single IP.
		if net.ParseIP(subnet) != nil {
			return []string{subnet}, nil
		}
		return nil, fmt.Errorf("invalid IP or CIDR: %s", subnet)
	}

	// Compare against the address width rather than a literal 32 so that an
	// IPv6 /32 is not mistaken for a single host.
	ones, bits := ipnet.Mask.Size()
	hostBits := bits - ones
	if hostBits == 0 {
		return []string{hostIP.String()}, nil
	}

	// 24 host bits covers the largest private IPv4 range (10.0.0.0/8).
	const maxHostBits = 24
	if hostBits > maxHostBits {
		return nil, fmt.Errorf("subnet too large to expand: %s (more than %d host bits)", subnet, maxHostBits)
	}

	total := 1 << uint(hostBits)
	// By default skip the network (offset 0) and broadcast (offset total-1)
	// addresses; a two-address subnet has neither.
	first, last := 1, total-2
	if hostBits == 1 {
		first, last = 0, 1
	}

	// Work on a copy so the parsed network address is never mutated.
	cur := make(net.IP, len(ipnet.IP))
	copy(cur, ipnet.IP)

	ips := make([]string, 0, last-first+1)
	// Counting offsets (instead of testing containment) guarantees the loop
	// ends even if the address wraps around.
	for offset := 0; offset < total; offset++ {
		if offset >= first && offset <= last {
			ips = append(ips, cur.String())
		}
		// Advance to the next address, carrying into higher-order bytes.
		for j := len(cur) - 1; j >= 0; j-- {
			cur[j]++
			if cur[j] != 0 {
				break
			}
		}
	}

	return ips, nil
}
// incIP advances ip to the next address, modifying it in place.
//
// The address is treated as a big-endian number: the last byte is incremented,
// and each byte that overflows to zero carries into the byte before it. An
// all-0xFF address wraps around to all zeros.
func incIP(ip net.IP) {
	for pos := len(ip); pos > 0; pos-- {
		ip[pos-1]++
		if ip[pos-1] != 0 {
			// No overflow in this byte, so there is nothing left to carry.
			return
		}
	}
}
// isLastIP reports whether ip has every host bit set under ipnet's mask, i.e.
// whether it is the highest (broadcast) address for that mask.
//
// ip and ipnet.Mask are combined byte by byte, so the mask must be at least as
// long as ip.
func isLastIP(ip net.IP, ipnet *net.IPNet) bool {
	broadcast := make(net.IP, 0, len(ip))
	for idx, octet := range ip {
		// OR-ing with the inverted mask forces all host bits to 1.
		broadcast = append(broadcast, octet|^ipnet.Mask[idx])
	}
	return broadcast.Equal(ip)
}