feat(api): Enhance NodeDiscover with subnet auto-detection and discovery cancellation
- Updated NodeDiscover to accept an optional subnet parameter, with auto-detection of local networks if none is provided.
- Removed support for IP list format in NodeDiscover request body.
- Implemented discovery cancellation functionality with NodeDiscoveryCancel endpoint.
- Improved error handling and response messages for better clarity.

feat(cluster): Add operation tracking for cluster bootstrap process
- Integrated operations manager into cluster manager for tracking bootstrap progress.
- Refactored Bootstrap method to run asynchronously with detailed progress updates.
- Added methods to wait for various bootstrap steps (etcd health, VIP assignment, control plane readiness, etc.).

fix(discovery): Optimize node discovery process and improve maintenance mode detection
- Enhanced node discovery to run in parallel with a semaphore to limit concurrent scans.
- Updated probeNode to detect maintenance mode more reliably.
- Added functions to expand CIDR notation into individual IP addresses and retrieve local network interfaces.

refactor(node): Update node manager to handle instance-specific configurations
- Modified NewManager to accept instanceName for tailored talosconfig usage.
- Improved hardware detection logic to handle maintenance mode scenarios.

feat(operations): Implement detailed bootstrap progress tracking
- Introduced BootstrapProgress struct to track and report the status of bootstrap operations.
- Updated operation management to include bootstrap-specific details.

fix(tools): Improve talosctl command execution with context and error handling
- Added context with timeout to talosctl commands to prevent hanging on unreachable nodes.
- Enhanced error handling for version retrieval in maintenance mode.
This commit is contained in:
@@ -1,10 +1,12 @@
|
||||
package tools
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Talosctl provides a thin wrapper around the talosctl command-line tool
|
||||
@@ -92,8 +94,11 @@ func (t *Talosctl) GetDisks(nodeIP string, insecure bool) ([]DiskInfo, error) {
|
||||
args = append(args, "--insecure")
|
||||
}
|
||||
|
||||
// Build args with talosconfig if available
|
||||
finalArgs := t.buildArgs(args)
|
||||
|
||||
// Use jq to slurp the NDJSON into an array (like v.PoC does with jq -s)
|
||||
talosCmd := exec.Command("talosctl", args...)
|
||||
talosCmd := exec.Command("talosctl", finalArgs...)
|
||||
jqCmd := exec.Command("jq", "-s", ".")
|
||||
|
||||
// Pipe talosctl output to jq
|
||||
@@ -171,8 +176,11 @@ func (t *Talosctl) getResourceJSON(resourceType, nodeIP string, insecure bool) (
|
||||
args = append(args, "--insecure")
|
||||
}
|
||||
|
||||
// Build args with talosconfig if available
|
||||
finalArgs := t.buildArgs(args)
|
||||
|
||||
// Use jq to slurp the NDJSON into an array
|
||||
talosCmd := exec.Command("talosctl", args...)
|
||||
talosCmd := exec.Command("talosctl", finalArgs...)
|
||||
jqCmd := exec.Command("jq", "-s", ".")
|
||||
|
||||
// Pipe talosctl output to jq
|
||||
@@ -280,20 +288,45 @@ func (t *Talosctl) GetPhysicalInterface(nodeIP string, insecure bool) (string, e
|
||||
|
||||
// GetVersion gets Talos version from a node
|
||||
func (t *Talosctl) GetVersion(nodeIP string, insecure bool) (string, error) {
|
||||
args := t.buildArgs([]string{
|
||||
"version",
|
||||
"--nodes", nodeIP,
|
||||
"--short",
|
||||
})
|
||||
var args []string
|
||||
|
||||
// When using insecure mode (for maintenance mode nodes), don't use talosconfig
|
||||
// Insecure mode is for unconfigured nodes that don't have authentication set up
|
||||
if insecure {
|
||||
args = append(args, "--insecure")
|
||||
args = []string{
|
||||
"version",
|
||||
"--nodes", nodeIP,
|
||||
"--short",
|
||||
"--insecure",
|
||||
}
|
||||
} else {
|
||||
// For configured nodes, use talosconfig if available
|
||||
args = t.buildArgs([]string{
|
||||
"version",
|
||||
"--nodes", nodeIP,
|
||||
"--short",
|
||||
})
|
||||
}
|
||||
|
||||
cmd := exec.Command("talosctl", args...)
|
||||
// Use context with timeout to prevent hanging on unreachable nodes
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(ctx, "talosctl", args...)
|
||||
output, err := cmd.CombinedOutput()
|
||||
outputStr := string(output)
|
||||
|
||||
// Special case: In maintenance mode, talosctl version returns an error
|
||||
// "API is not implemented in maintenance mode" but this means the node IS reachable
|
||||
// and IS in maintenance mode, so we treat this as a success
|
||||
if err != nil && strings.Contains(outputStr, "API is not implemented in maintenance mode") {
|
||||
// The server version cannot be queried in maintenance mode, so return the
|
||||
// sentinel value "maintenance" instead of a real version string
|
||||
return "maintenance", nil
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("talosctl version failed: %w\nOutput: %s", err, string(output))
|
||||
return "", fmt.Errorf("talosctl version failed: %w\nOutput: %s", err, outputStr)
|
||||
}
|
||||
|
||||
// Parse output to extract server version
|
||||
|
||||
Reference in New Issue
Block a user