feat(api): Enhance NodeDiscover with subnet auto-detection and discovery cancellation

- Updated NodeDiscover to accept an optional subnet parameter, with auto-detection of local networks if none is provided.
- Removed support for IP list format in NodeDiscover request body.
- Implemented discovery cancellation functionality with NodeDiscoveryCancel endpoint.
- Improved error handling and response messages for better clarity.

feat(cluster): Add operation tracking for cluster bootstrap process

- Integrated operations manager into cluster manager for tracking bootstrap progress.
- Refactored Bootstrap method to run asynchronously with detailed progress updates.
- Added methods to wait for various bootstrap steps (etcd health, VIP assignment, control plane readiness, etc.).

fix(discovery): Optimize node discovery process and improve maintenance mode detection

- Enhanced node discovery to run in parallel with a semaphore to limit concurrent scans.
- Updated probeNode to detect maintenance mode more reliably.
- Added functions to expand CIDR notation into individual IP addresses and retrieve local network interfaces.

refactor(node): Update node manager to handle instance-specific configurations

- Modified NewManager to accept instanceName for tailored talosconfig usage.
- Improved hardware detection logic to handle maintenance mode scenarios.

feat(operations): Implement detailed bootstrap progress tracking

- Introduced BootstrapProgress struct to track and report the status of bootstrap operations.
- Updated operation management to include bootstrap-specific details.

fix(tools): Improve talosctl command execution with context and error handling

- Added context with timeout to talosctl commands to prevent hanging on unreachable nodes.
- Enhanced error handling for version retrieval in maintenance mode.
This commit is contained in:
2025-11-04 17:16:16 +00:00
parent 005dc30aa5
commit 7cd434aabf
9 changed files with 623 additions and 148 deletions

View File

@@ -1,10 +1,12 @@
package tools
import (
"context"
"encoding/json"
"fmt"
"os/exec"
"strings"
"time"
)
// Talosctl provides a thin wrapper around the talosctl command-line tool
@@ -92,8 +94,11 @@ func (t *Talosctl) GetDisks(nodeIP string, insecure bool) ([]DiskInfo, error) {
args = append(args, "--insecure")
}
// Build args with talosconfig if available
finalArgs := t.buildArgs(args)
// Use jq to slurp the NDJSON into an array (like v.PoC does with jq -s)
talosCmd := exec.Command("talosctl", args...)
talosCmd := exec.Command("talosctl", finalArgs...)
jqCmd := exec.Command("jq", "-s", ".")
// Pipe talosctl output to jq
@@ -171,8 +176,11 @@ func (t *Talosctl) getResourceJSON(resourceType, nodeIP string, insecure bool) (
args = append(args, "--insecure")
}
// Build args with talosconfig if available
finalArgs := t.buildArgs(args)
// Use jq to slurp the NDJSON into an array
talosCmd := exec.Command("talosctl", args...)
talosCmd := exec.Command("talosctl", finalArgs...)
jqCmd := exec.Command("jq", "-s", ".")
// Pipe talosctl output to jq
@@ -280,20 +288,45 @@ func (t *Talosctl) GetPhysicalInterface(nodeIP string, insecure bool) (string, e
// GetVersion gets Talos version from a node
func (t *Talosctl) GetVersion(nodeIP string, insecure bool) (string, error) {
args := t.buildArgs([]string{
"version",
"--nodes", nodeIP,
"--short",
})
var args []string
// When using insecure mode (for maintenance mode nodes), don't use talosconfig
// Insecure mode is for unconfigured nodes that don't have authentication set up
if insecure {
args = append(args, "--insecure")
args = []string{
"version",
"--nodes", nodeIP,
"--short",
"--insecure",
}
} else {
// For configured nodes, use talosconfig if available
args = t.buildArgs([]string{
"version",
"--nodes", nodeIP,
"--short",
})
}
cmd := exec.Command("talosctl", args...)
// Use context with timeout to prevent hanging on unreachable nodes
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, "talosctl", args...)
output, err := cmd.CombinedOutput()
outputStr := string(output)
// Special case: In maintenance mode, talosctl version returns an error
// "API is not implemented in maintenance mode" but this means the node IS reachable
// and IS in maintenance mode, so we treat this as a success
if err != nil && strings.Contains(outputStr, "API is not implemented in maintenance mode") {
// The server version cannot be queried in maintenance mode, so return the
// placeholder "maintenance" instead of a real version string
return "maintenance", nil
}
if err != nil {
return "", fmt.Errorf("talosctl version failed: %w\nOutput: %s", err, string(output))
return "", fmt.Errorf("talosctl version failed: %w\nOutput: %s", err, outputStr)
}
// Parse output to extract server version