feat(api): Enhance NodeDiscover with subnet auto-detection and discovery cancellation

- Updated NodeDiscover to accept an optional subnet parameter, with auto-detection of local networks if none is provided.
- Removed support for IP list format in NodeDiscover request body.
- Implemented discovery cancellation functionality with NodeDiscoveryCancel endpoint.
- Improved error handling and response messages for better clarity.

feat(cluster): Add operation tracking for cluster bootstrap process

- Integrated operations manager into cluster manager for tracking bootstrap progress.
- Refactored Bootstrap method to run asynchronously with detailed progress updates.
- Added methods to wait for various bootstrap steps (etcd health, VIP assignment, control plane readiness, etc.).

fix(discovery): Optimize node discovery process and improve maintenance mode detection

- Enhanced node discovery to run in parallel with a semaphore to limit concurrent scans.
- Updated probeNode to detect maintenance mode more reliably.
- Added functions to expand CIDR notation into individual IP addresses and retrieve local network interfaces.

refactor(node): Update node manager to handle instance-specific configurations

- Modified NewManager to accept instanceName for tailored talosconfig usage.
- Improved hardware detection logic to handle maintenance mode scenarios.

feat(operations): Implement detailed bootstrap progress tracking

- Introduced BootstrapProgress struct to track and report the status of bootstrap operations.
- Updated operation management to include bootstrap-specific details.

fix(tools): Improve talosctl command execution with context and error handling

- Added context with timeout to talosctl commands to prevent hanging on unreachable nodes.
- Enhanced error handling for version retrieval in maintenance mode.
2025-11-04 17:16:16 +00:00
parent 005dc30aa5
commit 7cd434aabf
9 changed files with 623 additions and 148 deletions
--- a/internal/operations/operations.go
+++ b/internal/operations/operations.go
@@ -11,6 +11,9 @@ import (
 	"github.com/wild-cloud/wild-central/daemon/internal/tools"
 )

+// totalBootstrapSteps is the number of bootstrap steps (step indices 0-6); UpdateBootstrapProgress derives Progress and the "Step N/total" message from it.
+const totalBootstrapSteps = 7
+
 // Manager handles async operation tracking
 type Manager struct {
 	dataDir string
@@ -23,18 +26,33 @@ func NewManager(dataDir string) *Manager {
 	}
 }

+// BootstrapProgress describes the bootstrap step currently being reported and its retry attempt; it is serialized inside OperationDetails.
+type BootstrapProgress struct {
+	CurrentStep     int    `json:"current_step"`      // zero-based step index, 0-6
+	StepName        string `json:"step_name"`
+	Attempt         int    `json:"attempt"`
+	MaxAttempts     int    `json:"max_attempts"`
+	StepDescription string `json:"step_description"`
+}
+
+// OperationDetails holds details specific to one operation type; the only field shown here is bootstrap progress, emitted under the "bootstrap" key and omitted when nil.
+type OperationDetails struct {
+	BootstrapProgress *BootstrapProgress `json:"bootstrap,omitempty"`
+}
+
 // Operation represents a long-running operation
 type Operation struct {
-	ID        string    `json:"id"`
-	Type      string    `json:"type"` // discover, setup, download, bootstrap
-	Target    string    `json:"target"`
-	Instance  string    `json:"instance"`
-	Status    string    `json:"status"` // pending, running, completed, failed, cancelled
-	Message   string    `json:"message,omitempty"`
-	Progress  int       `json:"progress"`          // 0-100
-	LogFile   string    `json:"logFile,omitempty"` // Path to output log file
-	StartedAt time.Time `json:"started_at"`
-	EndedAt   time.Time `json:"ended_at,omitempty"`
+	ID        string            `json:"id"`
+	Type      string            `json:"type"` // discover, setup, download, bootstrap
+	Target    string            `json:"target"`
+	Instance  string            `json:"instance"`
+	Status    string            `json:"status"` // pending, running, completed, failed, cancelled
+	Message   string            `json:"message,omitempty"`
+	Progress  int               `json:"progress"`          // 0-100
+	Details   *OperationDetails `json:"details,omitempty"` // Operation-specific details (e.g. bootstrap progress); omitted from JSON when nil
+	LogFile   string            `json:"logFile,omitempty"` // Path to output log file
+	StartedAt time.Time         `json:"started_at"`
+	EndedAt   time.Time         `json:"ended_at,omitempty"` // NOTE(review): encoding/json's omitempty does not omit struct values such as time.Time, so a zero EndedAt is still emitted
 }

 // GetOperationsDir returns the operations directory for an instance
@@ -79,19 +97,6 @@ func (m *Manager) Start(instanceName, opType, target string) (string, error) {
 	return opID, nil
 }

-// Get returns operation status
-func (m *Manager) Get(opID string) (*Operation, error) {
-	// Operation ID contains instance name, but we need to find it
-	// For now, we'll scan all instances (not ideal but simple)
-	// Better approach: encode instance in operation ID or maintain index
-
-	// Simplified: assume operation ID format is op_{type}_{target}_{timestamp}
-	// We need to know which instance to look in
-	// For now, return error if we can't find it
-
-	// This needs improvement in actual implementation
-	return nil, fmt.Errorf("operation lookup not implemented - need instance context")
-}

 // GetByInstance returns an operation for a specific instance
 func (m *Manager) GetByInstance(instanceName, opID string) (*Operation, error) {
@@ -238,6 +243,31 @@ func (m *Manager) Cleanup(instanceName string, olderThan time.Duration) error {
 	return nil
 }

+// UpdateBootstrapProgress loads the operation via GetByInstance, replaces its bootstrap progress details, recomputes Progress and Message from the zero-based step, and saves it with writeOperation; lookup and write errors are returned unchanged.
+func (m *Manager) UpdateBootstrapProgress(instanceName, opID string, step int, stepName string, attempt, maxAttempts int, stepDescription string) error {
+	op, err := m.GetByInstance(instanceName, opID)
+	if err != nil {
+		return err
+	}
+
+	if op.Details == nil {
+		op.Details = &OperationDetails{}
+	}
+
+	op.Details.BootstrapProgress = &BootstrapProgress{
+		CurrentStep:     step,
+		StepName:        stepName,
+		Attempt:         attempt,
+		MaxAttempts:     maxAttempts,
+		StepDescription: stepDescription,
+	}
+
+	op.Progress = (step * 100) / (totalBootstrapSteps - 1) // step 0 maps to 0 and the last step (totalBootstrapSteps-1) to 100; out-of-range steps are not clamped
+	op.Message = fmt.Sprintf("Step %d/%d: %s (attempt %d/%d)", step+1, totalBootstrapSteps, stepName, attempt, maxAttempts)
+
+	return m.writeOperation(op)
+}
+
 // writeOperation writes operation to disk
 func (m *Manager) writeOperation(op *Operation) error {
 	opsDir := m.GetOperationsDir(op.Instance)