feat(api): Enhance NodeDiscover with subnet auto-detection and discovery cancellation

- Updated NodeDiscover to accept an optional subnet parameter, with auto-detection of local networks if none is provided.
- Removed support for IP list format in NodeDiscover request body.
- Implemented discovery cancellation functionality with NodeDiscoveryCancel endpoint.
- Improved error handling and response messages for better clarity.

feat(cluster): Add operation tracking for cluster bootstrap process

- Integrated operations manager into cluster manager for tracking bootstrap progress.
- Refactored Bootstrap method to run asynchronously with detailed progress updates.
- Added methods to wait for various bootstrap steps (etcd health, VIP assignment, control plane readiness, etc.).

fix(discovery): Optimize node discovery process and improve maintenance mode detection

- Enhanced node discovery to run in parallel with a semaphore to limit concurrent scans.
- Updated probeNode to detect maintenance mode more reliably.
- Added functions to expand CIDR notation into individual IP addresses and retrieve local network interfaces.

refactor(node): Update node manager to handle instance-specific configurations

- Modified NewManager to accept instanceName for tailored talosconfig usage.
- Improved hardware detection logic to handle maintenance mode scenarios.

feat(operations): Implement detailed bootstrap progress tracking

- Introduced BootstrapProgress struct to track and report the status of bootstrap operations.
- Updated operation management to include bootstrap-specific details.

fix(tools): Improve talosctl command execution with context and error handling

- Added context with timeout to talosctl commands to prevent hanging on unreachable nodes.
- Enhanced error handling for version retrieval in maintenance mode.
This commit is contained in:
2025-11-04 17:16:16 +00:00
parent 005dc30aa5
commit 7cd434aabf
9 changed files with 623 additions and 148 deletions

View File

@@ -11,6 +11,9 @@ import (
"github.com/wild-cloud/wild-central/daemon/internal/tools"
)
// Bootstrap step constants
const totalBootstrapSteps = 7
// Manager handles async operation tracking
type Manager struct {
dataDir string
@@ -23,18 +26,33 @@ func NewManager(dataDir string) *Manager {
}
}
// BootstrapProgress tracks detailed bootstrap progress
type BootstrapProgress struct {
CurrentStep int `json:"current_step"` // 0-6
StepName string `json:"step_name"`
Attempt int `json:"attempt"`
MaxAttempts int `json:"max_attempts"`
StepDescription string `json:"step_description"`
}
// OperationDetails contains operation-specific details
type OperationDetails struct {
BootstrapProgress *BootstrapProgress `json:"bootstrap,omitempty"`
}
// Operation represents a long-running operation
type Operation struct {
ID string `json:"id"`
Type string `json:"type"` // discover, setup, download, bootstrap
Target string `json:"target"`
Instance string `json:"instance"`
Status string `json:"status"` // pending, running, completed, failed, cancelled
Message string `json:"message,omitempty"`
Progress int `json:"progress"` // 0-100
LogFile string `json:"logFile,omitempty"` // Path to output log file
StartedAt time.Time `json:"started_at"`
EndedAt time.Time `json:"ended_at,omitempty"`
ID string `json:"id"`
Type string `json:"type"` // discover, setup, download, bootstrap
Target string `json:"target"`
Instance string `json:"instance"`
Status string `json:"status"` // pending, running, completed, failed, cancelled
Message string `json:"message,omitempty"`
Progress int `json:"progress"` // 0-100
Details *OperationDetails `json:"details,omitempty"` // Operation-specific details
LogFile string `json:"logFile,omitempty"` // Path to output log file
StartedAt time.Time `json:"started_at"`
EndedAt time.Time `json:"ended_at,omitempty"`
}
// GetOperationsDir returns the operations directory for an instance
@@ -79,19 +97,6 @@ func (m *Manager) Start(instanceName, opType, target string) (string, error) {
return opID, nil
}
// Get returns operation status
func (m *Manager) Get(opID string) (*Operation, error) {
// Operation ID contains instance name, but we need to find it
// For now, we'll scan all instances (not ideal but simple)
// Better approach: encode instance in operation ID or maintain index
// Simplified: assume operation ID format is op_{type}_{target}_{timestamp}
// We need to know which instance to look in
// For now, return error if we can't find it
// This needs improvement in actual implementation
return nil, fmt.Errorf("operation lookup not implemented - need instance context")
}
// GetByInstance returns an operation for a specific instance
func (m *Manager) GetByInstance(instanceName, opID string) (*Operation, error) {
@@ -238,6 +243,31 @@ func (m *Manager) Cleanup(instanceName string, olderThan time.Duration) error {
return nil
}
// UpdateBootstrapProgress updates bootstrap-specific progress details
func (m *Manager) UpdateBootstrapProgress(instanceName, opID string, step int, stepName string, attempt, maxAttempts int, stepDescription string) error {
op, err := m.GetByInstance(instanceName, opID)
if err != nil {
return err
}
if op.Details == nil {
op.Details = &OperationDetails{}
}
op.Details.BootstrapProgress = &BootstrapProgress{
CurrentStep: step,
StepName: stepName,
Attempt: attempt,
MaxAttempts: maxAttempts,
StepDescription: stepDescription,
}
op.Progress = (step * 100) / (totalBootstrapSteps - 1)
op.Message = fmt.Sprintf("Step %d/%d: %s (attempt %d/%d)", step+1, totalBootstrapSteps, stepName, attempt, maxAttempts)
return m.writeOperation(op)
}
// writeOperation writes operation to disk
func (m *Manager) writeOperation(op *Operation) error {
opsDir := m.GetOperationsDir(op.Instance)