feat(api): Enhance NodeDiscover with subnet auto-detection and discovery cancellation

- Updated NodeDiscover to accept an optional subnet parameter, with auto-detection of local networks if none is provided.
- Removed support for IP list format in NodeDiscover request body.
- Implemented discovery cancellation functionality with NodeDiscoveryCancel endpoint.
- Improved error handling and response messages for better clarity.

feat(cluster): Add operation tracking for cluster bootstrap process

- Integrated operations manager into cluster manager for tracking bootstrap progress.
- Refactored Bootstrap method to run asynchronously with detailed progress updates.
- Added methods to wait for various bootstrap steps (etcd health, VIP assignment, control plane readiness, etc.).

fix(discovery): Optimize node discovery process and improve maintenance mode detection

- Enhanced node discovery to run in parallel with a semaphore to limit concurrent scans.
- Updated probeNode to detect maintenance mode more reliably.
- Added functions to expand CIDR notation into individual IP addresses and retrieve local network interfaces.

refactor(node): Update node manager to handle instance-specific configurations

- Modified NewManager to accept instanceName for tailored talosconfig usage.
- Improved hardware detection logic to handle maintenance mode scenarios.

feat(operations): Implement detailed bootstrap progress tracking

- Introduced BootstrapProgress struct to track and report the status of bootstrap operations.
- Updated operation management to include bootstrap-specific details.

fix(tools): Improve talosctl command execution with context and error handling

- Added context with timeout to talosctl commands to prevent hanging on unreachable nodes.
- Enhanced error handling for version retrieval in maintenance mode.
This commit is contained in:
2025-11-04 17:16:16 +00:00
parent 005dc30aa5
commit 7cd434aabf
9 changed files with 623 additions and 148 deletions

View File

@@ -30,6 +30,7 @@ type API struct {
context *context.Manager
instance *instance.Manager
dnsmasq *dnsmasq.ConfigGenerator
opsMgr *operations.Manager // Operations manager
broadcaster *operations.Broadcaster // SSE broadcaster for operation output
}
@@ -57,6 +58,7 @@ func NewAPI(dataDir, appsDir string) (*API, error) {
context: context.NewManager(dataDir),
instance: instance.NewManager(dataDir),
dnsmasq: dnsmasq.NewConfigGenerator(dnsmasqConfigPath),
opsMgr: operations.NewManager(dataDir),
broadcaster: operations.NewBroadcaster(),
}, nil
}
@@ -85,6 +87,7 @@ func (api *API) RegisterRoutes(r *mux.Router) {
r.HandleFunc("/api/v1/instances/{name}/nodes/discover", api.NodeDiscover).Methods("POST")
r.HandleFunc("/api/v1/instances/{name}/nodes/detect", api.NodeDetect).Methods("POST")
r.HandleFunc("/api/v1/instances/{name}/discovery", api.NodeDiscoveryStatus).Methods("GET")
r.HandleFunc("/api/v1/instances/{name}/discovery/cancel", api.NodeDiscoveryCancel).Methods("POST")
r.HandleFunc("/api/v1/instances/{name}/nodes/hardware/{ip}", api.NodeHardware).Methods("GET")
r.HandleFunc("/api/v1/instances/{name}/nodes/fetch-templates", api.NodeFetchTemplates).Methods("POST")
r.HandleFunc("/api/v1/instances/{name}/nodes", api.NodeAdd).Methods("POST")

View File

@@ -46,15 +46,15 @@ func (api *API) ClusterGenerateConfig(w http.ResponseWriter, r *http.Request) {
}
// Create cluster config
config := cluster.ClusterConfig{
clusterConfig := cluster.ClusterConfig{
ClusterName: clusterName,
VIP: vip,
Version: version,
}
// Generate configuration
clusterMgr := cluster.NewManager(api.dataDir)
if err := clusterMgr.GenerateConfig(instanceName, &config); err != nil {
clusterMgr := cluster.NewManager(api.dataDir, api.opsMgr)
if err := clusterMgr.GenerateConfig(instanceName, &clusterConfig); err != nil {
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to generate config: %v", err))
return
}
@@ -90,26 +90,14 @@ func (api *API) ClusterBootstrap(w http.ResponseWriter, r *http.Request) {
return
}
// Start bootstrap operation
opsMgr := operations.NewManager(api.dataDir)
opID, err := opsMgr.Start(instanceName, "bootstrap", req.Node)
// Bootstrap with progress tracking
clusterMgr := cluster.NewManager(api.dataDir, api.opsMgr)
opID, err := clusterMgr.Bootstrap(instanceName, req.Node)
if err != nil {
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to start operation: %v", err))
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to start bootstrap: %v", err))
return
}
// Bootstrap in background
go func() {
clusterMgr := cluster.NewManager(api.dataDir)
_ = opsMgr.UpdateStatus(instanceName, opID, "running")
if err := clusterMgr.Bootstrap(instanceName, req.Node); err != nil {
_ = opsMgr.Update(instanceName, opID, "failed", err.Error(), 0)
} else {
_ = opsMgr.Update(instanceName, opID, "completed", "Bootstrap completed", 100)
}
}()
respondJSON(w, http.StatusAccepted, map[string]string{
"operation_id": opID,
"message": "Bootstrap initiated",
@@ -138,7 +126,7 @@ func (api *API) ClusterConfigureEndpoints(w http.ResponseWriter, r *http.Request
}
// Configure endpoints
clusterMgr := cluster.NewManager(api.dataDir)
clusterMgr := cluster.NewManager(api.dataDir, api.opsMgr)
if err := clusterMgr.ConfigureEndpoints(instanceName, req.IncludeNodes); err != nil {
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to configure endpoints: %v", err))
return
@@ -161,7 +149,7 @@ func (api *API) ClusterGetStatus(w http.ResponseWriter, r *http.Request) {
}
// Get status
clusterMgr := cluster.NewManager(api.dataDir)
clusterMgr := cluster.NewManager(api.dataDir, api.opsMgr)
status, err := clusterMgr.GetStatus(instanceName)
if err != nil {
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to get status: %v", err))
@@ -183,7 +171,7 @@ func (api *API) ClusterHealth(w http.ResponseWriter, r *http.Request) {
}
// Get health checks
clusterMgr := cluster.NewManager(api.dataDir)
clusterMgr := cluster.NewManager(api.dataDir, api.opsMgr)
checks, err := clusterMgr.Health(instanceName)
if err != nil {
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to get health: %v", err))
@@ -219,7 +207,7 @@ func (api *API) ClusterGetKubeconfig(w http.ResponseWriter, r *http.Request) {
}
// Get kubeconfig
clusterMgr := cluster.NewManager(api.dataDir)
clusterMgr := cluster.NewManager(api.dataDir, api.opsMgr)
kubeconfig, err := clusterMgr.GetKubeconfig(instanceName)
if err != nil {
respondError(w, http.StatusNotFound, fmt.Sprintf("Kubeconfig not found: %v", err))
@@ -243,7 +231,7 @@ func (api *API) ClusterGenerateKubeconfig(w http.ResponseWriter, r *http.Request
}
// Regenerate kubeconfig from cluster
clusterMgr := cluster.NewManager(api.dataDir)
clusterMgr := cluster.NewManager(api.dataDir, api.opsMgr)
if err := clusterMgr.RegenerateKubeconfig(instanceName); err != nil {
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to generate kubeconfig: %v", err))
return
@@ -266,7 +254,7 @@ func (api *API) ClusterGetTalosconfig(w http.ResponseWriter, r *http.Request) {
}
// Get talosconfig
clusterMgr := cluster.NewManager(api.dataDir)
clusterMgr := cluster.NewManager(api.dataDir, api.opsMgr)
talosconfig, err := clusterMgr.GetTalosconfig(instanceName)
if err != nil {
respondError(w, http.StatusNotFound, fmt.Sprintf("Talosconfig not found: %v", err))
@@ -314,7 +302,7 @@ func (api *API) ClusterReset(w http.ResponseWriter, r *http.Request) {
// Reset in background
go func() {
clusterMgr := cluster.NewManager(api.dataDir)
clusterMgr := cluster.NewManager(api.dataDir, api.opsMgr)
_ = opsMgr.UpdateStatus(instanceName, opID, "running")
if err := clusterMgr.Reset(instanceName, req.Confirm); err != nil {

View File

@@ -12,6 +12,7 @@ import (
)
// NodeDiscover initiates node discovery
// Accepts optional subnet parameter. If no subnet provided, auto-detects local networks.
func (api *API) NodeDiscover(w http.ResponseWriter, r *http.Request) {
vars := mux.Vars(r)
instanceName := vars["name"]
@@ -22,10 +23,9 @@ func (api *API) NodeDiscover(w http.ResponseWriter, r *http.Request) {
return
}
// Parse request body - support both subnet and ip_list formats
// Parse request body - only subnet is supported
var req struct {
Subnet string `json:"subnet"`
IPList []string `json:"ip_list"`
Subnet string `json:"subnet,omitempty"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
@@ -33,16 +33,38 @@ func (api *API) NodeDiscover(w http.ResponseWriter, r *http.Request) {
return
}
// If subnet provided, use it as a single "IP" for discovery
// The discovery manager will scan this subnet
// Build IP list
var ipList []string
var err error
if req.Subnet != "" {
ipList = []string{req.Subnet}
} else if len(req.IPList) > 0 {
ipList = req.IPList
// Expand provided CIDR notation to individual IPs
ipList, err = discovery.ExpandSubnet(req.Subnet)
if err != nil {
respondError(w, http.StatusBadRequest, fmt.Sprintf("Invalid subnet: %v", err))
return
}
} else {
respondError(w, http.StatusBadRequest, "subnet or ip_list is required")
return
// Auto-detect: Get local networks when no subnet provided
networks, err := discovery.GetLocalNetworks()
if err != nil {
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to detect local networks: %v", err))
return
}
if len(networks) == 0 {
respondError(w, http.StatusNotFound, "No local networks found")
return
}
// Expand all detected networks
for _, network := range networks {
ips, err := discovery.ExpandSubnet(network)
if err != nil {
continue // Skip invalid networks
}
ipList = append(ipList, ips...)
}
}
// Start discovery
@@ -52,9 +74,10 @@ func (api *API) NodeDiscover(w http.ResponseWriter, r *http.Request) {
return
}
respondJSON(w, http.StatusAccepted, map[string]string{
"message": "Discovery started",
"status": "running",
respondJSON(w, http.StatusAccepted, map[string]interface{}{
"message": "Discovery started",
"status": "running",
"ips_to_scan": len(ipList),
})
}
@@ -92,7 +115,7 @@ func (api *API) NodeHardware(w http.ResponseWriter, r *http.Request) {
}
// Detect hardware
nodeMgr := node.NewManager(api.dataDir)
nodeMgr := node.NewManager(api.dataDir, instanceName)
hwInfo, err := nodeMgr.DetectHardware(nodeIP)
if err != nil {
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to detect hardware: %v", err))
@@ -103,6 +126,7 @@ func (api *API) NodeHardware(w http.ResponseWriter, r *http.Request) {
}
// NodeDetect detects hardware on a single node (POST with IP in body)
// IP address is required.
func (api *API) NodeDetect(w http.ResponseWriter, r *http.Request) {
vars := mux.Vars(r)
instanceName := vars["name"]
@@ -123,13 +147,14 @@ func (api *API) NodeDetect(w http.ResponseWriter, r *http.Request) {
return
}
// Validate IP is provided
if req.IP == "" {
respondError(w, http.StatusBadRequest, "ip is required")
respondError(w, http.StatusBadRequest, "IP address is required")
return
}
// Detect hardware
nodeMgr := node.NewManager(api.dataDir)
// Detect hardware for specific IP
nodeMgr := node.NewManager(api.dataDir, instanceName)
hwInfo, err := nodeMgr.DetectHardware(req.IP)
if err != nil {
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to detect hardware: %v", err))
@@ -158,7 +183,7 @@ func (api *API) NodeAdd(w http.ResponseWriter, r *http.Request) {
}
// Add node
nodeMgr := node.NewManager(api.dataDir)
nodeMgr := node.NewManager(api.dataDir, instanceName)
if err := nodeMgr.Add(instanceName, &nodeData); err != nil {
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to add node: %v", err))
return
@@ -182,7 +207,7 @@ func (api *API) NodeList(w http.ResponseWriter, r *http.Request) {
}
// List nodes
nodeMgr := node.NewManager(api.dataDir)
nodeMgr := node.NewManager(api.dataDir, instanceName)
nodes, err := nodeMgr.List(instanceName)
if err != nil {
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to list nodes: %v", err))
@@ -207,7 +232,7 @@ func (api *API) NodeGet(w http.ResponseWriter, r *http.Request) {
}
// Get node
nodeMgr := node.NewManager(api.dataDir)
nodeMgr := node.NewManager(api.dataDir, instanceName)
nodeData, err := nodeMgr.Get(instanceName, nodeIdentifier)
if err != nil {
respondError(w, http.StatusNotFound, fmt.Sprintf("Node not found: %v", err))
@@ -233,7 +258,7 @@ func (api *API) NodeApply(w http.ResponseWriter, r *http.Request) {
opts := node.ApplyOptions{}
// Apply node configuration
nodeMgr := node.NewManager(api.dataDir)
nodeMgr := node.NewManager(api.dataDir, instanceName)
if err := nodeMgr.Apply(instanceName, nodeIdentifier, opts); err != nil {
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to apply node configuration: %v", err))
return
@@ -265,7 +290,7 @@ func (api *API) NodeUpdate(w http.ResponseWriter, r *http.Request) {
}
// Update node
nodeMgr := node.NewManager(api.dataDir)
nodeMgr := node.NewManager(api.dataDir, instanceName)
if err := nodeMgr.Update(instanceName, nodeIdentifier, updates); err != nil {
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to update node: %v", err))
return
@@ -289,7 +314,7 @@ func (api *API) NodeFetchTemplates(w http.ResponseWriter, r *http.Request) {
}
// Fetch templates
nodeMgr := node.NewManager(api.dataDir)
nodeMgr := node.NewManager(api.dataDir, instanceName)
if err := nodeMgr.FetchTemplates(instanceName); err != nil {
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to fetch templates: %v", err))
return
@@ -313,7 +338,7 @@ func (api *API) NodeDelete(w http.ResponseWriter, r *http.Request) {
}
// Delete node
nodeMgr := node.NewManager(api.dataDir)
nodeMgr := node.NewManager(api.dataDir, instanceName)
if err := nodeMgr.Delete(instanceName, nodeIdentifier); err != nil {
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to delete node: %v", err))
return
@@ -323,3 +348,26 @@ func (api *API) NodeDelete(w http.ResponseWriter, r *http.Request) {
"message": "Node deleted successfully",
})
}
// NodeDiscoveryCancel cancels an in-progress discovery operation
func (api *API) NodeDiscoveryCancel(w http.ResponseWriter, r *http.Request) {
vars := mux.Vars(r)
instanceName := vars["name"]
// Validate instance exists
if err := api.instance.ValidateInstance(instanceName); err != nil {
respondError(w, http.StatusNotFound, fmt.Sprintf("Instance not found: %v", err))
return
}
// Cancel discovery
discoveryMgr := discovery.NewManager(api.dataDir, instanceName)
if err := discoveryMgr.CancelDiscovery(instanceName); err != nil {
respondError(w, http.StatusBadRequest, fmt.Sprintf("Failed to cancel discovery: %v", err))
return
}
respondJSON(w, http.StatusOK, map[string]string{
"message": "Discovery cancelled successfully",
})
}