From c8fd702d1bc4431f90ce5df416a27a914edd5076 Mon Sep 17 00:00:00 2001 From: Paul Payne Date: Sun, 9 Nov 2025 00:15:36 +0000 Subject: [PATCH] Node delete should reset. --- internal/api/v1/handlers_node.go | 22 +++++++++++-- internal/cluster/cluster.go | 16 +++++----- internal/node/node.go | 53 +++++++++++++++++++++++++++++--- 3 files changed, 75 insertions(+), 16 deletions(-) diff --git a/internal/api/v1/handlers_node.go b/internal/api/v1/handlers_node.go index f362cf9..ce409bd 100644 --- a/internal/api/v1/handlers_node.go +++ b/internal/api/v1/handlers_node.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "net/http" + "strings" "github.com/gorilla/mux" @@ -326,6 +327,7 @@ func (api *API) NodeFetchTemplates(w http.ResponseWriter, r *http.Request) { } // NodeDelete removes a node +// Query parameter: skip_reset=true to force delete without resetting func (api *API) NodeDelete(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) instanceName := vars["name"] @@ -337,15 +339,29 @@ func (api *API) NodeDelete(w http.ResponseWriter, r *http.Request) { return } - // Delete node + // Parse skip_reset query parameter (default: false) + skipReset := r.URL.Query().Get("skip_reset") == "true" + + // Delete node (with reset unless skipReset=true) nodeMgr := node.NewManager(api.dataDir, instanceName) - if err := nodeMgr.Delete(instanceName, nodeIdentifier); err != nil { + if err := nodeMgr.Delete(instanceName, nodeIdentifier, skipReset); err != nil { + // Check if it's a reset-related error + errMsg := err.Error() + if !skipReset && (strings.Contains(errMsg, "reset") || strings.Contains(errMsg, "timed out")) { + respondError(w, http.StatusConflict, errMsg) + return + } respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to delete node: %v", err)) return } + message := "Node deleted successfully" + if !skipReset { + message = "Node reset and removed successfully" + } + respondJSON(w, http.StatusOK, map[string]string{ - "message": "Node deleted 
successfully", + "message": message, }) } diff --git a/internal/cluster/cluster.go b/internal/cluster/cluster.go index a9bc102..0e2419b 100644 --- a/internal/cluster/cluster.go +++ b/internal/cluster/cluster.go @@ -49,14 +49,14 @@ type NodeStatus struct { // ClusterStatus represents cluster health and status type ClusterStatus struct { - Status string `json:"status"` // ready, pending, error - Nodes int `json:"nodes"` - ControlPlaneNodes int `json:"control_plane_nodes"` - WorkerNodes int `json:"worker_nodes"` - KubernetesVersion string `json:"kubernetes_version"` - TalosVersion string `json:"talos_version"` - Services map[string]string `json:"services"` - NodeStatuses map[string]NodeStatus `json:"node_statuses,omitempty"` + Status string `json:"status"` // ready, pending, error + Nodes int `json:"nodes"` + ControlPlaneNodes int `json:"control_plane_nodes"` + WorkerNodes int `json:"worker_nodes"` + KubernetesVersion string `json:"kubernetes_version"` + TalosVersion string `json:"talos_version"` + Services map[string]string `json:"services"` + NodeStatuses map[string]NodeStatus `json:"node_statuses,omitempty"` } // GetTalosDir returns the talos directory for an instance diff --git a/internal/node/node.go b/internal/node/node.go index 80006a2..bb96415 100644 --- a/internal/node/node.go +++ b/internal/node/node.go @@ -1,11 +1,13 @@ package node import ( + "context" "fmt" "os" "os/exec" "path/filepath" "strings" + "time" "github.com/wild-cloud/wild-central/daemon/internal/config" "github.com/wild-cloud/wild-central/daemon/internal/setup" @@ -254,25 +256,53 @@ func (m *Manager) Add(instanceName string, node *Node) error { } // Delete removes a node from config.yaml -func (m *Manager) Delete(instanceName, nodeIdentifier string) error { +// If skipReset is false, the node will be reset before deletion (with 30s timeout) +func (m *Manager) Delete(instanceName, nodeIdentifier string, skipReset bool) error { // Get node to find hostname node, err := m.Get(instanceName, 
nodeIdentifier) if err != nil { return err } + // Reset node first unless skipReset is true + if !skipReset { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + // Run Reset in a goroutine so this call can stop waiting after the timeout; Reset takes no context, so it is not cancelled and may still finish in the background + done := make(chan error, 1) + go func() { + done <- m.Reset(instanceName, nodeIdentifier) + }() + + select { + case err := <-done: + if err != nil { + return fmt.Errorf("failed to reset node before deletion (use skip_reset=true to force delete): %w", err) + } + case <-ctx.Done(): + return fmt.Errorf("node reset timed out after 30 seconds (use skip_reset=true to force delete)") + } + } + + // Delete node from config.yaml + return m.deleteFromConfig(instanceName, node.Hostname) +} + +// deleteFromConfig removes a node entry from config.yaml +func (m *Manager) deleteFromConfig(instanceName, hostname string) error { instancePath := m.GetInstancePath(instanceName) configPath := filepath.Join(instancePath, "config.yaml") // Delete node from config.yaml // Path: .cluster.nodes.active["hostname"] // Use bracket notation to safely handle hostnames with special characters - nodePath := fmt.Sprintf(".cluster.nodes.active[\"%s\"]", node.Hostname) + nodePath := fmt.Sprintf(".cluster.nodes.active[\"%s\"]", hostname) yq := tools.NewYQ() // Use yq to delete the node delExpr := fmt.Sprintf("del(%s)", nodePath) - _, err = yq.Exec("eval", "-i", delExpr, configPath) + _, err := yq.Exec("eval", "-i", delExpr, configPath) if err != nil { return fmt.Errorf("failed to delete node: %w", err) } @@ -700,10 +730,18 @@ func (m *Manager) Reset(instanceName, nodeIdentifier string) error { cmd := exec.Command("talosctl", "-n", resetIP, "--talosconfig", talosconfigPath, "reset", "--graceful=false", "--reboot") output, err := cmd.CombinedOutput() if err != nil { - return fmt.Errorf("failed to reset node: %w\nOutput: %s", err, string(output)) + // Check if error is due to node rebooting (expected after reset command) + outputStr := string(output) + if
strings.Contains(outputStr, "connection refused") || strings.Contains(outputStr, "Unavailable") { + // This is expected - node is rebooting after successful reset + // Continue with config cleanup + } else { + // Real error - return it + return fmt.Errorf("failed to reset node: %w\nOutput: %s", err, outputStr) + } } - // Update node status to maintenance mode + // Update node status to maintenance mode, then remove from config node.Maintenance = true node.Configured = false node.Applied = false @@ -711,5 +749,10 @@ func (m *Manager) Reset(instanceName, nodeIdentifier string) error { return fmt.Errorf("failed to update node status: %w", err) } + // Remove node from config.yaml after successful reset + if err := m.deleteFromConfig(instanceName, node.Hostname); err != nil { + return fmt.Errorf("failed to remove node from config: %w", err) + } + return nil }