Node delete should reset.

This commit is contained in:
2025-11-09 00:15:36 +00:00
parent 1271eebf38
commit c8fd702d1b
3 changed files with 75 additions and 16 deletions

View File

@@ -4,6 +4,7 @@ import (
"encoding/json"
"fmt"
"net/http"
"strings"
"github.com/gorilla/mux"
@@ -326,6 +327,7 @@ func (api *API) NodeFetchTemplates(w http.ResponseWriter, r *http.Request) {
}
// NodeDelete removes a node
// Query parameter: skip_reset=true to force delete without resetting
func (api *API) NodeDelete(w http.ResponseWriter, r *http.Request) {
vars := mux.Vars(r)
instanceName := vars["name"]
@@ -337,15 +339,29 @@ func (api *API) NodeDelete(w http.ResponseWriter, r *http.Request) {
return
}
// Delete node
// Parse skip_reset query parameter (default: false)
skipReset := r.URL.Query().Get("skip_reset") == "true"
// Delete node (with reset unless skipReset=true)
nodeMgr := node.NewManager(api.dataDir, instanceName)
if err := nodeMgr.Delete(instanceName, nodeIdentifier); err != nil {
if err := nodeMgr.Delete(instanceName, nodeIdentifier, skipReset); err != nil {
// Check if it's a reset-related error
errMsg := err.Error()
if !skipReset && (strings.Contains(errMsg, "reset") || strings.Contains(errMsg, "timed out")) {
respondError(w, http.StatusConflict, errMsg)
return
}
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to delete node: %v", err))
return
}
message := "Node deleted successfully"
if !skipReset {
message = "Node reset and removed successfully"
}
respondJSON(w, http.StatusOK, map[string]string{
"message": "Node deleted successfully",
"message": message,
})
}

View File

@@ -49,14 +49,14 @@ type NodeStatus struct {
// ClusterStatus represents cluster health and status
type ClusterStatus struct {
Status string `json:"status"` // ready, pending, error
Nodes int `json:"nodes"`
ControlPlaneNodes int `json:"control_plane_nodes"`
WorkerNodes int `json:"worker_nodes"`
KubernetesVersion string `json:"kubernetes_version"`
TalosVersion string `json:"talos_version"`
Services map[string]string `json:"services"`
NodeStatuses map[string]NodeStatus `json:"node_statuses,omitempty"`
Status string `json:"status"` // ready, pending, error
Nodes int `json:"nodes"`
ControlPlaneNodes int `json:"control_plane_nodes"`
WorkerNodes int `json:"worker_nodes"`
KubernetesVersion string `json:"kubernetes_version"`
TalosVersion string `json:"talos_version"`
Services map[string]string `json:"services"`
NodeStatuses map[string]NodeStatus `json:"node_statuses,omitempty"`
}
// GetTalosDir returns the talos directory for an instance

View File

@@ -1,11 +1,13 @@
package node
import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"github.com/wild-cloud/wild-central/daemon/internal/config"
"github.com/wild-cloud/wild-central/daemon/internal/setup"
@@ -254,25 +256,53 @@ func (m *Manager) Add(instanceName string, node *Node) error {
}
// Delete removes a node from config.yaml
func (m *Manager) Delete(instanceName, nodeIdentifier string) error {
// If skipReset is false, the node will be reset before deletion (with 30s timeout)
func (m *Manager) Delete(instanceName, nodeIdentifier string, skipReset bool) error {
// Get node to find hostname
node, err := m.Get(instanceName, nodeIdentifier)
if err != nil {
return err
}
// Reset node first unless skipReset is true
if !skipReset {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
// Use goroutine to respect context timeout
done := make(chan error, 1)
go func() {
done <- m.Reset(instanceName, nodeIdentifier)
}()
select {
case err := <-done:
if err != nil {
return fmt.Errorf("failed to reset node before deletion (use skip_reset=true to force delete): %w", err)
}
case <-ctx.Done():
return fmt.Errorf("node reset timed out after 30 seconds (use skip_reset=true to force delete)")
}
}
// Delete node from config.yaml
return m.deleteFromConfig(instanceName, node.Hostname)
}
// deleteFromConfig removes a node entry from config.yaml
func (m *Manager) deleteFromConfig(instanceName, hostname string) error {
instancePath := m.GetInstancePath(instanceName)
configPath := filepath.Join(instancePath, "config.yaml")
// Delete node from config.yaml
// Path: .cluster.nodes.active["hostname"]
// Use bracket notation to safely handle hostnames with special characters
nodePath := fmt.Sprintf(".cluster.nodes.active[\"%s\"]", node.Hostname)
nodePath := fmt.Sprintf(".cluster.nodes.active[\"%s\"]", hostname)
yq := tools.NewYQ()
// Use yq to delete the node
delExpr := fmt.Sprintf("del(%s)", nodePath)
_, err = yq.Exec("eval", "-i", delExpr, configPath)
_, err := yq.Exec("eval", "-i", delExpr, configPath)
if err != nil {
return fmt.Errorf("failed to delete node: %w", err)
}
@@ -700,10 +730,18 @@ func (m *Manager) Reset(instanceName, nodeIdentifier string) error {
cmd := exec.Command("talosctl", "-n", resetIP, "--talosconfig", talosconfigPath, "reset", "--graceful=false", "--reboot")
output, err := cmd.CombinedOutput()
if err != nil {
return fmt.Errorf("failed to reset node: %w\nOutput: %s", err, string(output))
// Check if error is due to node rebooting (expected after reset command)
outputStr := string(output)
if strings.Contains(outputStr, "connection refused") || strings.Contains(outputStr, "Unavailable") {
// This is expected - node is rebooting after successful reset
// Continue with config cleanup
} else {
// Real error - return it
return fmt.Errorf("failed to reset node: %w\nOutput: %s", err, outputStr)
}
}
// Update node status to maintenance mode
// Update node status to maintenance mode, then remove from config
node.Maintenance = true
node.Configured = false
node.Applied = false
@@ -711,5 +749,10 @@ func (m *Manager) Reset(instanceName, nodeIdentifier string) error {
return fmt.Errorf("failed to update node status: %w", err)
}
// Remove node from config.yaml after successful reset
if err := m.deleteFromConfig(instanceName, node.Hostname); err != nil {
return fmt.Errorf("failed to remove node from config: %w", err)
}
return nil
}