Adds a per-node health endpoint for node health monitoring.

Adds a node reboot endpoint that reboots a node without wiping its state.
feat: Add cluster config backup and move schedules to per-app backup pages
2026-05-25 07:35:53 +00:00 · 2026-05-25 07:26:29 +00:00 · 2026-05-24 21:54:46 +00:00 · 2026-05-24 21:54:13 +00:00 · 2026-05-24 21:54:11 +00:00 · 2026-05-24 21:52:59 +00:00
116 changed files with 6695 additions and 2089 deletions
--- a/api/.air.toml
+++ b/api/.air.toml
@@ -0,0 +1,19 @@
+root = "."
+tmp_dir = "tmp"
+
+[build]
+  bin = "./tmp/wildd"
+  cmd = "go build -o ./tmp/wildd ."
+  delay = 1000
+  exclude_dir = ["tmp", "build", "dist", "vendor"]
+  exclude_regex = ["_test.go$"]
+  include_ext = ["go", "yaml"]
+  kill_delay = "0s"
+  send_interrupt = true
+  stop_on_error = true
+
+[log]
+  time = false
+
+[misc]
+  clean_on_exit = true
--- a/api/.gitignore
+++ b/api/.gitignore
@@ -22,3 +22,6 @@ __debug*
 # Go workspace file
 go.work
 go.work.sum
+
+# Air live-reload
+tmp/
--- a/api/Makefile
+++ b/api/Makefile
@@ -29,8 +29,14 @@ build: ## Build the daemon binary
 	$(GOBUILD) $(LDFLAGS) -o $(BUILD_DIR)/$(BINARY_NAME) .

 dev: ## Run the daemon in development mode with live reloading
-	@echo "Starting $(BINARY_NAME) in development mode..."
-	$(GOCMD) run .
+	@if command -v air >/dev/null 2>&1; then \
+		echo "Starting $(BINARY_NAME) in development mode with live reloading (air)..."; \
+		air; \
+	else \
+		echo "air not found. Install it for live reloading: go install github.com/air-verse/air@latest"; \
+		echo "Starting $(BINARY_NAME) in development mode without live reloading..."; \
+		$(GOCMD) run .; \
+	fi

 test: ## Run tests
 	@echo "Running tests..."
--- a/api/README.md
+++ b/api/README.md
@@ -4,7 +4,7 @@ The Wild Central API is a lightweight service that runs on a local machine (e.g.

 ## Development

-Start the development server:
+Start the development server with live reloading:

 ```bash
 make dev
@@ -12,6 +12,14 @@ make dev

 The API will be available at `http://localhost:5055`.

+`make dev` uses [air](https://github.com/air-verse/air) to automatically rebuild and restart the server when `.go` or `.yaml` files change. Install it with:
+
+```bash
+go install github.com/air-verse/air@latest
+```
+
+If `air` is not installed, `make dev` falls back to `go run .` (no live reloading).
+
 ### Environment Variables

 - `WILD_API_DATA_DIR` - Directory for instance data (default: `/var/lib/wild-central`)
--- a/api/internal/api/v1/async.go
+++ b/api/internal/api/v1/async.go
@@ -2,7 +2,7 @@ package v1

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"net/http"

 	"github.com/wild-cloud/wild-central/daemon/internal/operations"
@@ -38,7 +38,7 @@ func (api *API) StartAsyncOperation(
 		// Always recover from panics to prevent goroutine crashes from taking down the server
 		defer func() {
 			if r := recover(); r != nil {
-				log.Printf("[ERROR] Panic in async operation %s/%s: %v", operationType, target, r)
+				slog.Error("panic in async operation", "type", operationType, "target", target, "error", r)
 				_ = opsMgr.Update(instanceName, opID, "failed", fmt.Sprintf("Internal error: %v", r), 0)
 			}
 		}()
@@ -71,7 +71,7 @@ func (api *API) StartAsyncOperationWithMessage(
 	go func() {
 		defer func() {
 			if r := recover(); r != nil {
-				log.Printf("[ERROR] Panic in async operation %s/%s: %v", operationType, target, r)
+				slog.Error("panic in async operation", "type", operationType, "target", target, "error", r)
 				_ = opsMgr.Update(instanceName, opID, "failed", fmt.Sprintf("Internal error: %v", r), 0)
 			}
 		}()
@@ -105,7 +105,7 @@ func (api *API) StartAsyncOperationWithBroadcaster(
 	go func() {
 		defer func() {
 			if r := recover(); r != nil {
-				log.Printf("[ERROR] Panic in async operation %s/%s: %v", operationType, target, r)
+				slog.Error("panic in async operation", "type", operationType, "target", target, "error", r)
 				_ = opsMgr.Update(instanceName, opID, "failed", fmt.Sprintf("Internal error: %v", r), 0)
 			}
 		}()
--- a/api/internal/api/v1/handlers.go
+++ b/api/internal/api/v1/handlers.go
@@ -3,7 +3,7 @@ package v1
 import (
 	"encoding/json"
 	"fmt"
-	"log"
+	"log/slog"
 	"net/http"
 	"os"
 	"time"
@@ -25,18 +25,18 @@ import (

 // API holds all dependencies for API handlers
 type API struct {
-	dataDir         string
-	appsDir         string // Path to external apps directory
-	config          *config.Manager
-	secrets         *secrets.Manager
-	context         *context.Manager
-	instance        *instance.Manager
-	dnsmasq         *dnsmasq.ConfigGenerator
-	opsMgr          *operations.Manager     // Operations manager
-	broadcaster     *operations.Broadcaster // SSE broadcaster for operation output
-	sseManager      *sse.Manager            // SSE manager for real-time events
-	watcherManager  *sse.WatcherManager     // Manager for kubectl/talos watchers
-	factory         *factory.Client         // Talos Image Factory client
+	dataDir        string
+	appsDir        string // Path to external apps directory
+	config         *config.Manager
+	secrets        *secrets.Manager
+	context        *context.Manager
+	instance       *instance.Manager
+	dnsmasq        *dnsmasq.ConfigGenerator
+	opsMgr         *operations.Manager     // Operations manager
+	broadcaster    *operations.Broadcaster // SSE broadcaster for operation output
+	sseManager     *sse.Manager            // SSE manager for real-time events
+	watcherManager *sse.WatcherManager     // Manager for kubectl/talos watchers
+	factory        *factory.Client         // Talos Image Factory client
 }

 // NewAPI creates a new API handler with all dependencies
@@ -59,7 +59,7 @@ func NewAPI(dataDir, appsDir string) (*API, error) {
 	dnsmasqConfigPath := "/etc/dnsmasq.d/wild-cloud.conf"
 	if os.Getenv("WILD_API_DNSMASQ_CONFIG_PATH") != "" {
 		dnsmasqConfigPath = os.Getenv("WILD_API_DNSMASQ_CONFIG_PATH")
-		log.Printf("Using custom dnsmasq config path: %s", dnsmasqConfigPath)
+		slog.Info("using custom dnsmasq config path", "path", dnsmasqConfigPath)
 	}

 	// Create SSE manager for real-time events
@@ -73,24 +73,23 @@ func NewAPI(dataDir, appsDir string) (*API, error) {
 	opsMgr.SetSSEManager(adapter)

 	api := &API{
-		dataDir:         dataDir,
-		appsDir:         appsDir,
-		config:          configMgr,
-		secrets:         secrets.NewManager(),
-		context:         context.NewManager(dataDir),
-		instance:        instance.NewManager(dataDir),
-		dnsmasq:         dnsmasq.NewConfigGenerator(dnsmasqConfigPath),
-		opsMgr:          opsMgr,
-		broadcaster:     operations.NewBroadcaster(),
-		sseManager:      sseManager,
-		watcherManager:  watcherManager,
-		factory:         factory.NewClient(),
+		dataDir:        dataDir,
+		appsDir:        appsDir,
+		config:         configMgr,
+		secrets:        secrets.NewManager(),
+		context:        context.NewManager(dataDir),
+		instance:       instance.NewManager(dataDir),
+		dnsmasq:        dnsmasq.NewConfigGenerator(dnsmasqConfigPath),
+		opsMgr:         opsMgr,
+		broadcaster:    operations.NewBroadcaster(),
+		sseManager:     sseManager,
+		watcherManager: watcherManager,
+		factory:        factory.NewClient(),
 	}

 	return api, nil
 }

-
 // StartCentralStatusBroadcaster starts periodic broadcasting of central status
 func (api *API) StartCentralStatusBroadcaster(startTime time.Time) {
 	go func() {
@@ -107,6 +106,8 @@ func (api *API) StartCentralStatusBroadcaster(startTime time.Time) {
 }

 func (api *API) RegisterRoutes(r *mux.Router) {
+	// Request logging middleware (runs first, wraps everything)
+	r.Use(RequestLoggingMiddleware)
 	// Apply instance validation middleware to all routes with {name} parameter
 	r.Use(api.ValidateInstanceMiddleware)

@@ -145,6 +146,8 @@ func (api *API) RegisterRoutes(r *mux.Router) {
 	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}", api.NodeGet).Methods("GET")
 	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}", api.NodeUpdate).Methods("PUT")
 	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/apply", api.NodeApply).Methods("POST")
+	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/health", api.NodeHealth).Methods("GET")
+	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/reboot", api.NodeReboot).Methods("POST")
 	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/reset", api.NodeReset).Methods("POST")
 	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/upgrade", api.NodeUpgrade).Methods("POST")
 	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/rollback", api.NodeRollback).Methods("POST")
@@ -229,6 +232,11 @@ func (api *API) RegisterRoutes(r *mux.Router) {
 	r.HandleFunc("/api/v1/instances/{name}/apps/{app}/restore/cleanup", api.BackupAppCleanup).Methods("POST")
 	r.HandleFunc("/api/v1/instances/{name}/apps/{app}/restore/plan", api.BackupAppRecoveryPlan).Methods("GET")

+	// Backup & Restore - Cluster Config
+	r.HandleFunc("/api/v1/instances/{name}/backup/cluster", api.BackupClusterStart).Methods("POST")
+	r.HandleFunc("/api/v1/instances/{name}/backup/cluster", api.BackupClusterList).Methods("GET")
+	r.HandleFunc("/api/v1/instances/{name}/backup/cluster/{timestamp}", api.BackupClusterDelete).Methods("DELETE")
+
 	// Backup Schedules
 	r.HandleFunc("/api/v1/instances/{name}/backup/schedules", api.BackupScheduleList).Methods("GET")
 	r.HandleFunc("/api/v1/instances/{name}/backup/schedules", api.BackupScheduleCreate).Methods("POST")
@@ -236,8 +244,10 @@ func (api *API) RegisterRoutes(r *mux.Router) {
 	r.HandleFunc("/api/v1/instances/{name}/backup/schedules/{scheduleId}", api.BackupScheduleDelete).Methods("DELETE")
 	r.HandleFunc("/api/v1/instances/{name}/backup/schedules/{scheduleId}/run", api.BackupScheduleRun).Methods("POST")

-	// Backup Health
+	// Backup Health & Configuration
 	r.HandleFunc("/api/v1/instances/{name}/backup/health", api.BackupHealth).Methods("GET")
+	r.HandleFunc("/api/v1/instances/{name}/backup/config", api.BackupConfigGet).Methods("GET")
+	r.HandleFunc("/api/v1/instances/{name}/backup/config", api.BackupConfigUpdate).Methods("PUT")

 	// Global Configuration
 	r.HandleFunc("/api/v1/config", api.GetGlobalConfig).Methods("GET")
@@ -299,7 +309,7 @@ func (api *API) CreateInstance(w http.ResponseWriter, r *http.Request) {
 	}

 	if err := api.updateDnsmasqForAllInstances(); err != nil {
-		log.Printf("Warning: Could not update dnsmasq configuration: %v", err)
+		slog.Error("dnsmasq config update failed", "error", err)
 		response["warning"] = fmt.Sprintf("dnsmasq update failed: %v. Use POST /api/v1/dnsmasq/update to retry.", err)
 	}

@@ -387,7 +397,7 @@ func (api *API) GetConfig(w http.ResponseWriter, r *http.Request) {
 		// Return raw YAML
 		w.Header().Set("Content-Type", "application/yaml")
 		w.WriteHeader(http.StatusOK)
-		w.Write(configData)
+		_, _ = w.Write(configData)
 		return
 	}

--- a/api/internal/api/v1/handlers_apps.go
+++ b/api/internal/api/v1/handlers_apps.go
@@ -385,7 +385,7 @@ func (api *API) AppsGetReadme(w http.ResponseWriter, r *http.Request) {
 	content, err := os.ReadFile(instancePath)
 	if err == nil {
 		w.Header().Set("Content-Type", "text/markdown; charset=utf-8")
-		w.Write(content)
+		_, _ = w.Write(content)
 		return
 	}

@@ -402,7 +402,7 @@ func (api *API) AppsGetReadme(w http.ResponseWriter, r *http.Request) {
 	}

 	w.Header().Set("Content-Type", "text/markdown; charset=utf-8")
-	w.Write(content)
+	_, _ = w.Write(content)
 }

 // AppsGetManifest returns the manifest for an available app
@@ -440,7 +440,7 @@ func (api *API) AppsGetAvailableReadme(w http.ResponseWriter, r *http.Request) {
 	}

 	w.Header().Set("Content-Type", "text/markdown; charset=utf-8")
-	w.Write(content)
+	_, _ = w.Write(content)
 }

 // AppsCompile recompiles an app's templates
@@ -487,7 +487,7 @@ func (api *API) AppsGetManifests(w http.ResponseWriter, r *http.Request) {
 	var manifest apps.AppManifest
 	manifestPath := filepath.Join(appDir, "manifest.yaml")
 	if data, err := os.ReadFile(manifestPath); err == nil {
-		yaml.Unmarshal(data, &manifest)
+		_ = yaml.Unmarshal(data, &manifest)
 	}

 	// Build list of kustomize directories to render
@@ -523,7 +523,7 @@ func (api *API) AppsGetManifests(w http.ResponseWriter, r *http.Request) {
 	}

 	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
-	w.Write(allOutput)
+	_, _ = w.Write(allOutput)
 }

 // AppsRunScript runs a named script defined in the app's manifest
--- a/api/internal/api/v1/handlers_backup.go
+++ b/api/internal/api/v1/handlers_backup.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
+	"log/slog"
 	"net/http"
 	"os"
 	"os/exec"
@@ -370,7 +371,7 @@ func (api *API) BackupAppDelete(w http.ResponseWriter, r *http.Request) {
 				"app": appName,
 			},
 		})
-		respondError(w, http.StatusInternalServerError, "Failed to delete backup")
+		respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to delete backup: %v", err))
 		return
 	}

@@ -454,12 +455,12 @@ func (api *API) BackupAppVerify(w http.ResponseWriter, r *http.Request) {

 // BackupResourceInfo contains information about a discovered backup resource
 type BackupResourceInfo struct {
-	Name         string                 `json:"name"`
-	Type         string                 `json:"type"`  // "database", "pvc", "secret"
-	Plugin       string                 `json:"plugin"` // "postgres", "mysql", "longhorn-pvc", etc.
+	Name         string         `json:"name"`
+	Type         string         `json:"type"`   // "database", "pvc", "secret"
+	Plugin       string         `json:"plugin"` // "postgres", "mysql", "longhorn-pvc", etc.
 	Source       map[string]any `json:"source"` // Resource-specific info
-	ShouldBackup bool                   `json:"shouldBackup"`
-	Reason       string                 `json:"reason,omitempty"` // Why it's included/excluded
+	ShouldBackup bool           `json:"shouldBackup"`
+	Reason       string         `json:"reason,omitempty"` // Why it's included/excluded
 }

 // BackupAppDiscoverResources auto-discovers backup resources for an app
@@ -590,9 +591,9 @@ func parsePVC(pvc map[string]any) BackupResourceInfo {
 	}

 	return BackupResourceInfo{
-		Name:         name,
-		Type:         "pvc",
-		Plugin:       plugin,
+		Name:   name,
+		Type:   "pvc",
+		Plugin: plugin,
 		Source: map[string]any{
 			"pvcName":      name,
 			"storageClass": storageClass,
@@ -635,9 +636,9 @@ func parseVolumeClaimTemplate(vct map[string]any, statefulSetName string) Backup
 	}

 	return BackupResourceInfo{
-		Name:         pvcName,
-		Type:         "pvc",
-		Plugin:       detectStoragePlugin(storageClass),
+		Name:   pvcName,
+		Type:   "pvc",
+		Plugin: detectStoragePlugin(storageClass),
 		Source: map[string]any{
 			"pvcName":      pvcName,
 			"storageClass": storageClass,
@@ -684,7 +685,7 @@ func discoverDatabases(dataDir, instanceName, appName, manifestPath string) []Ba
 		configPath := tools.GetInstanceConfigPath(dataDir, instanceName)
 		configData, _ := os.ReadFile(configPath)
 		var config map[string]any
-		yaml.Unmarshal(configData, &config)
+		_ = yaml.Unmarshal(configData, &config)

 		appConfig := map[string]any{}
 		if apps, ok := config["apps"].(map[string]any); ok {
@@ -998,13 +999,18 @@ func (api *API) BackupScheduleRun(w http.ResponseWriter, r *http.Request) {
 		return
 	}

-	if sched.TargetType != "app" {
-		respondError(w, http.StatusBadRequest, "Only app schedules can be triggered manually")
+	if sched.TargetType != "app" && sched.TargetType != "cluster" {
+		respondError(w, http.StatusBadRequest, "Unsupported schedule target type")
 		return
 	}

+	opTarget := sched.TargetName
+	if sched.TargetType == "cluster" {
+		opTarget = "_cluster"
+	}
+
 	// Run as async operation
-	api.StartAsyncOperation(w, instanceName, "backup", sched.TargetName,
+	api.StartAsyncOperation(w, instanceName, "backup", opTarget,
 		func(opsMgr *operations.Manager, opID string) error {
 			_ = opsMgr.UpdateProgress(instanceName, opID, 10, "Starting scheduled backup")

@@ -1013,7 +1019,13 @@ func (api *API) BackupScheduleRun(w http.ResponseWriter, r *http.Request) {
 			}

 			mgr := backup.NewManagerWithProgress(api.dataDir, progressCallback)
-			_, err := mgr.BackupApp(instanceName, sched.TargetName)
+
+			var err error
+			if sched.TargetType == "cluster" {
+				_, err = mgr.BackupClusterConfig(instanceName)
+			} else {
+				_, err = mgr.BackupApp(instanceName, sched.TargetName)
+			}

 			if err == nil {
 				// Update lastRun and nextRun
@@ -1021,26 +1033,28 @@ func (api *API) BackupScheduleRun(w http.ResponseWriter, r *http.Request) {
 				sched.LastRun = &now
 				next := backup.ComputeNextRun(sched, now)
 				sched.NextRun = &next
-				backup.SaveInstanceBackupSchedules(api.dataDir, instanceName, config.Schedules)
+				if err := backup.SaveInstanceBackupSchedules(api.dataDir, instanceName, config.Schedules); err != nil {
+					slog.Error("failed to save backup schedules", "instance", instanceName, "error", err)
+				}

 				api.sseManager.Broadcast(&sse.Event{
 					Type:         "backup:schedule:completed",
 					InstanceName: instanceName,
 					Data: map[string]any{
 						"scheduleId": scheduleID,
-						"app":        sched.TargetName,
+						"target":     opTarget,
 					},
 				})

-				// Enforce retention using schedule's policy
+				// Enforce retention
 				keepLast, keepDays := backup.RetentionFromSchedule(sched, config.Retention)
-				deleted, retErr := backup.EnforceRetention(mgr, instanceName, sched.TargetName, keepLast, keepDays)
+				deleted, retErr := backup.EnforceRetention(mgr, instanceName, opTarget, keepLast, keepDays)
 				if retErr == nil && deleted > 0 {
 					api.sseManager.Broadcast(&sse.Event{
 						Type:         "backup:retention:completed",
 						InstanceName: instanceName,
 						Data: map[string]any{
-							"app":     sched.TargetName,
+							"target":  opTarget,
 							"deleted": deleted,
 						},
 					})
@@ -1063,6 +1077,9 @@ func (api *API) BackupHealth(w http.ResponseWriter, r *http.Request) {

 	mgr := backup.NewManager(api.dataDir)

+	// Compute default retention limit
+	defaultKeepLast, _ := backup.DefaultRetention(config.Retention)
+
 	// Get all apps with backups by scanning the backup directory
 	backupDir := mgr.GetBackupDir(instanceName)
 	appHealth := make(map[string]any)
@@ -1079,8 +1096,38 @@ func (api *API) BackupHealth(w http.ResponseWriter, r *http.Request) {
 			continue
 		}

+		// Compute total size across all backups for this app
+		var totalSize int64
+		for _, p := range plans {
+			for _, s := range p.Strategies {
+				if s.Backup != nil {
+					if size, ok := s.Backup["size"]; ok {
+						switch v := size.(type) {
+						case int64:
+							totalSize += v
+						case int:
+							totalSize += int64(v)
+						case float64:
+							totalSize += int64(v)
+						}
+					}
+				}
+			}
+		}
+
+		// Determine retention limit for this app: the first enabled schedule with a KeepLast override wins, else the default
+		keepLast := defaultKeepLast
+		for _, sched := range config.Schedules {
+			if sched.TargetName == appName && sched.Enabled && sched.Retention != nil && sched.Retention.KeepLast > 0 {
+				keepLast = sched.Retention.KeepLast
+				break
+			}
+		}
+
 		info := map[string]any{
 			"backupCount": len(plans),
+			"retainCount": keepLast,
+			"totalSize":   totalSize,
 			"scheduled":   false,
 		}

@@ -1088,6 +1135,9 @@ func (api *API) BackupHealth(w http.ResponseWriter, r *http.Request) {
 			newest := plans[0]
 			info["lastBackup"] = newest.Timestamp
 			info["lastStatus"] = newest.Status
+			if newest.Version != "" {
+				info["lastVersion"] = newest.Version
+			}
 		}

 		// Check if this app has an active schedule
@@ -1135,3 +1185,180 @@ func (api *API) BackupHealth(w http.ResponseWriter, r *http.Request) {
 	})
 }

+// BackupConfigGet returns the current backup configuration (destination, retention, and verification)
+func (api *API) BackupConfigGet(w http.ResponseWriter, r *http.Request) {
+	instanceName := GetInstanceName(r)
+
+	config, err := backup.LoadInstanceBackupConfig(api.dataDir, instanceName)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "Failed to load backup config")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"success": true,
+		"data": map[string]any{
+			"destination":  config.Destination,
+			"retention":    config.Retention,
+			"verification": config.Verification,
+		},
+	})
+}
+
+// BackupConfigUpdate updates the backup destination and/or retention settings
+func (api *API) BackupConfigUpdate(w http.ResponseWriter, r *http.Request) {
+	instanceName := GetInstanceName(r)
+
+	var req struct {
+		Destination *backup.DestinationConfig `json:"destination"`
+		Retention   *backup.RetentionPolicy   `json:"retention"`
+	}
+
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		respondError(w, http.StatusBadRequest, "Invalid request body")
+		return
+	}
+
+	if req.Destination == nil && req.Retention == nil {
+		respondError(w, http.StatusBadRequest, "Must provide destination or retention to update")
+		return
+	}
+
+	// Validate destination type if provided
+	if req.Destination != nil {
+		switch req.Destination.Type {
+		case "local", "nfs", "s3", "azure":
+			// valid
+		default:
+			respondError(w, http.StatusBadRequest, fmt.Sprintf("Invalid destination type: %s", req.Destination.Type))
+			return
+		}
+	}
+
+	if err := backup.SaveInstanceBackupConfig(api.dataDir, instanceName, req.Destination, req.Retention); err != nil {
+		respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to save backup config: %v", err))
+		return
+	}
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"success": true,
+		"message": "Backup configuration updated",
+	})
+}
+
+// BackupClusterStart starts a cluster config backup operation
+func (api *API) BackupClusterStart(w http.ResponseWriter, r *http.Request) {
+	instanceName := GetInstanceName(r)
+
+	api.sseManager.Broadcast(&sse.Event{
+		Type:         "backup:started",
+		InstanceName: instanceName,
+		Data: map[string]any{
+			"app": "_cluster",
+		},
+	})
+
+	api.StartAsyncOperation(w, instanceName, "backup", "_cluster",
+		func(opsMgr *operations.Manager, opID string) error {
+			_ = opsMgr.UpdateProgress(instanceName, opID, 10, "Starting cluster config backup")
+
+			progressCallback := func(progress int, message string) {
+				_ = opsMgr.UpdateProgress(instanceName, opID, progress, message)
+			}
+
+			mgr := backup.NewManagerWithProgress(api.dataDir, progressCallback)
+			_, err := mgr.BackupClusterConfig(instanceName)
+
+			if err != nil {
+				api.sseManager.Broadcast(&sse.Event{
+					Type:         "backup:failed",
+					InstanceName: instanceName,
+					Data: map[string]any{
+						"app":   "_cluster",
+						"error": err.Error(),
+					},
+				})
+			} else {
+				api.sseManager.Broadcast(&sse.Event{
+					Type:         "backup:completed",
+					InstanceName: instanceName,
+					Data: map[string]any{
+						"app": "_cluster",
+					},
+				})
+
+				// Enforce retention after successful backup
+				config, configErr := backup.LoadInstanceBackupConfig(api.dataDir, instanceName)
+				if configErr == nil {
+					keepLast, keepDays := backup.DefaultRetention(config.Retention)
+					deleted, retErr := backup.EnforceRetention(mgr, instanceName, "_cluster", keepLast, keepDays)
+					if retErr == nil && deleted > 0 {
+						api.sseManager.Broadcast(&sse.Event{
+							Type:         "backup:retention:completed",
+							InstanceName: instanceName,
+							Data: map[string]any{
+								"target":  "_cluster",
+								"deleted": deleted,
+							},
+						})
+					}
+				}
+			}
+
+			return err
+		})
+}
+
+// BackupClusterList lists all cluster config backups
+func (api *API) BackupClusterList(w http.ResponseWriter, r *http.Request) {
+	instanceName := GetInstanceName(r)
+
+	mgr := backup.NewManager(api.dataDir)
+	backups, err := mgr.ListBackups(instanceName, "_cluster")
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "Failed to list cluster backups")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"success": true,
+		"data": map[string]any{
+			"backups": backups,
+		},
+	})
+}
+
+// BackupClusterDelete deletes a specific cluster config backup
+func (api *API) BackupClusterDelete(w http.ResponseWriter, r *http.Request) {
+	instanceName := GetInstanceName(r)
+	timestamp := mux.Vars(r)["timestamp"]
+
+	mgr := backup.NewManager(api.dataDir)
+	if err := mgr.DeleteAppBackup(instanceName, "_cluster", timestamp); err != nil {
+		api.sseManager.Broadcast(&sse.Event{
+			Type:         "backup:delete:failed",
+			InstanceName: instanceName,
+			Data: map[string]any{
+				"app":       "_cluster",
+				"timestamp": timestamp,
+				"error":     err.Error(),
+			},
+		})
+		respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to delete backup: %v", err))
+		return
+	}
+
+	api.sseManager.Broadcast(&sse.Event{
+		Type:         "backup:deleted",
+		InstanceName: instanceName,
+		Data: map[string]any{
+			"app":       "_cluster",
+			"timestamp": timestamp,
+		},
+	})
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"success": true,
+		"message": "Cluster config backup deleted successfully",
+	})
+}
--- a/api/internal/api/v1/handlers_backup_test.go
+++ b/api/internal/api/v1/handlers_backup_test.go
@@ -740,10 +740,10 @@ func TestIsDatabase(t *testing.T) {
 // It verifies that we can find PVCs and StatefulSet volume claims in Kubernetes manifests
 func TestDiscoverFromKustomize(t *testing.T) {
 	tests := []struct {
-		name           string
-		kustomizeYAML  string
-		expectedCount  int
-		expectedFirst  BackupResourceInfo
+		name          string
+		kustomizeYAML string
+		expectedCount int
+		expectedFirst BackupResourceInfo
 	}{
 		{
 			name: "Discovers PVC as persistent state",
@@ -1115,4 +1115,4 @@ func TestBackupAppOperations(t *testing.T) {
 			assert.Equal(t, "Backup deleted successfully", response["message"])
 		}
 	})
-}
+}
--- a/api/internal/api/v1/handlers_config.go
+++ b/api/internal/api/v1/handlers_config.go
@@ -3,6 +3,7 @@ package v1
 import (
 	"encoding/json"
 	"fmt"
+	"log/slog"
 	"net/http"

 	"github.com/wild-cloud/wild-central/daemon/internal/config"
@@ -43,6 +44,8 @@ func (api *API) ConfigUpdateBatch(w http.ResponseWriter, r *http.Request) {
 		updateCount++
 	}

+	slog.Info("config batch updated", "instance", instanceName, "keys", updateCount)
+
 	respondJSON(w, http.StatusOK, map[string]interface{}{
 		"message": "Configuration updated successfully",
 		"updated": updateCount,
@@ -87,6 +90,8 @@ func (api *API) UpdateGlobalConfig(w http.ResponseWriter, r *http.Request) {
 		return
 	}

+	slog.Info("global config updated")
+
 	respondJSON(w, http.StatusOK, map[string]interface{}{
 		"message": "Global configuration updated successfully",
 		"config":  globalCfg,
--- a/api/internal/api/v1/handlers_dnsmasq.go
+++ b/api/internal/api/v1/handlers_dnsmasq.go
@@ -3,7 +3,7 @@ package v1
 import (
 	"encoding/json"
 	"fmt"
-	"log"
+	"log/slog"
 	"net/http"
 	"os"

@@ -79,7 +79,7 @@ func (api *API) DnsmasqGenerate(w http.ResponseWriter, r *http.Request) {
 		instanceConfigPath := api.instance.GetInstanceConfigPath(name)
 		instanceCfg, err := config.LoadCloudConfig(instanceConfigPath)
 		if err != nil {
-			log.Printf("Warning: Could not load instance config for %s: %v", name, err)
+			slog.Error("failed to load instance config", "instance", name, "error", err)
 			continue
 		}
 		instanceConfigs = append(instanceConfigs, *instanceCfg)
@@ -95,7 +95,7 @@ func (api *API) DnsmasqGenerate(w http.ResponseWriter, r *http.Request) {
 		isFirstStart := err != nil || status.Status != "active"

 		// Update main dnsmasq configuration
-		log.Printf("Updating dnsmasq main configuration...")
+		slog.Info("updating dnsmasq main configuration")

 		// Write the main config
 		tempFile := api.dnsmasq.GetConfigPath() + ".tmp"
@@ -121,7 +121,7 @@ func (api *API) DnsmasqGenerate(w http.ResponseWriter, r *http.Request) {
 		// Write all instance configs
 		for i, name := range validInstanceNames {
 			if err := api.dnsmasq.WriteInstanceConfig(name, instanceConfigs[i]); err != nil {
-				log.Printf("Warning: Failed to write instance config for %s: %v", name, err)
+				slog.Error("failed to write instance DNS config", "instance", name, "error", err)
 			}
 		}

@@ -134,7 +134,7 @@ func (api *API) DnsmasqGenerate(w http.ResponseWriter, r *http.Request) {
 		// Configure system DNS to use local dnsmasq on first start
 		if isFirstStart {
 			if err := api.dnsmasq.ConfigureSystemDNS(); err != nil {
-				log.Printf("Warning: Failed to configure system DNS: %v", err)
+				slog.Error("failed to configure system DNS", "error", err)
 				// Don't fail the request - dnsmasq is still running
 			}
 		}
@@ -211,16 +211,14 @@ func (api *API) updateDnsmasqForAllInstances() error {

 	// Load all instance configs
 	var instanceConfigs []config.InstanceConfig
-	var validInstanceNames []string
 	for _, name := range instanceNames {
 		instanceConfigPath := api.instance.GetInstanceConfigPath(name)
 		instanceCfg, err := config.LoadCloudConfig(instanceConfigPath)
 		if err != nil {
-			log.Printf("Warning: Could not load instance config for %s: %v", name, err)
+			slog.Error("failed to load instance config", "instance", name, "error", err)
 			continue
 		}
 		instanceConfigs = append(instanceConfigs, *instanceCfg)
-		validInstanceNames = append(validInstanceNames, name)
 	}

 	// Regenerate and write dnsmasq config with restart
--- a/api/internal/api/v1/handlers_dnsmasq_test.go
+++ b/api/internal/api/v1/handlers_dnsmasq_test.go
@@ -42,7 +42,9 @@ func TestDnsmasqGenerate_WithoutOverwrite(t *testing.T) {
 	globalConfig.Cloud.Router.IP = "192.168.1.1"
 	configPath := filepath.Join(tmpDir, "config.yaml")
 	configData, _ := yaml.Marshal(globalConfig)
-	storage.WriteFile(configPath, configData, 0644)
+	if err := storage.WriteFile(configPath, configData, 0644); err != nil {
+		t.Fatal(err)
+	}

 	// Create test instance
 	instanceName := "test-instance"
@@ -54,7 +56,9 @@ func TestDnsmasqGenerate_WithoutOverwrite(t *testing.T) {
 	instanceConfig.Cloud.InternalDomain = "internal.test.local"
 	instanceConfigPath := api.instance.GetInstanceConfigPath(instanceName)
 	instanceConfigData, _ := yaml.Marshal(instanceConfig)
-	storage.WriteFile(instanceConfigPath, instanceConfigData, 0644)
+	if err := storage.WriteFile(instanceConfigPath, instanceConfigData, 0644); err != nil {
+		t.Fatal(err)
+	}

 	// Test generate without overwrite
 	req := httptest.NewRequest("POST", "/api/v1/dnsmasq/generate", nil)
@@ -67,7 +71,9 @@ func TestDnsmasqGenerate_WithoutOverwrite(t *testing.T) {
 	}

 	var resp map[string]interface{}
-	json.Unmarshal(w.Body.Bytes(), &resp)
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("failed to unmarshal response: %v", err)
+	}

 	// Verify response contains config
 	if config, ok := resp["config"].(string); !ok || config == "" {
@@ -90,7 +96,9 @@ func TestDnsmasqGenerate_WithOverwrite(t *testing.T) {
 	globalConfig.Cloud.Router.IP = "192.168.1.1"
 	configPath := filepath.Join(tmpDir, "config.yaml")
 	configData, _ := yaml.Marshal(globalConfig)
-	storage.WriteFile(configPath, configData, 0644)
+	if err := storage.WriteFile(configPath, configData, 0644); err != nil {
+		t.Fatal(err)
+	}

 	// Create test instance
 	instanceName := "test-instance"
@@ -103,7 +111,9 @@ func TestDnsmasqGenerate_WithOverwrite(t *testing.T) {
 	instanceConfig.Cluster.LoadBalancerIp = "192.168.1.80"
 	instanceConfigPath := api.instance.GetInstanceConfigPath(instanceName)
 	instanceConfigData, _ := yaml.Marshal(instanceConfig)
-	storage.WriteFile(instanceConfigPath, instanceConfigData, 0644)
+	if err := storage.WriteFile(instanceConfigPath, instanceConfigData, 0644); err != nil {
+		t.Fatal(err)
+	}

 	// Instead of calling the handler which would try to restart the service,
 	// directly test the UpdateConfig method with restart=false
@@ -201,8 +211,12 @@ func TestDnsmasqGetConfig(t *testing.T) {
 	// Write a config first
 	configPath := api.dnsmasq.GetConfigPath()
 	testConfig := "# Test config\ninterface=eth0\n"
-	os.MkdirAll(filepath.Dir(configPath), 0755)
-	os.WriteFile(configPath, []byte(testConfig), 0644)
+	if err := os.MkdirAll(filepath.Dir(configPath), 0755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(configPath, []byte(testConfig), 0644); err != nil {
+		t.Fatal(err)
+	}

 	req := httptest.NewRequest("GET", "/api/v1/dnsmasq/config", nil)
 	w := httptest.NewRecorder()
@@ -214,7 +228,9 @@ func TestDnsmasqGetConfig(t *testing.T) {
 	}

 	var resp map[string]interface{}
-	json.Unmarshal(w.Body.Bytes(), &resp)
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("failed to unmarshal response: %v", err)
+	}

 	content, ok := resp["content"].(string)
 	if !ok || content != testConfig {
--- a/api/internal/api/v1/handlers_node.go
+++ b/api/internal/api/v1/handlers_node.go
@@ -309,6 +309,38 @@ func (api *API) NodeDiscoveryCancel(w http.ResponseWriter, r *http.Request) {
 	})
 }

+// NodeReboot asks the node manager to reboot the addressed node without wiping its state.
+func (api *API) NodeReboot(w http.ResponseWriter, r *http.Request) {
+	instance, target := GetInstanceName(r), GetNodeName(r)
+
+	err := node.NewManager(api.dataDir, instance).Reboot(instance, target)
+	if err != nil {
+		msg := fmt.Sprintf("Failed to reboot node: %v", err)
+		respondError(w, http.StatusInternalServerError, msg)
+		return
+	}
+
+	respondJSON(w, http.StatusOK, map[string]string{
+		"message": "Node reboot initiated",
+		"node":    target,
+	})
+}
+
+// NodeHealth reports a node's health, checked via Talos service statuses and dmesg.
+func (api *API) NodeHealth(w http.ResponseWriter, r *http.Request) {
+	instance, target := GetInstanceName(r), GetNodeName(r)
+
+	report, err := node.NewManager(api.dataDir, instance).Health(instance, target)
+	if err != nil {
+		msg := fmt.Sprintf("Failed to check node health: %v", err)
+		respondError(w, http.StatusInternalServerError, msg)
+		return
+	}
+
+	// On success the manager's result is handed to respondJSON unchanged.
+	respondJSON(w, http.StatusOK, report)
+}
+
 // NodeReset resets a node to maintenance mode
 func (api *API) NodeReset(w http.ResponseWriter, r *http.Request) {
 	instanceName := GetInstanceName(r)
--- a/api/internal/api/v1/handlers_pxe.go
+++ b/api/internal/api/v1/handlers_pxe.go
@@ -3,7 +3,7 @@ package v1
 import (
 	"encoding/json"
 	"fmt"
-	"log"
+	"log/slog"
 	"net/http"

 	"github.com/gorilla/mux"
@@ -18,7 +18,7 @@ func (api *API) PXEListAssets(w http.ResponseWriter, r *http.Request) {
 	instanceName := GetInstanceName(r)

 	w.Header().Set("X-Deprecated", "This endpoint is deprecated. Use GET /api/v1/assets/{schematicId} instead.")
-	log.Printf("Warning: Deprecated endpoint /api/v1/instances/%s/pxe/assets called", instanceName)
+	slog.Info("deprecated endpoint called", "endpoint", "pxe/assets", "instance", instanceName)

 	// Get schematic ID from instance config
 	configPath := api.instance.GetInstanceConfigPath(instanceName)
@@ -49,7 +49,7 @@ func (api *API) PXEDownloadAsset(w http.ResponseWriter, r *http.Request) {
 	instanceName := GetInstanceName(r)

 	w.Header().Set("X-Deprecated", "This endpoint is deprecated. Use POST /api/v1/assets/{schematicId}/download instead.")
-	log.Printf("Warning: Deprecated endpoint /api/v1/instances/%s/pxe/assets/download called", instanceName)
+	slog.Info("deprecated endpoint called", "endpoint", "pxe/assets/download", "instance", instanceName)

 	// Parse request
 	var req struct {
@@ -123,7 +123,7 @@ func (api *API) PXEGetAsset(w http.ResponseWriter, r *http.Request) {
 	assetType := mux.Vars(r)["type"]

 	w.Header().Set("X-Deprecated", "This endpoint is deprecated. Use GET /api/v1/assets/{schematicId}/pxe/{assetType} instead.")
-	log.Printf("Warning: Deprecated endpoint /api/v1/instances/%s/pxe/assets/%s called", instanceName, assetType)
+	slog.Info("deprecated endpoint called", "endpoint", "pxe/assets/get", "instance", instanceName, "assetType", assetType)

 	// Get schematic ID from instance config
 	configPath := api.instance.GetInstanceConfigPath(instanceName)
@@ -162,7 +162,7 @@ func (api *API) PXEDeleteAsset(w http.ResponseWriter, r *http.Request) {
 	assetType := mux.Vars(r)["type"]

 	w.Header().Set("X-Deprecated", "This endpoint is deprecated. Use DELETE /api/v1/assets/{schematicId} instead.")
-	log.Printf("Warning: Deprecated endpoint DELETE /api/v1/instances/%s/pxe/assets/%s called", instanceName, assetType)
+	slog.Info("deprecated endpoint called", "endpoint", "pxe/assets/delete", "instance", instanceName, "assetType", assetType)

 	// Get schematic ID from instance config
 	configPath := api.instance.GetInstanceConfigPath(instanceName)
--- a/api/internal/api/v1/handlers_sse.go
+++ b/api/internal/api/v1/handlers_sse.go
@@ -3,7 +3,7 @@ package v1
 import (
 	"encoding/json"
 	"fmt"
-	"log"
+	"log/slog"
 	"net/http"
 	"strings"
 	"time"
@@ -52,13 +52,12 @@ func (api *API) InstanceEventStream(w http.ResponseWriter, r *http.Request) {
 	if err != nil {
 		// Default to empty string if not found - talos events will be skipped
 		nodeIP = ""
-		log.Printf("Control plane VIP not found for instance %s, Talos events will be disabled", instanceName)
+		slog.Info("control plane VIP not found, Talos events disabled", "instance", instanceName)
 	}

 	// Start watchers for this instance if not already running
 	if err := api.watcherManager.StartWatchers(instanceName, kubeconfigPath, talosconfigPath, nodeIP); err != nil {
-		log.Printf("Failed to start watchers for instance %s: %v", instanceName, err)
-		// Continue anyway - client might still receive events from other sources
+		slog.Error("failed to start watchers", "instance", instanceName, "error", err)
 	}

 	// 7. Send initial connected event
@@ -71,7 +70,7 @@ func (api *API) InstanceEventStream(w http.ResponseWriter, r *http.Request) {
 		},
 	}
 	if err := sendSSEEvent(w, connectedEvent); err != nil {
-		log.Printf("Failed to send connected event: %v", err)
+		slog.Error("failed to send SSE connected event", "error", err)
 		return
 	}

@@ -98,7 +97,7 @@ func (api *API) InstanceEventStream(w http.ResponseWriter, r *http.Request) {
 		case event := <-client.Channel:
 			// Send event to client
 			if err := sendSSEEvent(w, event); err != nil {
-				log.Printf("Failed to send event: %v", err)
+				slog.Error("failed to send SSE event", "error", err)
 				return
 			}

@@ -117,7 +116,7 @@ func (api *API) InstanceEventStream(w http.ResponseWriter, r *http.Request) {
 				},
 			}
 			if err := sendSSEEvent(w, heartbeatEvent); err != nil {
-				log.Printf("Failed to send heartbeat: %v", err)
+				slog.Error("failed to send SSE heartbeat", "error", err)
 				return
 			}

@@ -190,7 +189,7 @@ func (api *API) GlobalEventStream(w http.ResponseWriter, r *http.Request) {
 		},
 	}
 	if err := sendSSEEvent(w, connectedEvent); err != nil {
-		log.Printf("Failed to send connected event: %v", err)
+		slog.Error("failed to send SSE connected event", "error", err)
 		return
 	}

@@ -217,7 +216,7 @@ func (api *API) GlobalEventStream(w http.ResponseWriter, r *http.Request) {
 		case event := <-client.Channel:
 			// Send event to client
 			if err := sendSSEEvent(w, event); err != nil {
-				log.Printf("Failed to send event: %v", err)
+				slog.Error("failed to send SSE event", "error", err)
 				return
 			}

@@ -236,7 +235,7 @@ func (api *API) GlobalEventStream(w http.ResponseWriter, r *http.Request) {
 				},
 			}
 			if err := sendSSEEvent(w, heartbeatEvent); err != nil {
-				log.Printf("Failed to send heartbeat: %v", err)
+				slog.Error("failed to send SSE heartbeat", "error", err)
 				return
 			}

@@ -263,4 +262,4 @@ func parseQueryList(param string) []string {
 		}
 	}
 	return result
-}
+}
--- a/api/internal/api/v1/handlers_terminal_ws.go
+++ b/api/internal/api/v1/handlers_terminal_ws.go
@@ -63,11 +63,11 @@ func (api *API) TerminalWebSocket(w http.ResponseWriter, r *http.Request) {

 	ptmx, err := pty.Start(cmd)
 	if err != nil {
-		conn.WriteMessage(websocket.TextMessage, []byte("Failed to start shell: "+err.Error()))
+		_ = conn.WriteMessage(websocket.TextMessage, []byte("Failed to start shell: "+err.Error()))
 		return
 	}
 	defer ptmx.Close()
-	defer cmd.Process.Kill()
+	defer func() { _ = cmd.Process.Kill() }()

 	// Channel to signal when to stop
 	done := make(chan struct{})
@@ -103,7 +103,7 @@ func (api *API) TerminalWebSocket(w http.ResponseWriter, r *http.Request) {
 		var resize terminalResize
 		if err := json.Unmarshal(msg, &resize); err == nil && resize.Type == "resize" {
 			if resize.Cols > 0 && resize.Rows > 0 {
-				pty.Setsize(ptmx, &pty.Winsize{
+				_ = pty.Setsize(ptmx, &pty.Winsize{
 					Cols: uint16(resize.Cols),
 					Rows: uint16(resize.Rows),
 				})
--- a/api/internal/api/v1/helpers.go
+++ b/api/internal/api/v1/helpers.go
@@ -3,7 +3,7 @@ package v1
 import (
 	"fmt"
 	"io"
-	"log"
+	"log/slog"
 	"net/http"
 	"os"
 	"strings"
@@ -15,28 +15,6 @@ import (
 	"gopkg.in/yaml.v3"
 )

-// getNestedValue retrieves a value from a nested map using dot notation path.
-// For example, getNestedValue(data, "cluster.nodes.active") returns data["cluster"]["nodes"]["active"].
-func getNestedValue(data map[string]interface{}, path string) interface{} {
-	keys := strings.Split(path, ".")
-	current := data
-
-	for i, key := range keys {
-		if i == len(keys)-1 {
-			return current[key]
-		}
-
-		if next, ok := current[key].(map[string]interface{}); ok {
-			current = next
-		} else {
-			return nil
-		}
-	}
-
-	return nil
-}
-
-
 // updateYAMLFile updates a YAML file with the provided key-value pairs.
 // It performs a shallow merge at the top level, preserving unmodified keys.
 func (api *API) updateYAMLFile(w http.ResponseWriter, r *http.Request, instanceName, fileType string) {
@@ -119,26 +97,26 @@ func (api *API) updateYAMLFile(w http.ResponseWriter, r *http.Request, instanceN
 		return
 	}

+	slog.Info(fileType+" updated", "instance", instanceName)
+
 	// Update DNS if domains changed
 	if domainsChanged && fileType == "config" {
 		go func() {
-			log.Printf("Domain change detected for instance %s, updating DNS configuration...", instanceName)
+			slog.Info("domain change detected, updating DNS", "instance", instanceName)

-			// Load the full instance config
 			instanceConfigPath := api.instance.GetInstanceConfigPath(instanceName)
 			instanceCfg, err := config.LoadCloudConfig(instanceConfigPath)
 			if err != nil {
-				log.Printf("Failed to load instance config for DNS update: %v", err)
+				slog.Error("failed to load instance config for DNS update", "instance", instanceName, "error", err)
 				return
 			}

-			// Update the DNS configuration for this instance
 			if err := api.dnsmasq.UpdateInstanceDNS(instanceName, *instanceCfg); err != nil {
-				log.Printf("Failed to update DNS for instance %s: %v", instanceName, err)
+				slog.Error("failed to update DNS", "instance", instanceName, "error", err)
 				return
 			}

-			log.Printf("Successfully updated DNS configuration for instance %s", instanceName)
+			slog.Info("DNS configuration updated", "instance", instanceName)
 		}()
 	}

--- a/api/internal/api/v1/middleware.go
+++ b/api/internal/api/v1/middleware.go
@@ -2,11 +2,68 @@ package v1

 import (
 	"context"
+	"log/slog"
 	"net/http"
+	"strings"
+	"time"

 	"github.com/gorilla/mux"
 )

+// statusResponseWriter wraps http.ResponseWriter and remembers the status code passed to WriteHeader.
+type statusResponseWriter struct {
+	http.ResponseWriter
+	status int
+}
+
+func (w *statusResponseWriter) WriteHeader(code int) {
+	w.status = code // keep the code so it can be read back after the handler returns
+	w.ResponseWriter.WriteHeader(code) // then forward the call to the wrapped writer
+}
+
+// RequestLoggingMiddleware emits one structured "request" log entry per request carrying its
+// status, method, path and duration. Paths ending in /events, /ws or /stream are not logged.
+func RequestLoggingMiddleware(next http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		path := r.URL.Path
+
+		// Long-lived SSE and WebSocket connections are passed straight through.
+		for _, suffix := range []string{"/events", "/ws", "/stream"} {
+			if strings.HasSuffix(path, suffix) {
+				next.ServeHTTP(w, r)
+				return
+			}
+		}
+
+		began := time.Now()
+		rec := &statusResponseWriter{ResponseWriter: w, status: http.StatusOK}
+		next.ServeHTTP(rec, r)
+
+		fields := []any{
+			"status", rec.status,
+			"method", r.Method,
+			"path", path,
+			"duration", time.Since(began),
+		}
+
+		// Attach whichever route parameters are set, always in this order.
+		routeVars := mux.Vars(r)
+		labels := [][2]string{{"name", "instance"}, {"app", "app"}, {"node", "node"}}
+		for _, kv := range labels {
+			if val := routeVars[kv[0]]; val != "" {
+				fields = append(fields, kv[1], val)
+			}
+		}
+
+		// Responses with status >= 400 are logged at error level, all others at info.
+		logFn := slog.Info
+		if rec.status >= 400 {
+			logFn = slog.Error
+		}
+		logFn("request", fields...)
+	})
+}
+
 // contextKey is a type for context keys to avoid collisions.
 type contextKey string

--- a/api/internal/api/v1/requests.go
+++ b/api/internal/api/v1/requests.go
@@ -75,15 +75,15 @@ type RestoreRequest struct {

 // ScheduleCreateRequest is the request body for creating a backup schedule.
 type ScheduleCreateRequest struct {
-	Name       string                  `json:"name"`
-	TargetType string                  `json:"target_type"` // "app" or "cluster"
-	TargetName string                  `json:"target_name"`
-	Frequency  string                  `json:"frequency"` // "daily", "weekly", "monthly"
-	Time       string                  `json:"time"`      // "HH:MM"
-	DayOfWeek  int                     `json:"day_of_week,omitempty"`
-	DayOfMonth int                     `json:"day_of_month,omitempty"`
-	Retention  *ScheduleRetentionReq   `json:"retention,omitempty"`
-	Enabled    bool                    `json:"enabled"`
+	Name       string                `json:"name"`
+	TargetType string                `json:"target_type"` // "app" or "cluster"
+	TargetName string                `json:"target_name"`
+	Frequency  string                `json:"frequency"` // "daily", "weekly", "monthly"
+	Time       string                `json:"time"`      // "HH:MM"
+	DayOfWeek  int                   `json:"day_of_week,omitempty"`
+	DayOfMonth int                   `json:"day_of_month,omitempty"`
+	Retention  *ScheduleRetentionReq `json:"retention,omitempty"`
+	Enabled    bool                  `json:"enabled"`
 }

 // ScheduleRetentionReq is the retention override in a schedule request.
@@ -94,13 +94,13 @@ type ScheduleRetentionReq struct {

 // ScheduleUpdateRequest is the request body for updating a backup schedule.
 type ScheduleUpdateRequest struct {
-	Name       *string                 `json:"name,omitempty"`
-	Frequency  *string                 `json:"frequency,omitempty"`
-	Time       *string                 `json:"time,omitempty"`
-	DayOfWeek  *int                    `json:"day_of_week,omitempty"`
-	DayOfMonth *int                    `json:"day_of_month,omitempty"`
-	Retention  *ScheduleRetentionReq   `json:"retention,omitempty"`
-	Enabled    *bool                   `json:"enabled,omitempty"`
+	Name       *string               `json:"name,omitempty"`
+	Frequency  *string               `json:"frequency,omitempty"`
+	Time       *string               `json:"time,omitempty"`
+	DayOfWeek  *int                  `json:"day_of_week,omitempty"`
+	DayOfMonth *int                  `json:"day_of_month,omitempty"`
+	Retention  *ScheduleRetentionReq `json:"retention,omitempty"`
+	Enabled    *bool                 `json:"enabled,omitempty"`
 }

 // NodeUpgradeRequest is the request body for upgrading a node's Talos version.
--- a/api/internal/apps/apps.go
+++ b/api/internal/apps/apps.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
+	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -471,7 +472,6 @@ func fetchIngressURLs(kubeconfigPath string) map[string]string {
 	return result
 }

-
 // processSecretTemplate processes a gomplate template for secret defaults
 // This function uses named contexts for config and secrets (e.g., {{ .config.apps.loomio.db.user }}, {{ .secrets.apps.loomio.dbPassword }})
 func processSecretTemplate(template string, appName string, configFile, secretsFile string, gomplate *tools.Gomplate) (string, error) {
@@ -608,6 +608,8 @@ func setNestedConfig(yq *tools.YQ, configFile, basePath string, value interface{

 // Add adds an app to the instance configuration
 func (m *Manager) Add(instanceName, appName, version string, config map[string]interface{}, requiredAppMappings map[string]string) error {
+	slog.Info("adding app", "component", "apps", "instance", instanceName, "app", appName, "version", version)
+
 	// 1. Verify app exists, optionally at a specific version
 	sourceAppDir, meta, err := m.resolveAppDir(appName, version)
 	if err != nil {
@@ -782,11 +784,14 @@ func (m *Manager) Add(instanceName, appName, version string, config map[string]i
 		return fmt.Errorf("failed to compile app templates: %w", err)
 	}

+	slog.Info("app added", "component", "apps", "instance", instanceName, "app", appName)
 	return nil
 }

 // Deploy deploys an app to the cluster
 func (m *Manager) Deploy(instanceName, appName string, opID string, broadcaster *operations.Broadcaster) error {
+	slog.Info("deploying app", "component", "apps", "instance", instanceName, "app", appName)
+
 	kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
 	instancePath := tools.GetInstancePath(m.dataDir, instanceName)
 	secretsFile := tools.GetInstanceSecretsPath(m.dataDir, instanceName)
@@ -812,7 +817,9 @@ func (m *Manager) Deploy(instanceName, appName string, opID string, broadcaster
 	if storage.FileExists(manifestPath) {
 		manifestData, err := os.ReadFile(manifestPath)
 		if err == nil {
-			yaml.Unmarshal(manifestData, &manifest)
+			if err := yaml.Unmarshal(manifestData, &manifest); err != nil {
+				slog.Error("failed to parse manifest", "component", "apps", "path", manifestPath, "error", err)
+			}
 		}
 	}

@@ -884,7 +891,7 @@ func (m *Manager) Deploy(instanceName, appName string, opID string, broadcaster
 		for _, secretName := range wildcardSecrets {
 			if bytes.Contains(ingressContent, []byte(secretName)) {
 				if err := utilities.CopySecretBetweenNamespaces(kubeconfigPath, secretName, "cert-manager", namespace); err != nil {
-					fmt.Printf("Warning: Failed to copy TLS secret %s: %v\n", secretName, err)
+					slog.Error("failed to copy TLS secret", "component", "apps", "secret", secretName, "error", err)
 				}
 			}
 		}
@@ -1012,6 +1019,7 @@ func (m *Manager) Deploy(instanceName, appName string, opID string, broadcaster
 		}
 	}

+	slog.Info("app deployed", "component", "apps", "instance", instanceName, "app", appName, "namespace", namespace)
 	return nil
 }

@@ -1035,6 +1043,8 @@ func (m *Manager) waitForRollout(kubeconfigPath, namespace string, wait *Rollout

 // Restart performs a rolling restart of all deployments and statefulsets in an app's namespace
 func (m *Manager) Restart(instanceName, appName string) error {
+	slog.Info("restarting app", "component", "apps", "instance", instanceName, "app", appName)
+
 	kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
 	namespace := m.ResolveNamespace(instanceName, appName)

@@ -1083,6 +1093,8 @@ func (m *Manager) namespaceSharedByOtherApp(instanceName, appName, namespace str

 // Delete removes an app from the cluster and configuration
 func (m *Manager) Delete(instanceName, appName string) error {
+	slog.Info("deleting app", "component", "apps", "instance", instanceName, "app", appName)
+
 	kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
 	instancePath := tools.GetInstancePath(m.dataDir, instanceName)
 	configFile := tools.GetInstanceConfigPath(m.dataDir, instanceName)
@@ -1146,6 +1158,7 @@ func (m *Manager) Delete(instanceName, appName string) error {
 		}
 	}

+	slog.Info("app deleted", "component", "apps", "instance", instanceName, "app", appName)
 	return nil
 }

@@ -1174,8 +1187,12 @@ func (m *Manager) GetStatus(instanceName, appName string) (*DeployedApp, error)
 	manifestPath := filepath.Join(appDir, "manifest.yaml")
 	var manifest AppManifest
 	if storage.FileExists(manifestPath) {
-		manifestData, _ := os.ReadFile(manifestPath)
-		yaml.Unmarshal(manifestData, &manifest)
+		manifestData, err := os.ReadFile(manifestPath)
+		if err == nil {
+			if err := yaml.Unmarshal(manifestData, &manifest); err != nil {
+				slog.Error("failed to parse manifest", "component", "apps", "path", manifestPath, "error", err)
+			}
+		}
 		app.Version = manifest.Version
 	}

@@ -1651,7 +1668,7 @@ func (m *Manager) updateFromSource(instanceName, appName, sourceDir, preserveSou
 		return fmt.Errorf("failed to backup old package: %w", err)
 	}
 	if err := os.Rename(tempDir, packageDir); err != nil {
-		os.Rename(oldPackageDir, packageDir)
+		_ = os.Rename(oldPackageDir, packageDir)
 		return fmt.Errorf("failed to update package: %w", err)
 	}

@@ -1660,7 +1677,7 @@ func (m *Manager) updateFromSource(instanceName, appName, sourceDir, preserveSou

 	rollback := func() {
 		os.RemoveAll(packageDir)
-		os.Rename(oldPackageDir, packageDir)
+		_ = os.Rename(oldPackageDir, packageDir)
 	}

 	// Read the new manifest
@@ -1917,7 +1934,7 @@ func (m *Manager) runMigrationJobs(instanceName, appName string, jobPaths []stri
 		// Clean up the job
 		cmd = exec.Command("kubectl", "delete", "-f", jobFile, "-n", namespace, "--ignore-not-found")
 		tools.WithKubeconfig(cmd, kubeconfigPath)
-		cmd.CombinedOutput() // Best effort cleanup
+		_, _ = cmd.CombinedOutput() // Best effort cleanup
 	}

 	return nil
@@ -1925,6 +1942,8 @@ func (m *Manager) runMigrationJobs(instanceName, appName string, jobPaths []stri

 // Eject converts an app from package-managed to custom
 func (m *Manager) Eject(instanceName, appName string) error {
+	slog.Info("ejecting app to custom management", "component", "apps", "instance", instanceName, "app", appName)
+
 	instancePath := tools.GetInstancePath(m.dataDir, instanceName)
 	appDestDir := filepath.Join(instancePath, "apps", appName)
 	packageDir := filepath.Join(appDestDir, ".package")
@@ -2120,6 +2139,7 @@ func (m *Manager) Compile(instanceName, appName string) error {
 		return fmt.Errorf("app %s has no package source (custom or not installed)", appName)
 	}

+	slog.Info("compiling app templates", "component", "apps", "instance", instanceName, "app", appName)
 	return m.compileFromPackage(appName, appDestDir, packageDir, configFile, secretsFile)
 }

@@ -2214,7 +2234,9 @@ func (m *Manager) Fetch(instanceName, appName string) error {

 			manifestYAML, err := yaml.Marshal(manifest)
 			if err == nil {
-				storage.WriteFile(manifestPath, manifestYAML, 0644)
+				if err := storage.WriteFile(manifestPath, manifestYAML, 0644); err != nil {
+					slog.Error("failed to write manifest", "component", "apps", "path", manifestPath, "error", err)
+				}
 			}
 		}
 	}
--- a/api/internal/apps/apps_test.go
+++ b/api/internal/apps/apps_test.go
@@ -1342,9 +1342,14 @@ source: /apps/ejectapp
 	}

 	// Verify source was removed from manifest
-	manifestData, _ := os.ReadFile(manifestPath)
+	manifestData, err := os.ReadFile(manifestPath)
+	if err != nil {
+		t.Fatalf("failed to read manifest: %v", err)
+	}
 	var manifest AppManifest
-	yaml.Unmarshal(manifestData, &manifest)
+	if err := yaml.Unmarshal(manifestData, &manifest); err != nil {
+		t.Fatalf("failed to parse manifest: %v", err)
+	}
 	if manifest.Source != "" {
 		t.Errorf("Source should be removed from manifest after eject, got: %s", manifest.Source)
 	}
@@ -1563,10 +1568,10 @@ func TestCopyDir(t *testing.T) {

 	// Create files at various levels
 	files := map[string]string{
-		filepath.Join(srcDir, "top-level.yaml"):              "top: level",
-		filepath.Join(installDir, "install.yaml"):            "install: data",
-		filepath.Join(installDir, "nested", "deep.yaml"):     "deep: data",
-		filepath.Join(configDir, "config.yaml"):              "config: data",
+		filepath.Join(srcDir, "top-level.yaml"):          "top: level",
+		filepath.Join(installDir, "install.yaml"):        "install: data",
+		filepath.Join(installDir, "nested", "deep.yaml"): "deep: data",
+		filepath.Join(configDir, "config.yaml"):          "config: data",
 	}
 	for path, content := range files {
 		if err := os.WriteFile(path, []byte(content), 0644); err != nil {
@@ -1748,10 +1753,10 @@ deploy:

 func TestResolveDeploymentResource(t *testing.T) {
 	tests := []struct {
-		name         string
-		manifest     AppManifest
-		wantName     string
-		wantKind     string
+		name     string
+		manifest AppManifest
+		wantName string
+		wantKind string
 	}{
 		{
 			name:     "no deployment info",
@@ -1845,7 +1850,9 @@ func TestIsConfigOnly(t *testing.T) {
 		t.Run(tt.name, func(t *testing.T) {
 			appDir := t.TempDir()
 			for _, f := range tt.files {
-				os.WriteFile(filepath.Join(appDir, f), []byte("test"), 0644)
+				if err := os.WriteFile(filepath.Join(appDir, f), []byte("test"), 0644); err != nil {
+					t.Fatalf("failed to write test file: %v", err)
+				}
 			}
 			if got := isConfigOnly(appDir); got != tt.want {
 				t.Errorf("isConfigOnly() = %v, want %v", got, tt.want)
--- a/api/internal/apps/drift_test.go
+++ b/api/internal/apps/drift_test.go
@@ -38,14 +38,20 @@ func TestFilesDiffer(t *testing.T) {
 	}
 	defer os.RemoveAll(tmpDir)

+	mustWrite := func(path string, data []byte) {
+		t.Helper()
+		if err := os.WriteFile(path, data, 0644); err != nil {
+			t.Fatal(err)
+		}
+	}
 	fileA := filepath.Join(tmpDir, "a.txt")
 	fileB := filepath.Join(tmpDir, "b.txt")
 	fileC := filepath.Join(tmpDir, "c.txt")
 	fileMissing := filepath.Join(tmpDir, "missing.txt")

-	os.WriteFile(fileA, []byte("hello"), 0644)
-	os.WriteFile(fileB, []byte("hello"), 0644)
-	os.WriteFile(fileC, []byte("world"), 0644)
+	mustWrite(fileA, []byte("hello"))
+	mustWrite(fileB, []byte("hello"))
+	mustWrite(fileC, []byte("world"))

 	t.Run("identical files", func(t *testing.T) {
 		if filesDiffer(fileA, fileB) {
@@ -79,13 +85,26 @@ func TestDirsDiffer(t *testing.T) {
 	}
 	defer os.RemoveAll(tmpDir)

+	mustMkdir := func(path string) {
+		t.Helper()
+		if err := os.MkdirAll(path, 0755); err != nil {
+			t.Fatal(err)
+		}
+	}
+	mustWriteFile := func(path string, data []byte) {
+		t.Helper()
+		if err := os.WriteFile(path, data, 0644); err != nil {
+			t.Fatal(err)
+		}
+	}
+
 	// Create two identical directories
 	dirA := filepath.Join(tmpDir, "a")
 	dirB := filepath.Join(tmpDir, "b")
-	os.MkdirAll(dirA, 0755)
-	os.MkdirAll(dirB, 0755)
-	os.WriteFile(filepath.Join(dirA, "file.txt"), []byte("same"), 0644)
-	os.WriteFile(filepath.Join(dirB, "file.txt"), []byte("same"), 0644)
+	mustMkdir(dirA)
+	mustMkdir(dirB)
+	mustWriteFile(filepath.Join(dirA, "file.txt"), []byte("same"))
+	mustWriteFile(filepath.Join(dirB, "file.txt"), []byte("same"))

 	t.Run("identical directories", func(t *testing.T) {
 		if dirsDiffer(dirA, dirB) {
@@ -95,8 +114,8 @@ func TestDirsDiffer(t *testing.T) {

 	// Create a directory with different content
 	dirC := filepath.Join(tmpDir, "c")
-	os.MkdirAll(dirC, 0755)
-	os.WriteFile(filepath.Join(dirC, "file.txt"), []byte("different"), 0644)
+	mustMkdir(dirC)
+	mustWriteFile(filepath.Join(dirC, "file.txt"), []byte("different"))

 	t.Run("different content", func(t *testing.T) {
 		if !dirsDiffer(dirA, dirC) {
@@ -106,9 +125,9 @@ func TestDirsDiffer(t *testing.T) {

 	// Directory with extra file
 	dirD := filepath.Join(tmpDir, "d")
-	os.MkdirAll(dirD, 0755)
-	os.WriteFile(filepath.Join(dirD, "file.txt"), []byte("same"), 0644)
-	os.WriteFile(filepath.Join(dirD, "extra.txt"), []byte("extra"), 0644)
+	mustMkdir(dirD)
+	mustWriteFile(filepath.Join(dirD, "file.txt"), []byte("same"))
+	mustWriteFile(filepath.Join(dirD, "extra.txt"), []byte("extra"))

 	t.Run("extra file in second", func(t *testing.T) {
 		if !dirsDiffer(dirA, dirD) {
@@ -126,14 +145,20 @@ func TestCheckSourceDrift_NoDrift(t *testing.T) {

 	// Create source directory with manifest
 	sourceDir := filepath.Join(tmpDir, "source", "myapp")
-	os.MkdirAll(sourceDir, 0755)
+	if err := os.MkdirAll(sourceDir, 0755); err != nil {
+		t.Fatal(err)
+	}
 	sourceManifest := AppManifest{Version: "1.0.0"}
 	data, _ := yaml.Marshal(sourceManifest)
-	os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644)
+	if err := os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644); err != nil {
+		t.Fatal(err)
+	}

 	// Create package dir (it exists)
 	packageDir := filepath.Join(tmpDir, "package")
-	os.MkdirAll(packageDir, 0755)
+	if err := os.MkdirAll(packageDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	// Installed manifest with same version
 	manifest := &AppManifest{
@@ -157,14 +182,20 @@ func TestCheckSourceDrift_VersionDrift(t *testing.T) {

 	// Create source directory with newer version
 	sourceDir := filepath.Join(tmpDir, "source", "myapp")
-	os.MkdirAll(sourceDir, 0755)
+	if err := os.MkdirAll(sourceDir, 0755); err != nil {
+		t.Fatal(err)
+	}
 	sourceManifest := AppManifest{Version: "2.0.0"}
 	data, _ := yaml.Marshal(sourceManifest)
-	os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644)
+	if err := os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644); err != nil {
+		t.Fatal(err)
+	}

 	// Create package dir
 	packageDir := filepath.Join(tmpDir, "package")
-	os.MkdirAll(packageDir, 0755)
+	if err := os.MkdirAll(packageDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	// Installed manifest with older version
 	manifest := &AppManifest{
@@ -174,11 +205,8 @@ func TestCheckSourceDrift_VersionDrift(t *testing.T) {

 	m := &Manager{}
 	result := m.checkSourceDrift(manifest, packageDir, "myapp")
-	if result == nil {
-		t.Fatal("expected drift, got nil")
-	}
-	if !result.Drifted {
-		t.Error("expected Drifted to be true")
+	if result == nil || !result.Drifted {
+		t.Fatal("expected drift result with Drifted=true, got nil or false")
 	}
 	if result.CurrentVersion != "1.0.0" {
 		t.Errorf("expected CurrentVersion '1.0.0', got %q", result.CurrentVersion)
@@ -210,7 +238,9 @@ func TestCheckSourceDrift_PackageMissing(t *testing.T) {

 	// Source exists but .package/ does not
 	sourceDir := filepath.Join(tmpDir, "source", "myapp")
-	os.MkdirAll(sourceDir, 0755)
+	if err := os.MkdirAll(sourceDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	manifest := &AppManifest{
 		Version: "1.0.0",
@@ -221,11 +251,8 @@ func TestCheckSourceDrift_PackageMissing(t *testing.T) {

 	m := &Manager{}
 	result := m.checkSourceDrift(manifest, packageDir, "myapp")
-	if result == nil {
-		t.Fatal("expected drift for missing package dir, got nil")
-	}
-	if !result.Drifted {
-		t.Error("expected Drifted to be true")
+	if result == nil || !result.Drifted {
+		t.Fatal("expected drift result with Drifted=true for missing package dir")
 	}
 }

@@ -270,17 +297,23 @@ func TestComputeDrift_NotDeployed(t *testing.T) {

 	// Source-managed app that is only "added" (not deployed)
 	sourceDir := filepath.Join(tmpDir, "source")
-	os.MkdirAll(sourceDir, 0755)
+	if err := os.MkdirAll(sourceDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	// Source manifest with newer version
 	sourceManifest := AppManifest{Version: "2.0.0"}
 	data, _ := yaml.Marshal(sourceManifest)
-	os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644)
+	if err := os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644); err != nil {
+		t.Fatal(err)
+	}

 	// App directory with .package
 	appDir := filepath.Join(tmpDir, "app")
 	packageDir := filepath.Join(appDir, ".package")
-	os.MkdirAll(packageDir, 0755)
+	if err := os.MkdirAll(packageDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	manifest := &AppManifest{
 		Version: "1.0.0",
@@ -290,13 +323,8 @@ func TestComputeDrift_NotDeployed(t *testing.T) {
 	m := &Manager{}
 	result := m.computeDrift("test-instance", "myapp", appDir, "", "added", manifest)

-	if result == nil {
-		t.Fatal("expected drift info, got nil")
-	}
-
-	// Should have source drift (version mismatch)
-	if result.Source == nil || !result.Source.Drifted {
-		t.Error("expected source drift for version mismatch")
+	if result == nil || result.Source == nil || !result.Source.Drifted {
+		t.Fatal("expected drift info with source drift for version mismatch")
 	}

 	// Should NOT have deploy drift (status is "added")
--- a/api/internal/apps/infrastructure.go
+++ b/api/internal/apps/infrastructure.go
@@ -2,7 +2,7 @@ package apps

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"os"
 	"path/filepath"

@@ -110,14 +110,14 @@ func (m *Manager) DeployInfrastructure(instanceName, opID string, broadcaster *o
 	for i, pkg := range packages {
 		// Skip if already added and deployed
 		if m.isDeployed(instanceName, pkg.Name) {
-			log.Printf("[infrastructure] %s already deployed, skipping", pkg.Name)
+			slog.Info("already deployed, skipping", "component", "infrastructure", "package", pkg.Name)
 			if broadcaster != nil {
 				broadcaster.Publish(opID, []byte(fmt.Sprintf("Skipping %s (already deployed)\n", pkg.Name)))
 			}
 			continue
 		}

-		log.Printf("[infrastructure] Installing %s (%d/%d)", pkg.Name, i+1, total)
+		slog.Info("installing package", "component", "infrastructure", "package", pkg.Name, "progress", fmt.Sprintf("%d/%d", i+1, total))
 		if broadcaster != nil {
 			broadcaster.Publish(opID, []byte(fmt.Sprintf("Installing %s (%d/%d)...\n", pkg.Name, i+1, total)))
 		}
--- a/api/internal/apps/infrastructure_test.go
+++ b/api/internal/apps/infrastructure_test.go
@@ -135,8 +135,11 @@ func TestInfrastructureOrder(t *testing.T) {
 	tmpDir := t.TempDir()

 	writeManifest := func(name, category string, requires []string) {
+		t.Helper()
 		dir := filepath.Join(tmpDir, name)
-		os.MkdirAll(dir, 0755)
+		if err := os.MkdirAll(dir, 0755); err != nil {
+			t.Fatal(err)
+		}

 		content := "name: " + name + "\ncategory: " + category + "\n"
 		if len(requires) > 0 {
@@ -145,7 +148,9 @@ func TestInfrastructureOrder(t *testing.T) {
 				content += "  - name: " + r + "\n"
 			}
 		}
-		os.WriteFile(filepath.Join(dir, "manifest.yaml"), []byte(content), 0644)
+		if err := os.WriteFile(filepath.Join(dir, "manifest.yaml"), []byte(content), 0644); err != nil {
+			t.Fatal(err)
+		}
 	}

 	// Create infrastructure packages
--- a/api/internal/apps/models.go
+++ b/api/internal/apps/models.go
@@ -16,22 +16,22 @@ type ConfigItem struct {

 // AppManifest represents the complete app manifest from manifest.yaml
 type AppManifest struct {
-	Name             string                 `json:"name" yaml:"name"`
-	Is               string                 `json:"is,omitempty" yaml:"is,omitempty"` // The original app type (e.g., "postgres" even if named "postgres-primary")
-	Description      string                 `json:"description" yaml:"description"`
-	Version          string                 `json:"version" yaml:"version"`
-	Icon             string                 `json:"icon,omitempty" yaml:"icon,omitempty"`
-	Category         string                 `json:"category,omitempty" yaml:"category,omitempty"`
-	Namespace        string                 `json:"namespace,omitempty" yaml:"namespace,omitempty"`
-	DeploymentName   string                 `json:"deploymentName,omitempty" yaml:"deploymentName,omitempty"`
-	Requires         []AppDependency        `json:"requires,omitempty" yaml:"requires,omitempty"`
-	DefaultConfig    map[string]interface{} `json:"defaultConfig,omitempty" yaml:"defaultConfig,omitempty"`
-	DefaultSecrets   []SecretDefinition     `json:"defaultSecrets,omitempty" yaml:"defaultSecrets,omitempty"`
-	RequiredSecrets  []string               `json:"requiredSecrets,omitempty" yaml:"requiredSecrets,omitempty"`
-	Source           string                 `json:"source,omitempty" yaml:"source,omitempty"`
-	Scripts          []Script               `json:"scripts,omitempty" yaml:"scripts,omitempty"`
-	Deploy           *DeployConfig          `json:"deploy,omitempty" yaml:"deploy,omitempty"`
-	Upgrade          *UpgradeConfig         `json:"upgrade,omitempty" yaml:"upgrade,omitempty"`
+	Name            string                 `json:"name" yaml:"name"`
+	Is              string                 `json:"is,omitempty" yaml:"is,omitempty"` // The original app type (e.g., "postgres" even if named "postgres-primary")
+	Description     string                 `json:"description" yaml:"description"`
+	Version         string                 `json:"version" yaml:"version"`
+	Icon            string                 `json:"icon,omitempty" yaml:"icon,omitempty"`
+	Category        string                 `json:"category,omitempty" yaml:"category,omitempty"`
+	Namespace       string                 `json:"namespace,omitempty" yaml:"namespace,omitempty"`
+	DeploymentName  string                 `json:"deploymentName,omitempty" yaml:"deploymentName,omitempty"`
+	Requires        []AppDependency        `json:"requires,omitempty" yaml:"requires,omitempty"`
+	DefaultConfig   map[string]interface{} `json:"defaultConfig,omitempty" yaml:"defaultConfig,omitempty"`
+	DefaultSecrets  []SecretDefinition     `json:"defaultSecrets,omitempty" yaml:"defaultSecrets,omitempty"`
+	RequiredSecrets []string               `json:"requiredSecrets,omitempty" yaml:"requiredSecrets,omitempty"`
+	Source          string                 `json:"source,omitempty" yaml:"source,omitempty"`
+	Scripts         []Script               `json:"scripts,omitempty" yaml:"scripts,omitempty"`
+	Deploy          *DeployConfig          `json:"deploy,omitempty" yaml:"deploy,omitempty"`
+	Upgrade         *UpgradeConfig         `json:"upgrade,omitempty" yaml:"upgrade,omitempty"`
 }

 // DeployConfig declares deployment behavior in the manifest, replacing install.sh scripts
@@ -54,7 +54,7 @@ type DeployPhase struct {
 type CreateSecret struct {
 	Name      string            `json:"name" yaml:"name"`
 	Namespace string            `json:"namespace,omitempty" yaml:"namespace,omitempty"` // target namespace (defaults to app namespace)
-	Entries   map[string]string `json:"entries" yaml:"entries"`                        // k8s secret key -> secrets.yaml path
+	Entries   map[string]string `json:"entries" yaml:"entries"`                         // k8s secret key -> secrets.yaml path
 }

 // CRDInstall describes CRDs to apply from a URL before deployment
@@ -138,13 +138,13 @@ type UpgradeConfig struct {
 	From             []UpgradeFromRule `json:"from,omitempty" yaml:"from,omitempty"`
 	PreUpgrade       *PreUpgradeConfig `json:"preUpgrade,omitempty" yaml:"preUpgrade,omitempty"`
 	Migrations       *MigrationConfig  `json:"migrations,omitempty" yaml:"migrations,omitempty"`
-	ConfigMigrations map[string]string  `json:"configMigrations,omitempty" yaml:"configMigrations,omitempty"`
+	ConfigMigrations map[string]string `json:"configMigrations,omitempty" yaml:"configMigrations,omitempty"`
 }

 // UpgradeFromRule defines a version constraint and optional upgrade path
 type UpgradeFromRule struct {
-	Version string `json:"version" yaml:"version"`                        // e.g. ">=1.23.0", "<1.21.0", ">0"
-	Via     string `json:"via,omitempty" yaml:"via,omitempty"`             // waypoint version in versions/
+	Version string `json:"version" yaml:"version"`             // e.g. ">=1.23.0", "<1.21.0", ">0"
+	Via     string `json:"via,omitempty" yaml:"via,omitempty"` // waypoint version in versions/
 	Blocked bool   `json:"blocked,omitempty" yaml:"blocked,omitempty"`
 	Notes   string `json:"notes,omitempty" yaml:"notes,omitempty"`
 }
@@ -157,7 +157,7 @@ type PreUpgradeConfig struct {

 // MigrationConfig defines pre/post-deploy migration jobs for a version transition
 type MigrationConfig struct {
-	Pre  []string `json:"pre,omitempty" yaml:"pre,omitempty"`   // paths to K8s Job YAMLs relative to app dir
+	Pre  []string `json:"pre,omitempty" yaml:"pre,omitempty"` // paths to K8s Job YAMLs relative to app dir
 	Post []string `json:"post,omitempty" yaml:"post,omitempty"`
 }

--- a/api/internal/apps/upgrade.go
+++ b/api/internal/apps/upgrade.go
@@ -43,7 +43,7 @@ func ParseAppVersion(v string) (major, minor, patch, revision int) {
 		}
 	}

-	fmt.Sscanf(upstream, "%d.%d.%d", &major, &minor, &patch)
+	_, _ = fmt.Sscanf(upstream, "%d.%d.%d", &major, &minor, &patch)
 	return
 }

--- a/api/internal/apps/upgrade_test.go
+++ b/api/internal/apps/upgrade_test.go
@@ -10,7 +10,7 @@ import (

 func TestParseAppVersion(t *testing.T) {
 	tests := []struct {
-		input                          string
+		input                         string
 		major, minor, patch, revision int
 	}{
 		{"1.24.3-1", 1, 24, 3, 1},
@@ -347,9 +347,9 @@ func TestComputeUpgradePlan_MultipleWaypoints(t *testing.T) {
 		Latest: "4",
 		Upgrade: &UpgradeConfig{
 			From: []UpgradeFromRule{
-				{Version: ">=3.0.0"},               // direct from 3.x
-				{Version: ">=2.0.0", Via: "3"},     // 2.x must go through slot "3"
-				{Version: ">=1.0.0", Via: "2"},     // 1.x must go through slot "2"
+				{Version: ">=3.0.0"},           // direct from 3.x
+				{Version: ">=2.0.0", Via: "3"}, // 2.x must go through slot "3"
+				{Version: ">=1.0.0", Via: "2"}, // 1.x must go through slot "2"
 			},
 		},
 	})
@@ -524,7 +524,7 @@ func TestComputeUpgradePlan_RuleOrdering(t *testing.T) {
 		Latest: "3",
 		Upgrade: &UpgradeConfig{
 			From: []UpgradeFromRule{
-				{Version: ">=2.0.0"},                                            // direct for 2.x+
+				{Version: ">=2.0.0"}, // direct for 2.x+
 				{Version: ">=1.0.0", Blocked: true, Notes: "must be on 2.x+"}, // block for 1.x
 			},
 		},
--- a/api/internal/backup/backup.go
+++ b/api/internal/backup/backup.go
@@ -2,9 +2,13 @@
 package backup

 import (
+	"archive/tar"
 	"bytes"
+	"compress/gzip"
 	"encoding/json"
 	"fmt"
+	"io"
+	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -188,6 +192,7 @@ func (m *Manager) BackupApp(instanceName, appName string) (*RecoveryPlan, error)
 		App:       appName,
 		Instance:  instanceName,
 		Timestamp: timestamp,
+		Version:   manifest.Version,
 		Status:    "backing_up",
 		Source: btypes.RecoverySource{
 			ActiveColor: activeColor,
@@ -315,7 +320,7 @@ func (m *Manager) RestoreApp(instanceName, appName string, opts RestoreOptions)
 		if err := strategy.Restore(plan, m.destination); err != nil {
 			plan.Status = "failed"
 			plan.Error = fmt.Sprintf("%s restore failed: %v", entry.Name, err)
-			m.savePlan(instanceName, appName, plan.Timestamp, plan)
+			_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
 			return plan, fmt.Errorf("failed to restore %s: %w", entry.Name, err)
 		}
 	}
@@ -325,7 +330,7 @@ func (m *Manager) RestoreApp(instanceName, appName string, opts RestoreOptions)
 	if err := m.deployToStandbyNamespace(instanceName, appName, plan); err != nil {
 		plan.Status = "failed"
 		plan.Error = fmt.Sprintf("deploy to standby failed: %v", err)
-		m.savePlan(instanceName, appName, plan.Timestamp, plan)
+		_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
 		return plan, fmt.Errorf("failed to deploy to standby namespace: %w", err)
 	}

@@ -335,7 +340,7 @@ func (m *Manager) RestoreApp(instanceName, appName string, opts RestoreOptions)
 	phase.CompletedAt = &completed
 	plan.Phases["restore"] = phase

-	m.savePlan(instanceName, appName, plan.Timestamp, plan)
+	_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
 	m.reportProgress(100, "Restore completed")
 	return plan, nil
 }
@@ -370,7 +375,7 @@ func (m *Manager) SwitchApp(instanceName, appName string) (*RecoveryPlan, error)
 		if err := strategy.Switch(plan); err != nil {
 			plan.Status = "failed"
 			plan.Error = fmt.Sprintf("%s switch failed: %v", entry.Name, err)
-			m.savePlan(instanceName, appName, plan.Timestamp, plan)
+			_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
 			return plan, fmt.Errorf("failed to switch %s: %w", entry.Name, err)
 		}
 	}
@@ -380,7 +385,7 @@ func (m *Manager) SwitchApp(instanceName, appName string) (*RecoveryPlan, error)
 	if err := m.setActiveDeployment(instanceName, appName, plan.StandbyColor); err != nil {
 		plan.Status = "failed"
 		plan.Error = fmt.Sprintf("failed to update activeDeployment: %v", err)
-		m.savePlan(instanceName, appName, plan.Timestamp, plan)
+		_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
 		return plan, fmt.Errorf("failed to update activeDeployment: %w", err)
 	}

@@ -390,7 +395,7 @@ func (m *Manager) SwitchApp(instanceName, appName string) (*RecoveryPlan, error)
 	phase.CompletedAt = &completed
 	plan.Phases["switch"] = phase

-	m.savePlan(instanceName, appName, plan.Timestamp, plan)
+	_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
 	m.reportProgress(100, "Switch completed")
 	return plan, nil
 }
@@ -425,7 +430,7 @@ func (m *Manager) CleanupApp(instanceName, appName string) (*RecoveryPlan, error
 		if err := strategy.Cleanup(plan); err != nil {
 			plan.Status = "failed"
 			plan.Error = fmt.Sprintf("%s cleanup failed: %v", entry.Name, err)
-			m.savePlan(instanceName, appName, plan.Timestamp, plan)
+			_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
 			return plan, fmt.Errorf("failed to cleanup %s: %w", entry.Name, err)
 		}
 	}
@@ -439,7 +444,7 @@ func (m *Manager) CleanupApp(instanceName, appName string) (*RecoveryPlan, error
 		deleteCmd := exec.Command("kubectl", "delete", "namespace", previousNamespace, "--ignore-not-found", "--timeout=30s")
 		tools.WithKubeconfig(deleteCmd, kubeconfigPath)
 		if output, err := deleteCmd.CombinedOutput(); err != nil {
-			fmt.Printf("Warning: failed to delete previous namespace %s: %v, output: %s\n", previousNamespace, err, output)
+			slog.Error("failed to delete previous namespace", "component", "backup", "namespace", previousNamespace, "error", err, "output", string(output))
 		}
 	} else if previousNamespace == appName {
 		// For the bare namespace (first restore), scale deployments to zero
@@ -447,7 +452,7 @@ func (m *Manager) CleanupApp(instanceName, appName string) (*RecoveryPlan, error
 		scaleCmd := exec.Command("kubectl", "scale", "deployment", "--all", "--replicas=0", "-n", previousNamespace)
 		tools.WithKubeconfig(scaleCmd, kubeconfigPath)
 		if output, err := scaleCmd.CombinedOutput(); err != nil {
-			fmt.Printf("Warning: failed to scale down previous deployments in %s: %v, output: %s\n", previousNamespace, err, output)
+			slog.Error("failed to scale down previous deployments", "component", "backup", "namespace", previousNamespace, "error", err, "output", string(output))
 		}
 	}

@@ -470,7 +475,7 @@ func (m *Manager) CleanupApp(instanceName, appName string) (*RecoveryPlan, error
 	phase.CompletedAt = &completed
 	plan.Phases["cleanup"] = phase

-	m.savePlan(instanceName, appName, plan.Timestamp, plan)
+	_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
 	m.reportProgress(100, "Cleanup completed")
 	return plan, nil
 }
@@ -596,10 +601,10 @@ func (m *Manager) deployToStandbyNamespace(instanceName, appName string, plan *R

 	// Create secrets from secrets.yaml (source of truth) in the standby namespace
 	if err := m.deploySecretsToNamespace(instanceName, appName, standbyNamespace, kubeconfigPath); err != nil {
-		fmt.Printf("Warning: failed to deploy secrets to standby namespace: %v\n", err)
+		slog.Error("failed to deploy secrets to standby namespace", "component", "backup", "error", err)
 	}

-	fmt.Printf("Successfully deployed app to standby namespace: %s\n", standbyNamespace)
+	slog.Info("deployed app to standby namespace", "component", "backup", "namespace", standbyNamespace)
 	return nil
 }

@@ -1076,25 +1081,22 @@ func (m *Manager) DeleteAppBackup(instanceName, appName, timestamp string) error
 	backupDir := filepath.Join(m.GetBackupDir(instanceName), appName, timestamp)

 	if _, err := os.Stat(backupDir); os.IsNotExist(err) {
-		return fmt.Errorf("backup not found: %s", timestamp)
+		return nil // Already deleted, nothing to do
 	}

 	// Load plan to get strategy locations
 	planFile := filepath.Join(backupDir, "recovery-plan.yaml")
 	plan, err := m.loadPlan(planFile)

-	// Load destination
+	// Load destination and clean up remote files (best-effort)
 	destination, err2 := m.loadDestination(instanceName)
 	if err2 != nil {
-		return fmt.Errorf("failed to load backup destination: %w", err2)
-	}
-
-	// Delete strategy data from destination
-	if err == nil && plan != nil {
+		slog.Error("could not load backup destination, remote files may be orphaned", "component", "backup", "error", err2)
+	} else if err == nil && plan != nil {
 		for _, entry := range plan.Strategies {
 			if location, ok := entry.Backup["location"].(string); ok && location != "" {
 				if delErr := destination.Delete(location); delErr != nil {
-					fmt.Printf("Warning: failed to delete %s from destination: %v\n", location, delErr)
+					slog.Error("failed to delete backup from destination", "component", "backup", "location", location, "error", delErr)
 				}
 			}
 		}
@@ -1241,6 +1243,149 @@ func (m *Manager) loadDestination(instanceName string) (BackupDestination, error
 	}
 }

+// BackupClusterConfig creates a backup of cluster-level configuration files for disaster recovery.
+// It archives whichever of kubeconfig, config.yaml, secrets.yaml, talosconfig and the generated
+// Talos controlplane/worker/secrets configs exist into one in-memory tar.gz, uploads it to the
+// backup destination as cluster-config/<instance>/<timestamp>.tar.gz, and saves a RecoveryPlan
+// under the "_cluster" app name. Missing files are skipped; having none of them is an error.
+func (m *Manager) BackupClusterConfig(instanceName string) (*RecoveryPlan, error) {
+	startedAt := time.Now() // taken first so the recorded backup phase spans the whole operation
+	m.reportProgress(20, "Loading backup configuration")
+
+	destination, err := m.loadDestination(instanceName)
+	if err != nil {
+		return nil, fmt.Errorf("failed to load backup destination: %w", err)
+	}
+	m.destination = destination
+
+	instancePath := tools.GetInstancePath(m.dataDir, instanceName)
+	generatedDir := filepath.Join(tools.GetInstanceTalosPath(m.dataDir, instanceName), "generated")
+
+	// Collect files to back up (skip missing gracefully)
+	filePaths := []string{
+		tools.GetKubeconfigPath(m.dataDir, instanceName),
+		tools.GetInstanceConfigPath(m.dataDir, instanceName),
+		tools.GetInstanceSecretsPath(m.dataDir, instanceName),
+		tools.GetTalosconfigPath(m.dataDir, instanceName),
+		filepath.Join(generatedDir, "controlplane.yaml"),
+		filepath.Join(generatedDir, "worker.yaml"),
+		filepath.Join(generatedDir, "secrets.yaml"),
+	}
+
+	var existingFiles []string
+	for _, f := range filePaths {
+		if _, statErr := os.Stat(f); statErr == nil {
+			existingFiles = append(existingFiles, f)
+		} else if !os.IsNotExist(statErr) {
+			slog.Warn("skipping cluster config file that could not be inspected", "component", "backup", "path", f, "error", statErr)
+		}
+	}
+
+	if len(existingFiles) == 0 {
+		return nil, fmt.Errorf("no cluster config files found for instance %s", instanceName)
+	}
+
+	m.reportProgress(40, fmt.Sprintf("Archiving %d cluster config files", len(existingFiles)))
+
+	timestamp := time.Now().UTC().Format("20060102T150405Z")
+	key := fmt.Sprintf("cluster-config/%s/%s.tar.gz", instanceName, timestamp)
+
+	// Create tar.gz archive in memory; the writers only wrap buf, so error paths can just abandon them
+	var buf bytes.Buffer
+	gzWriter := gzip.NewWriter(&buf)
+	tarWriter := tar.NewWriter(gzWriter)
+
+	totalSize := int64(0)
+	for _, filePath := range existingFiles {
+		written, err := addClusterConfigFile(tarWriter, instancePath, filePath)
+		if err != nil {
+			return nil, err
+		}
+		totalSize += written
+	}
+
+	if err := tarWriter.Close(); err != nil {
+		return nil, fmt.Errorf("failed to close tar: %w", err)
+	}
+	if err := gzWriter.Close(); err != nil {
+		return nil, fmt.Errorf("failed to close gzip: %w", err)
+	}
+
+	m.reportProgress(70, "Uploading cluster config backup")
+
+	size, err := destination.Put(key, bytes.NewReader(buf.Bytes()))
+	if err != nil {
+		return nil, fmt.Errorf("failed to upload cluster config backup: %w", err)
+	}
+
+	m.reportProgress(90, "Saving recovery plan")
+
+	completed := time.Now()
+	plan := &RecoveryPlan{
+		App:       "_cluster",
+		Instance:  instanceName,
+		Timestamp: timestamp,
+		Status:    "backed_up",
+		Strategies: []StrategyEntry{
+			{
+				Name:   "cluster-config",
+				Status: "backed_up",
+				Backup: map[string]interface{}{
+					"location":  key,
+					"size":      size,
+					"files":     len(existingFiles),
+					"format":    "tar.gz",
+					"totalSize": totalSize,
+				},
+			},
+		},
+		Phases: map[string]PhaseTime{
+			"backup": {StartedAt: &startedAt, CompletedAt: &completed},
+		},
+	}
+
+	if err := m.savePlan(instanceName, "_cluster", timestamp, plan); err != nil {
+		return nil, fmt.Errorf("failed to save recovery plan: %w", err)
+	}
+
+	m.reportProgress(100, "Cluster config backup completed")
+	return plan, nil
+}
+
+// addClusterConfigFile appends the file at filePath to tw under its slash-separated path relative
+// to baseDir and returns the file size reported by Stat.
+func addClusterConfigFile(tw *tar.Writer, baseDir, filePath string) (int64, error) {
+	file, err := os.Open(filePath)
+	if err != nil {
+		return 0, fmt.Errorf("failed to open %s: %w", filePath, err)
+	}
+	defer file.Close()
+
+	stat, err := file.Stat()
+	if err != nil {
+		return 0, fmt.Errorf("failed to stat %s: %w", filePath, err)
+	}
+
+	header, err := tar.FileInfoHeader(stat, "")
+	if err != nil {
+		return 0, fmt.Errorf("failed to create tar header for %s: %w", filePath, err)
+	}
+
+	relPath, err := filepath.Rel(baseDir, filePath)
+	if err != nil {
+		return 0, fmt.Errorf("failed to compute archive path for %s: %w", filePath, err)
+	}
+	header.Name = filepath.ToSlash(relPath) // tar entry names are slash-separated on every platform
+
+	if err := tw.WriteHeader(header); err != nil {
+		return 0, fmt.Errorf("failed to write tar header for %s: %w", filePath, err)
+	}
+	if _, err := io.Copy(tw, file); err != nil {
+		return 0, fmt.Errorf("failed to write file %s to archive: %w", filePath, err)
+	}
+	return stat.Size(), nil
+}
+
 // savePlan saves a RecoveryPlan to YAML file
 func (m *Manager) savePlan(instanceName, appName, timestamp string, plan *RecoveryPlan) error {
 	backupDir := filepath.Join(m.GetBackupDir(instanceName), appName, timestamp)
--- a/api/internal/backup/cluster_backup_test.go
+++ b/api/internal/backup/cluster_backup_test.go
@@ -0,0 +1,188 @@
+package backup
+
+import (
+	"archive/tar"
+	"compress/gzip"
+	"io"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestBackupClusterConfig(t *testing.T) {
+	root := t.TempDir()
+
+	const instance = "test-instance"
+	instanceRoot := filepath.Join(root, "instances", instance)
+	backupRoot := filepath.Join(instanceRoot, "backups")
+	generated := filepath.Join(instanceRoot, "talos", "generated")
+
+	require.NoError(t, os.MkdirAll(backupRoot, 0755))
+	require.NoError(t, os.MkdirAll(generated, 0755))
+
+	// Lay down all seven cluster config files; config.yaml also points the backup destination at backupRoot.
+	require.NoError(t, os.WriteFile(filepath.Join(instanceRoot, "kubeconfig"), []byte("kubeconfig-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(instanceRoot, "config.yaml"), []byte("backup:\n  destination:\n    type: local\n    local:\n      path: "+backupRoot+"\n"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(instanceRoot, "secrets.yaml"), []byte("secrets-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(generated, "talosconfig"), []byte("talosconfig-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(generated, "controlplane.yaml"), []byte("controlplane-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(generated, "worker.yaml"), []byte("worker-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(generated, "secrets.yaml"), []byte("talos-secrets-data"), 0644))
+
+	mgr := NewManager(root)
+	got, err := mgr.BackupClusterConfig(instance)
+	require.NoError(t, err)
+	require.NotNil(t, got)
+
+	assert.Equal(t, "_cluster", got.App)
+	assert.Equal(t, instance, got.Instance)
+	assert.Equal(t, "backed_up", got.Status)
+	assert.Len(t, got.Strategies, 1)
+	assert.Equal(t, "cluster-config", got.Strategies[0].Name)
+	assert.Equal(t, "backed_up", got.Strategies[0].Status)
+
+	// The strategy metadata records how many files went into the archive.
+	fileCount, isInt := got.Strategies[0].Backup["files"].(int)
+	assert.True(t, isInt)
+	assert.Equal(t, 7, fileCount)
+
+	// The recovery plan must also have been written under the backups directory.
+	_, err = os.Stat(filepath.Join(backupRoot, "_cluster", got.Timestamp, "recovery-plan.yaml"))
+	assert.NoError(t, err, "recovery-plan.yaml should exist")
+}
+
+func TestBackupClusterConfigSkipsMissingFiles(t *testing.T) {
+	root := t.TempDir()
+
+	const instance = "test-instance"
+	instanceRoot := filepath.Join(root, "instances", instance)
+	backupRoot := filepath.Join(instanceRoot, "backups")
+
+	require.NoError(t, os.MkdirAll(backupRoot, 0755))
+
+	// Write just kubeconfig and config.yaml; every Talos file is deliberately absent.
+	require.NoError(t, os.WriteFile(filepath.Join(instanceRoot, "kubeconfig"), []byte("kubeconfig-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(instanceRoot, "config.yaml"), []byte("backup:\n  destination:\n    type: local\n    local:\n      path: "+backupRoot+"\n"), 0644))
+
+	mgr := NewManager(root)
+	got, err := mgr.BackupClusterConfig(instance)
+	require.NoError(t, err)
+	require.NotNil(t, got)
+
+	assert.Equal(t, "backed_up", got.Status)
+	fileCount, isInt := got.Strategies[0].Backup["files"].(int)
+	assert.True(t, isInt)
+	assert.Equal(t, 2, fileCount)
+}
+
+func TestBackupClusterConfigIncludesConfigYAML(t *testing.T) {
+	tempDir := t.TempDir()
+
+	instanceName := "test-instance"
+	instanceDir := filepath.Join(tempDir, "instances", instanceName)
+	backupsDir := filepath.Join(instanceDir, "backups")
+
+	require.NoError(t, os.MkdirAll(backupsDir, 0755))
+
+	// config.yaml is needed to resolve the backup destination and is itself one of the backed-up
+	// files, so with only it present the backup must succeed and contain exactly that one file.
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte("backup:\n  destination:\n    type: local\n    local:\n      path: "+backupsDir+"\n"), 0644))
+
+	mgr := NewManager(tempDir)
+	plan, err := mgr.BackupClusterConfig(instanceName)
+	require.NoError(t, err)
+	require.NotNil(t, plan)
+	assert.Equal(t, "backed_up", plan.Status)
+	assert.Equal(t, 1, plan.Strategies[0].Backup["files"])
+}
+
+func TestBackupClusterConfigArchiveContents(t *testing.T) {
+	root := t.TempDir()
+
+	const instance = "test-instance"
+	instanceRoot := filepath.Join(root, "instances", instance)
+	backupRoot := filepath.Join(instanceRoot, "backups")
+	generated := filepath.Join(instanceRoot, "talos", "generated")
+
+	require.NoError(t, os.MkdirAll(backupRoot, 0755))
+	require.NoError(t, os.MkdirAll(generated, 0755))
+
+	// Two top-level files and one nested file, so both kinds of entry name are checked.
+	require.NoError(t, os.WriteFile(filepath.Join(instanceRoot, "kubeconfig"), []byte("kubeconfig-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(instanceRoot, "config.yaml"), []byte("backup:\n  destination:\n    type: local\n    local:\n      path: "+backupRoot+"\n"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(generated, "talosconfig"), []byte("talosconfig-data"), 0644))
+
+	mgr := NewManager(root)
+	got, err := mgr.BackupClusterConfig(instance)
+	require.NoError(t, err)
+
+	// The archive is opened straight from disk at <backups dir>/<location>, where location is
+	// the key recorded in the strategy metadata.
+	location, ok := got.Strategies[0].Backup["location"].(string)
+	require.True(t, ok)
+
+	archive, err := os.Open(filepath.Join(backupRoot, location))
+	require.NoError(t, err)
+	defer archive.Close()
+
+	gz, err := gzip.NewReader(archive)
+	require.NoError(t, err)
+	defer gz.Close()
+
+	tr := tar.NewReader(gz)
+
+	// Collect the name of every entry until the end of the archive.
+	var names []string
+	for hdr, nextErr := tr.Next(); nextErr != io.EOF; hdr, nextErr = tr.Next() {
+		require.NoError(t, nextErr)
+		names = append(names, hdr.Name)
+	}
+
+	// Entries are expected under their path relative to the instance directory.
+	assert.Contains(t, names, "kubeconfig")
+	assert.Contains(t, names, "config.yaml")
+	assert.Contains(t, names, filepath.Join("talos", "generated", "talosconfig"))
+}
+
+func TestBackupClusterConfigListAndDelete(t *testing.T) {
+	root := t.TempDir()
+
+	const instance = "test-instance"
+	instanceRoot := filepath.Join(root, "instances", instance)
+	backupRoot := filepath.Join(instanceRoot, "backups")
+
+	require.NoError(t, os.MkdirAll(backupRoot, 0755))
+
+	require.NoError(t, os.WriteFile(filepath.Join(instanceRoot, "kubeconfig"), []byte("kubeconfig-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(instanceRoot, "config.yaml"), []byte("backup:\n  destination:\n    type: local\n    local:\n      path: "+backupRoot+"\n"), 0644))
+
+	mgr := NewManager(root)
+
+	// Backup timestamps have one-second resolution, so wait just over a second between the two
+	// backups to get distinct ones.
+	older, err := mgr.BackupClusterConfig(instance)
+	require.NoError(t, err)
+	time.Sleep(1100 * time.Millisecond)
+	newer, err := mgr.BackupClusterConfig(instance)
+	require.NoError(t, err)
+
+	// Both backups are listed, newest first.
+	listed, err := mgr.ListBackups(instance, "_cluster")
+	require.NoError(t, err)
+	assert.Len(t, listed, 2)
+	assert.Equal(t, newer.Timestamp, listed[0].Timestamp)
+	assert.Equal(t, older.Timestamp, listed[1].Timestamp)
+
+	// Remove the older backup.
+	require.NoError(t, mgr.DeleteAppBackup(instance, "_cluster", older.Timestamp))
+
+	// Only the newer backup remains.
+	listed, err = mgr.ListBackups(instance, "_cluster")
+	require.NoError(t, err)
+	assert.Len(t, listed, 1)
+	assert.Equal(t, newer.Timestamp, listed[0].Timestamp)
+}
--- a/api/internal/backup/config_loader.go
+++ b/api/internal/backup/config_loader.go
@@ -2,6 +2,7 @@ package backup

 import (
 	"fmt"
+	"log/slog"
 	"os"
 	"path/filepath"

@@ -68,7 +69,7 @@ func LoadInstanceBackupConfig(dataDir, instanceName string) (*BackupConfiguratio
 	// Load credentials from secrets.yaml if needed
 	if err := loadBackupSecrets(dataDir, instanceName, config); err != nil {
 		// Secrets are optional, log but don't fail
-		fmt.Printf("Warning: failed to load backup secrets: %v\n", err)
+		slog.Error("failed to load backup secrets", "component", "backup", "error", err)
 	}

 	return config, nil
@@ -120,6 +121,63 @@ func SaveInstanceBackupSchedules(dataDir, instanceName string, schedules []Backu
 	return nil
 }

+// SaveInstanceBackupConfig writes the destination and retention sections of the instance's backup config;
+// a nil dest or retention leaves that section unchanged. Schedules are managed separately via SaveInstanceBackupSchedules.
+func SaveInstanceBackupConfig(dataDir, instanceName string, dest *DestinationConfig, retention *RetentionPolicy) error {
+	configPath := tools.GetInstanceConfigPath(dataDir, instanceName)
+	data, err := os.ReadFile(configPath)
+	if err != nil {
+		return fmt.Errorf("failed to read config: %w", err)
+	}
+
+	var root map[string]interface{}
+	if err := yaml.Unmarshal(data, &root); err != nil {
+		return fmt.Errorf("failed to parse config: %w", err)
+	}
+	if root == nil {
+		root = make(map[string]interface{}) // an empty config.yaml leaves root nil; assigning into it would panic
+	}
+
+	backupSection, ok := root["backup"].(map[string]interface{})
+	if !ok {
+		backupSection = make(map[string]interface{})
+		root["backup"] = backupSection
+	}
+
+	if dest != nil {
+		destData, err := yaml.Marshal(dest)
+		if err != nil {
+			return fmt.Errorf("failed to marshal destination: %w", err)
+		}
+		var destGeneric interface{}
+		if err := yaml.Unmarshal(destData, &destGeneric); err != nil {
+			return fmt.Errorf("failed to unmarshal destination: %w", err)
+		}
+		backupSection["destination"] = destGeneric
+	}
+
+	if retention != nil {
+		retData, err := yaml.Marshal(retention)
+		if err != nil {
+			return fmt.Errorf("failed to marshal retention: %w", err)
+		}
+		var retGeneric interface{}
+		if err := yaml.Unmarshal(retData, &retGeneric); err != nil {
+			return fmt.Errorf("failed to unmarshal retention: %w", err)
+		}
+		backupSection["retention"] = retGeneric
+	}
+
+	out, err := yaml.Marshal(root)
+	if err != nil {
+		return fmt.Errorf("failed to marshal config: %w", err)
+	}
+	if err := os.WriteFile(configPath, out, 0644); err != nil {
+		return fmt.Errorf("failed to write config: %w", err)
+	}
+	return nil
+}
+
 // loadBackupSecrets loads backup credentials from instance secrets.yaml
 func loadBackupSecrets(dataDir, instanceName string, config *BackupConfiguration) error {
 	secretsPath := filepath.Join(dataDir, "instances", instanceName, "secrets.yaml")
@@ -160,4 +218,4 @@ func loadBackupSecrets(dataDir, instanceName string, config *BackupConfiguration
 	}

 	return nil
-}
+}
--- a/api/internal/backup/destinations/azure.go
+++ b/api/internal/backup/destinations/azure.go
@@ -67,7 +67,7 @@ func (a *AzureDestination) Put(key string, reader io.Reader) (int64, error) {
 		blobURL,
 		azblob.UploadStreamToBlockBlobOptions{
 			BufferSize: 4 * 1024 * 1024, // 4MB buffer
-			MaxBuffers: 3,                // Limited for Raspberry Pi
+			MaxBuffers: 3,               // Limited for Raspberry Pi
 		},
 	)

@@ -208,4 +208,4 @@ func (a *AzureDestination) getCredential() azblob.StorageAccountCredential {
 	// as a field in the struct during initialization
 	// For now, return nil which means the SAS generation might fail
 	return nil
-}
+}
--- a/api/internal/backup/destinations/local.go
+++ b/api/internal/backup/destinations/local.go
@@ -3,6 +3,7 @@ package destinations
 import (
 	"fmt"
 	"io"
+	"log/slog"
 	"os"
 	"path/filepath"
 	"time"
@@ -110,7 +111,7 @@ func (l *LocalDestination) List(prefix string) ([]btypes.BackupObject, error) {
 	err := filepath.Walk(searchPath, func(path string, info os.FileInfo, err error) error {
 		if err != nil {
 			// Log error but continue walking
-			fmt.Printf("Warning: error walking path %s: %v\n", path, err)
+			slog.Error("error walking path", "component", "local", "path", path, "error", err)
 			return nil
 		}

@@ -190,4 +191,4 @@ func (l *LocalDestination) Cleanup(retention btypes.RetentionPolicy) error {
 	// This could implement retention policy enforcement
 	// For now, it's a no-op
 	return nil
-}
+}
--- a/api/internal/backup/destinations/local_test.go
+++ b/api/internal/backup/destinations/local_test.go
@@ -238,14 +238,14 @@ func TestLocalDestination_List(t *testing.T) {
 		require.NoError(t, os.WriteFile(fullPath, content, 0644))
 		// Set specific mod time for testing
 		modTime := time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC)
-		os.Chtimes(fullPath, modTime, modTime)
+		require.NoError(t, os.Chtimes(fullPath, modTime, modTime))
 	}

 	tests := []struct {
-		name         string
-		prefix       string
-		expectCount  int
-		expectKeys   []string
+		name        string
+		prefix      string
+		expectCount int
+		expectKeys  []string
 	}{
 		{
 			name:        "list all",
@@ -357,4 +357,4 @@ func TestLocalDestination_GetDiskUsage(t *testing.T) {
 	usage, err = dest.GetDiskUsage()
 	assert.NoError(t, err)
 	assert.Equal(t, totalSize, usage)
-}
+}
--- a/api/internal/backup/destinations/nfs.go
+++ b/api/internal/backup/destinations/nfs.go
@@ -3,6 +3,7 @@ package destinations
 import (
 	"fmt"
 	"io"
+	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -29,6 +30,11 @@ func NewNFSDestination(cfg *btypes.NFSConfig) (*NFSDestination, error) {
 		mountPath = filepath.Join("/mnt/backup", strings.ReplaceAll(cfg.Server, ".", "-"), strings.ReplaceAll(cfg.Path, "/", "-"))
 	}

+	// Recover stale mount points (common after reboots or NFS server restarts)
+	if err := recoverStaleMountPoint(mountPath); err != nil {
+		return nil, fmt.Errorf("failed to recover stale mount point %s: %w", mountPath, err)
+	}
+
 	// Ensure mount point exists
 	if err := os.MkdirAll(mountPath, 0755); err != nil {
 		return nil, fmt.Errorf("failed to create mount point: %w", err)
@@ -53,13 +59,55 @@ func NewNFSDestination(cfg *btypes.NFSConfig) (*NFSDestination, error) {

 		output, err := cmd.CombinedOutput()
 		if err != nil {
-			return nil, fmt.Errorf("failed to mount NFS share: %w, output: %s", err, string(output))
+			return nil, fmt.Errorf("failed to mount NFS share %s:%s at %s: %w, output: %s",
+				cfg.Server, cfg.Path, mountPath, err, string(output))
 		}
 	}

 	return dest, nil
 }

+// recoverStaleMountPoint makes mountPath usable again when os.Stat fails for a reason other
+// than "not exist" (e.g. a stale NFS handle): it runs "sudo umount -l", then "-f", and finally
+// removes the directory. It returns nil once the path is accessible or absent, else an error.
+func recoverStaleMountPoint(mountPath string) error {
+	_, err := os.Stat(mountPath)
+	if err == nil {
+		// Path is accessible, nothing to recover
+		return nil
+	}
+	if os.IsNotExist(err) {
+		// Doesn't exist yet, nothing to recover
+		return nil
+	}
+
+	// Path exists but is inaccessible (stale file handle, transport endpoint not connected, etc.)
+	slog.Info("detected stale mount, attempting recovery", "component", "nfs", "mountPath", mountPath, "error", err)
+
+	// Try a lazy unmount first, then fall back to a force unmount if the path is still unusable
+	for _, flags := range [][]string{{"-l"}, {"-f"}} {
+		args := append([]string{"umount"}, flags...)
+		args = append(args, mountPath)
+		cmd := exec.Command("sudo", args...)
+		if output, umountErr := cmd.CombinedOutput(); umountErr != nil {
+			slog.Error("umount failed", "component", "nfs", "flags", flags, "mountPath", mountPath, "error", umountErr, "output", strings.TrimSpace(string(output)))
+		} else {
+			slog.Info("successfully unmounted stale mount", "component", "nfs", "mountPath", mountPath)
+			// After unmount, the directory might still exist but should be accessible now
+			if _, statErr := os.Stat(mountPath); statErr == nil || os.IsNotExist(statErr) {
+				return nil
+			}
+		}
+	}
+
+	// Last resort: remove the mount point so the caller can recreate it
+	if rmErr := os.Remove(mountPath); rmErr != nil {
+		return fmt.Errorf("stale mount at %s could not be recovered (unmount and remove both failed): %w (remove: %v)", mountPath, err, rmErr)
+	}
+	slog.Info("removed stale mount point, will recreate", "component", "nfs", "mountPath", mountPath)
+	return nil
+}
+
 // Put uploads data to NFS, returns size written
 func (n *NFSDestination) Put(key string, reader io.Reader) (int64, error) {
 	fullPath := filepath.Join(n.mountPath, key)
@@ -185,4 +233,4 @@ func (n *NFSDestination) Cleanup() error {
 		}
 	}
 	return nil
-}
+}
--- a/api/internal/backup/destinations/s3.go
+++ b/api/internal/backup/destinations/s3.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"io"
+	"log/slog"
 	"time"

 	"github.com/aws/aws-sdk-go-v2/aws"
@@ -59,15 +60,16 @@ func (s *S3Destination) Put(key string, reader io.Reader) (int64, error) {
 	fullKey := s.getFullKey(key)

 	// Use S3 manager for efficient multipart uploads
-	uploader := manager.NewUploader(s.client, func(u *manager.Uploader) {
+	// TODO: migrate to feature/s3/transfermanager when stable
+	uploader := manager.NewUploader(s.client, func(u *manager.Uploader) { //nolint:staticcheck
 		u.PartSize = 10 * 1024 * 1024 // 10MB parts
-		u.Concurrency = 3              // Limited concurrency for Raspberry Pi
+		u.Concurrency = 3             // Limited concurrency for Raspberry Pi
 	})

 	// Create a custom reader that tracks bytes read
 	trackingReader := &sizeTrackingReader{reader: reader}

-	result, err := uploader.Upload(context.Background(), &s3.PutObjectInput{
+	result, err := uploader.Upload(context.Background(), &s3.PutObjectInput{ //nolint:staticcheck
 		Bucket: aws.String(s.bucket),
 		Key:    aws.String(fullKey),
 		Body:   trackingReader,
@@ -78,7 +80,7 @@ func (s *S3Destination) Put(key string, reader io.Reader) (int64, error) {
 	}

 	// Log the ETag for verification
-	fmt.Printf("Uploaded to S3: %s (ETag: %s)\n", fullKey, *result.ETag)
+	slog.Info("uploaded to S3", "component", "s3", "key", fullKey, "etag", *result.ETag)

 	return trackingReader.bytesRead, nil
 }
@@ -195,4 +197,4 @@ func (r *sizeTrackingReader) Read(p []byte) (int, error) {
 	n, err := r.reader.Read(p)
 	r.bytesRead += int64(n)
 	return n, err
-}
+}
--- a/api/internal/backup/retention.go
+++ b/api/internal/backup/retention.go
@@ -2,7 +2,7 @@ package backup

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"time"

 	btypes "github.com/wild-cloud/wild-central/daemon/internal/backup/types"
@@ -50,7 +50,7 @@ func EnforceRetention(mgr *Manager, instanceName, appName string, keepLast, keep

 		// Both policies say delete
 		if err := mgr.DeleteAppBackup(instanceName, appName, plan.Timestamp); err != nil {
-			log.Printf("Retention: failed to delete backup %s/%s/%s: %v", instanceName, appName, plan.Timestamp, err)
+			slog.Error("failed to delete backup", "component", "backup", "instance", instanceName, "app", appName, "timestamp", plan.Timestamp, "error", err)
 			continue
 		}
 		deleted++
--- a/api/internal/backup/retention_test.go
+++ b/api/internal/backup/retention_test.go
@@ -137,7 +137,9 @@ func TestEnforceRetention(t *testing.T) {

 	// Create instance config with local destination
 	instanceDir := filepath.Join(tmpDir, "instances", instanceName)
-	os.MkdirAll(instanceDir, 0755)
+	if err := os.MkdirAll(instanceDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	config := map[string]any{
 		"backup": map[string]any{
@@ -150,10 +152,14 @@ func TestEnforceRetention(t *testing.T) {
 		},
 	}
 	configData, _ := yaml.Marshal(config)
-	os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644)
+	if err := os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644); err != nil {
+		t.Fatal(err)
+	}

 	backupDir := filepath.Join(instanceDir, "backups", appName)
-	os.MkdirAll(backupDir, 0755)
+	if err := os.MkdirAll(backupDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	// Create test backup plans with different timestamps
 	now := time.Now().UTC()
@@ -162,7 +168,7 @@ func TestEnforceRetention(t *testing.T) {
 		age    time.Duration
 		status string
 	}{
-		{now.Format("20060102T150405Z"), 0, "backed_up"},                                  // newest
+		{now.Format("20060102T150405Z"), 0, "backed_up"},                                   // newest
 		{now.Add(-24 * time.Hour).Format("20060102T150405Z"), 24 * time.Hour, "backed_up"}, // 1 day old
 		{now.Add(-48 * time.Hour).Format("20060102T150405Z"), 48 * time.Hour, "backed_up"}, // 2 days old
 		{now.Add(-72 * time.Hour).Format("20060102T150405Z"), 72 * time.Hour, "backed_up"}, // 3 days old
@@ -171,7 +177,9 @@ func TestEnforceRetention(t *testing.T) {

 	for _, ts := range timestamps {
 		planDir := filepath.Join(backupDir, ts.ts)
-		os.MkdirAll(planDir, 0755)
+		if err := os.MkdirAll(planDir, 0755); err != nil {
+			t.Fatal(err)
+		}

 		plan := btypes.RecoveryPlan{
 			App:       appName,
@@ -180,7 +188,9 @@ func TestEnforceRetention(t *testing.T) {
 			Status:    ts.status,
 		}
 		planData, _ := yaml.Marshal(plan)
-		os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644)
+		if err := os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644); err != nil {
+			t.Fatal(err)
+		}
 	}

 	mgr := NewManager(tmpDir)
@@ -212,7 +222,9 @@ func TestEnforceRetentionSkipsActiveBackups(t *testing.T) {
 	appName := "test-app"

 	instanceDir := filepath.Join(tmpDir, "instances", instanceName)
-	os.MkdirAll(instanceDir, 0755)
+	if err := os.MkdirAll(instanceDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	config := map[string]any{
 		"backup": map[string]any{
@@ -225,10 +237,14 @@ func TestEnforceRetentionSkipsActiveBackups(t *testing.T) {
 		},
 	}
 	configData, _ := yaml.Marshal(config)
-	os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644)
+	if err := os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644); err != nil {
+		t.Fatal(err)
+	}

 	backupDir := filepath.Join(instanceDir, "backups", appName)
-	os.MkdirAll(backupDir, 0755)
+	if err := os.MkdirAll(backupDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	now := time.Now().UTC()
 	backups := []struct {
@@ -243,7 +259,9 @@ func TestEnforceRetentionSkipsActiveBackups(t *testing.T) {

 	for _, b := range backups {
 		planDir := filepath.Join(backupDir, b.ts)
-		os.MkdirAll(planDir, 0755)
+		if err := os.MkdirAll(planDir, 0755); err != nil {
+			t.Fatal(err)
+		}

 		plan := btypes.RecoveryPlan{
 			App:       appName,
@@ -252,7 +270,9 @@ func TestEnforceRetentionSkipsActiveBackups(t *testing.T) {
 			Status:    b.status,
 		}
 		planData, _ := yaml.Marshal(plan)
-		os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644)
+		if err := os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644); err != nil {
+			t.Fatal(err)
+		}
 	}

 	mgr := NewManager(tmpDir)
@@ -281,7 +301,9 @@ func TestEnforceRetentionKeepDaysPreservesRecent(t *testing.T) {
 	appName := "test-app"

 	instanceDir := filepath.Join(tmpDir, "instances", instanceName)
-	os.MkdirAll(instanceDir, 0755)
+	if err := os.MkdirAll(instanceDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	config := map[string]any{
 		"backup": map[string]any{
@@ -294,10 +316,14 @@ func TestEnforceRetentionKeepDaysPreservesRecent(t *testing.T) {
 		},
 	}
 	configData, _ := yaml.Marshal(config)
-	os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644)
+	if err := os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644); err != nil {
+		t.Fatal(err)
+	}

 	backupDir := filepath.Join(instanceDir, "backups", appName)
-	os.MkdirAll(backupDir, 0755)
+	if err := os.MkdirAll(backupDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	now := time.Now().UTC()
 	// 5 backups: newest, 1h old, 2h old, 3h old, 25h old
@@ -306,7 +332,9 @@ func TestEnforceRetentionKeepDaysPreservesRecent(t *testing.T) {
 	for _, offset := range timestamps {
 		ts := now.Add(-offset).Format("20060102T150405Z")
 		planDir := filepath.Join(backupDir, ts)
-		os.MkdirAll(planDir, 0755)
+		if err := os.MkdirAll(planDir, 0755); err != nil {
+			t.Fatal(err)
+		}

 		plan := btypes.RecoveryPlan{
 			App:       appName,
@@ -315,7 +343,9 @@ func TestEnforceRetentionKeepDaysPreservesRecent(t *testing.T) {
 			Status:    "backed_up",
 		}
 		planData, _ := yaml.Marshal(plan)
-		os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644)
+		if err := os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644); err != nil {
+			t.Fatal(err)
+		}
 	}

 	mgr := NewManager(tmpDir)
--- a/api/internal/backup/scheduler.go
+++ b/api/internal/backup/scheduler.go
@@ -3,7 +3,7 @@ package backup
 import (
 	"context"
 	"fmt"
-	"log"
+	"log/slog"
 	"sync"
 	"time"

@@ -36,7 +36,7 @@ func (s *Scheduler) Start() {
 	s.cancel = cancel

 	go s.loop(ctx)
-	log.Println("Backup scheduler started")
+	slog.Info("backup scheduler started", "component", "scheduler")
 }

 // Stop shuts down the scheduler
@@ -112,26 +112,36 @@ func (s *Scheduler) runSchedule(instanceName string, sched *btypes.BackupSchedul
 		s.mu.Unlock()
 	}()

-	log.Printf("Scheduler: running backup for %s/%s (schedule: %s)", instanceName, sched.TargetName, sched.Name)
+	slog.Info("running scheduled backup", "component", "scheduler", "instance", instanceName, "target", sched.TargetName, "schedule", sched.Name)

 	mgr := NewManager(s.dataDir)

-	if sched.TargetType == "app" {
-		_, err := mgr.BackupApp(instanceName, sched.TargetName)
-		if err != nil {
-			log.Printf("Scheduler: backup failed for %s/%s: %v", instanceName, sched.TargetName, err)
-		} else {
-			// Enforce retention after successful backup
-			keepLast, keepDays := RetentionFromSchedule(sched, config.Retention)
-			deleted, retErr := EnforceRetention(mgr, instanceName, sched.TargetName, keepLast, keepDays)
-			if retErr != nil {
-				log.Printf("Scheduler: retention enforcement failed for %s/%s: %v", instanceName, sched.TargetName, retErr)
-			} else if deleted > 0 {
-				log.Printf("Scheduler: retention cleaned up %d old backups for %s/%s", deleted, instanceName, sched.TargetName)
-			}
+	var backupErr error
+	var retentionTarget string
+
+	switch sched.TargetType {
+	case "app":
+		retentionTarget = sched.TargetName
+		_, backupErr = mgr.BackupApp(instanceName, sched.TargetName)
+	case "cluster":
+		retentionTarget = "_cluster"
+		_, backupErr = mgr.BackupClusterConfig(instanceName)
+	default:
+		slog.Error("unknown schedule target type", "component", "scheduler", "instance", instanceName, "targetType", sched.TargetType)
+		return
+	}
+
+	if backupErr != nil {
+		slog.Error("scheduled backup failed", "component", "scheduler", "instance", instanceName, "target", retentionTarget, "error", backupErr)
+	} else {
+		keepLast, keepDays := RetentionFromSchedule(sched, config.Retention)
+		deleted, retErr := EnforceRetention(mgr, instanceName, retentionTarget, keepLast, keepDays)
+		if retErr != nil {
+			slog.Error("retention enforcement failed", "component", "scheduler", "instance", instanceName, "target", retentionTarget, "error", retErr)
+		} else if deleted > 0 {
+			slog.Info("retention cleaned up old backups", "component", "scheduler", "instance", instanceName, "target", retentionTarget, "deleted", deleted)
 		}
 	}
-	// TODO: cluster backup support

 	// Update lastRun and nextRun
 	now := time.Now()
@@ -144,7 +154,7 @@ func (s *Scheduler) runSchedule(instanceName string, sched *btypes.BackupSchedul

 func (s *Scheduler) saveSchedules(instanceName string, config *BackupConfiguration) {
 	if err := SaveInstanceBackupSchedules(s.dataDir, instanceName, config.Schedules); err != nil {
-		log.Printf("Scheduler: failed to save schedules for %s: %v", instanceName, err)
+		slog.Error("failed to save schedules", "component", "scheduler", "instance", instanceName, "error", err)
 	}
 }

--- a/api/internal/backup/scheduler_test.go
+++ b/api/internal/backup/scheduler_test.go
@@ -20,7 +20,7 @@ func TestParseTime(t *testing.T) {
 		{"14:30", 14, 30},
 		{"00:00", 0, 0},
 		{"23:59", 23, 59},
-		{"", 2, 0},       // default
+		{"", 2, 0},        // default
 		{"invalid", 2, 0}, // default
 		{"25:00", 25, 0},  // parses but invalid hour (not our concern here)
 	}
@@ -148,10 +148,14 @@ func TestSaveInstanceBackupSchedules(t *testing.T) {
 	dataDir := t.TempDir()
 	instanceName := "test-instance"
 	instanceDir := filepath.Join(dataDir, "instances", instanceName)
-	os.MkdirAll(instanceDir, 0755)
+	if err := os.MkdirAll(instanceDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	configPath := filepath.Join(instanceDir, "config.yaml")
-	os.WriteFile(configPath, []byte("cloud:\n  domain: test.local\n"), 0644)
+	if err := os.WriteFile(configPath, []byte("cloud:\n  domain: test.local\n"), 0644); err != nil {
+		t.Fatal(err)
+	}

 	now := time.Now()
 	schedules := []BackupSchedule{
@@ -180,7 +184,9 @@ func TestSaveInstanceBackupSchedules(t *testing.T) {
 	}

 	var root map[string]interface{}
-	yaml.Unmarshal(data, &root)
+	if err := yaml.Unmarshal(data, &root); err != nil {
+		t.Fatalf("Unmarshal error = %v", err)
+	}

 	// Verify cloud.domain is preserved
 	cloud, ok := root["cloud"].(map[string]interface{})
--- a/api/internal/backup/strategies/config.go
+++ b/api/internal/backup/strategies/config.go
@@ -425,7 +425,7 @@ func (c *ConfigStrategy) mergeConfig(reader io.Reader, instancePath, appName str

 	var config map[string]interface{}
 	if data, err := os.ReadFile(configPath); err == nil {
-		yaml.Unmarshal(data, &config)
+		_ = yaml.Unmarshal(data, &config)
 	}
 	if config == nil {
 		config = make(map[string]interface{})
@@ -461,7 +461,7 @@ func (c *ConfigStrategy) mergeSecrets(reader io.Reader, instancePath, appName st

 	var secrets map[string]interface{}
 	if data, err := os.ReadFile(secretsPath); err == nil {
-		yaml.Unmarshal(data, &secrets)
+		_ = yaml.Unmarshal(data, &secrets)
 	}
 	if secrets == nil {
 		secrets = make(map[string]interface{})
--- a/api/internal/backup/strategies/longhorn_native.go
+++ b/api/internal/backup/strategies/longhorn_native.go
@@ -1,9 +1,8 @@
 package strategies

 import (
-	"bytes"
-	"encoding/json"
 	"fmt"
+	"log/slog"
 	"os/exec"
 	"strings"
 	"time"
@@ -32,30 +31,6 @@ func (l *LonghornNativeStrategy) Name() string {
 	return "longhorn-native"
 }

-// LonghornBackup represents a Longhorn Backup CRD
-type LonghornBackup struct {
-	APIVersion string `json:"apiVersion"`
-	Kind       string `json:"kind"`
-	Metadata   struct {
-		Name      string            `json:"name"`
-		Namespace string            `json:"namespace"`
-		Labels    map[string]string `json:"labels"`
-	} `json:"metadata"`
-	Spec struct {
-		SnapshotName string            `json:"snapshotName"`
-		Labels       map[string]string `json:"labels"`
-	} `json:"spec"`
-	Status struct {
-		State           string            `json:"state"`
-		Progress        int               `json:"progress"`
-		URL             string            `json:"url"`
-		VolumeSize      string            `json:"volumeSize"`
-		VolumeCreatedAt string            `json:"volumeCreatedAt"`
-		Messages        map[string]string `json:"messages"`
-		Error           string            `json:"error"`
-	} `json:"status"`
-}
-
 // Backup creates Longhorn native backups of all PVCs for an app, writing results to the plan
 func (l *LonghornNativeStrategy) Backup(plan *btypes.RecoveryPlan, dest btypes.BackupDestination) error {
 	entry := plan.GetStrategyEntry("longhorn-native")
@@ -129,7 +104,9 @@ func (l *LonghornNativeStrategy) Backup(plan *btypes.RecoveryPlan, dest btypes.B
 			"backupURL": backupURL,
 		})

-		l.cleanupOldBackups(kubeconfigPath, volumeName, backupID)
+		if err := l.cleanupOldBackups(kubeconfigPath, volumeName, backupID); err != nil {
+			slog.Error("failed to clean up old backups", "component", "longhorn", "volume", volumeName, "error", err)
+		}
 	}

 	// Record in plan
@@ -164,11 +141,6 @@ func (l *LonghornNativeStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.
 		return nil
 	}

-	apiURL, err := l.getLonghornAPIEndpoint(kubeconfigPath)
-	if err != nil {
-		return fmt.Errorf("failed to get Longhorn API endpoint: %w", err)
-	}
-
 	restoreVolumes := []map[string]any{}

 	for _, bv := range backupVolumes {
@@ -205,7 +177,7 @@ func (l *LonghornNativeStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.
 		// Create colored restore volume name
 		restoreVolumeName := fmt.Sprintf("%s-%s", pvcName, plan.StandbyColor)

-		if err := l.createVolumeFromBackup(kubeconfigPath, apiURL, restoreVolumeName, backupURL, pvcSize); err != nil {
+		if err := l.createVolumeFromBackup(kubeconfigPath, restoreVolumeName, backupURL, pvcSize); err != nil {
 			return fmt.Errorf("failed to create volume from backup for %s: %w", pvcName, err)
 		}

@@ -215,7 +187,7 @@ func (l *LonghornNativeStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.
 			standbyNamespace = plan.App + "-" + plan.StandbyColor
 		}
 		if err := l.createPVForVolume(kubeconfigPath, restoreVolumeName, pvcSize, accessMode, standbyNamespace, pvcName); err != nil {
-			fmt.Printf("Warning: failed to create PV for volume %s: %v\n", restoreVolumeName, err)
+			slog.Error("failed to create PV for volume", "component", "longhorn", "volume", restoreVolumeName, "error", err)
 		}

 		restoreVolumes = append(restoreVolumes, map[string]any{
@@ -287,27 +259,33 @@ func (l *LonghornNativeStrategy) Verify(plan *btypes.RecoveryPlan, dest btypes.B
 		return nil
 	}

-	apiURL, err := l.getLonghornAPIEndpoint(kubeconfigPath)
-	if err != nil {
-		return fmt.Errorf("failed to get Longhorn API endpoint: %w", err)
+	// Verify backup target is accessible
+	if err := l.checkBackupTarget(kubeconfigPath); err != nil {
+		return fmt.Errorf("backup target not accessible: %w", err)
 	}

+	// Verify each backup CRD still exists
 	for _, bv := range backupVolumes {
 		backup, ok := bv.(map[string]any)
 		if !ok {
 			continue
 		}

-		backupURL, _ := backup["backupURL"].(string)
-		if backupURL == "" {
+		backupID, _ := backup["backupID"].(string)
+		if backupID == "" {
 			continue
 		}

-		url := fmt.Sprintf("%s/v1/volumes", apiURL)
-		cmd := exec.Command("curl", "-s", "-o", "/dev/null", "-w", "%{http_code}", url)
+		cmd := exec.Command("kubectl", "get", "backups.longhorn.io", backupID,
+			"-n", "longhorn-system", "-o", "jsonpath={.status.state}")
+		tools.WithKubeconfig(cmd, kubeconfigPath)
+
 		output, err := cmd.Output()
-		if err != nil || string(output) != "200" {
-			return fmt.Errorf("Longhorn API not accessible")
+		if err != nil {
+			return fmt.Errorf("backup %s not found: %w", backupID, err)
+		}
+		if string(output) != "Completed" {
+			return fmt.Errorf("backup %s is not in Completed state: %s", backupID, string(output))
 		}
 	}

@@ -322,7 +300,7 @@ func (l *LonghornNativeStrategy) backupVolumeWithRetry(kubeconfigPath, appName,
 		snapshotName := strings.ToLower(fmt.Sprintf("%s-%s-snapshot-%s", appName, pvcName, timestamp))
 		if attempt > 0 {
 			snapshotName = strings.ToLower(fmt.Sprintf("%s-%s-snapshot-%s-retry%d", appName, pvcName, timestamp, attempt))
-			fmt.Printf("Retrying backup for volume %s (attempt %d/%d)...\n", volumeName, attempt+1, maxAttempts)
+			slog.Info("retrying backup for volume", "component", "longhorn", "volume", volumeName, "attempt", attempt+1, "maxAttempts", maxAttempts)
 			time.Sleep(10 * time.Second)
 		}

@@ -418,147 +396,117 @@ func (l *LonghornNativeStrategy) getVolumeNameFromPVC(kubeconfigPath, namespace,
 	return volumeName, nil
 }

-func (l *LonghornNativeStrategy) getLonghornAPIEndpoint(kubeconfigPath string) (string, error) {
-	checkCmd := exec.Command("curl", "-s", "-o", "/dev/null", "-w", "%{http_code}", "http://localhost:8080/v1/volumes")
-	if err := checkCmd.Run(); err == nil {
-		return "http://localhost:8080", nil
-	}
-
-	cmd := exec.Command("kubectl", "port-forward", "-n", "longhorn-system", "service/longhorn-frontend", "8080:80")
-	tools.WithKubeconfig(cmd, kubeconfigPath)
-
-	if err := cmd.Start(); err != nil {
-		return "", fmt.Errorf("failed to start port-forward: %w", err)
-	}
-
-	time.Sleep(3 * time.Second)
-
-	verifyCmd := exec.Command("curl", "-s", "-o", "/dev/null", "-w", "%{http_code}", "http://localhost:8080/v1/volumes")
-	if err := verifyCmd.Run(); err != nil {
-		return "", fmt.Errorf("port-forward not responding after setup: %w", err)
-	}
-
-	return "http://localhost:8080", nil
-}
-
 func (l *LonghornNativeStrategy) createSnapshot(kubeconfigPath, volumeName, snapshotName string) error {
-	apiURL, err := l.getLonghornAPIEndpoint(kubeconfigPath)
-	if err != nil {
-		return err
+	snapshotYAML := fmt.Sprintf(`apiVersion: longhorn.io/v1beta2
+kind: Snapshot
+metadata:
+  name: %s
+  namespace: longhorn-system
+spec:
+  volume: %s
+  createSnapshot: true
+`, snapshotName, volumeName)
+
+	cmd := exec.Command("kubectl", "apply", "-f", "-")
+	tools.WithKubeconfig(cmd, kubeconfigPath)
+	cmd.Stdin = strings.NewReader(snapshotYAML)
+
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to create snapshot: %w, output: %s", err, string(output))
 	}
 
-	url := fmt.Sprintf("%s/v1/volumes/%s?action=snapshotCreate", apiURL, volumeName)
-	payload := fmt.Sprintf(`{"name":"%s"}`, snapshotName)
+	// Poll .status.readyToUse up to 30 times, 2s apart; kubectl errors count as "not ready yet"
+	for range 30 {
+		cmd := exec.Command("kubectl", "get", "snapshots.longhorn.io", snapshotName,
+			"-n", "longhorn-system", "-o", "jsonpath={.status.readyToUse}")
+		tools.WithKubeconfig(cmd, kubeconfigPath)
 
-	cmd := exec.Command("curl", "-X", "POST", url,
-		"-H", "Content-Type: application/json",
-		"-d", payload, "-s")
-
-	if err := cmd.Run(); err != nil {
-		return fmt.Errorf("failed to create snapshot: %w", err)
+		output, err := cmd.Output()
+		if err == nil && string(output) == "true" {
+			return nil
+		}
+		time.Sleep(2 * time.Second)
 	}
 
-	time.Sleep(2 * time.Second)
-	return nil
+	return fmt.Errorf("timeout waiting for snapshot %s to be ready", snapshotName)
 }

 func (l *LonghornNativeStrategy) createBackup(kubeconfigPath, volumeName, snapshotName string) (string, error) {
-	apiURL, err := l.getLonghornAPIEndpoint(kubeconfigPath)
-	if err != nil {
-		return "", err
+	// Derive a unique Backup name from the snapshot name: no "_", at most 63 chars, no trailing "-" or "."
+	backupName := strings.ReplaceAll(snapshotName, "_", "-")
+	if len(backupName) > 63 {
+		backupName = strings.TrimRight(backupName[:63], "-.")
 	}
 
-	url := fmt.Sprintf("%s/v1/volumes/%s?action=snapshotBackup", apiURL, volumeName)
-	payload := fmt.Sprintf(`{"name":"%s"}`, snapshotName)
+	backupYAML := fmt.Sprintf(`apiVersion: longhorn.io/v1beta2
+kind: Backup
+metadata:
+  name: %s
+  namespace: longhorn-system
+  labels:
+    backup-volume: %s
+spec:
+  snapshotName: %s
+`, backupName, volumeName, snapshotName)
 
-	cmd := exec.Command("curl", "-X", "POST", url,
-		"-H", "Content-Type: application/json",
-		"-d", payload, "-s")
+	cmd := exec.Command("kubectl", "apply", "-f", "-")
+	tools.WithKubeconfig(cmd, kubeconfigPath)
+	cmd.Stdin = strings.NewReader(backupYAML)
 
-	output, err := cmd.Output()
-	if err != nil {
-		return "", fmt.Errorf("failed to create backup: %w", err)
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return "", fmt.Errorf("failed to create backup: %w, output: %s", err, string(output))
 	}
 
-	var response map[string]any
-	if err := json.Unmarshal(output, &response); err != nil {
-		return "", fmt.Errorf("failed to parse backup response: %w", err)
-	}
-
-	if backupStatus, ok := response["backupStatus"].([]any); ok {
-		// Find the backup entry matching our snapshot
-		for _, bs := range backupStatus {
-			if status, ok := bs.(map[string]any); ok {
-				if snap, _ := status["snapshot"].(string); snap == snapshotName {
-					if id, ok := status["id"].(string); ok {
-						return id, nil
-					}
-				}
-			}
-		}
-		// Fallback: find any entry without an error (new backup in progress)
-		for _, bs := range backupStatus {
-			if status, ok := bs.(map[string]any); ok {
-				if errMsg, _ := status["error"].(string); errMsg == "" {
-					if id, ok := status["id"].(string); ok {
-						return id, nil
-					}
-				}
-			}
-		}
-	}
-
-	return "", fmt.Errorf("backup ID not found in response for snapshot %s", snapshotName)
+	return backupName, nil
 }

-func (l *LonghornNativeStrategy) waitForBackupComplete(kubeconfigPath, volumeName, backupID string) (string, error) {
-	apiURL, err := l.getLonghornAPIEndpoint(kubeconfigPath)
-	if err != nil {
-		return "", err
-	}
-
+func (l *LonghornNativeStrategy) waitForBackupComplete(kubeconfigPath, _, backupName string) (string, error) {
 	maxRetries := 120
-	for range maxRetries {
-		url := fmt.Sprintf("%s/v1/volumes/%s", apiURL, volumeName)
-		cmd := exec.Command("curl", "-s", url)
+	for i := range maxRetries {
+		// Read .status.state of the Backup resource; a kubectl failure is retried after 5s
+		stateCmd := exec.Command("kubectl", "get", "backups.longhorn.io", backupName,
+			"-n", "longhorn-system", "-o", "jsonpath={.status.state}")
+		tools.WithKubeconfig(stateCmd, kubeconfigPath)
 
-		output, err := cmd.Output()
+		stateOutput, err := stateCmd.Output()
 		if err != nil {
 			time.Sleep(5 * time.Second)
 			continue
 		}
 
-		var volume map[string]any
-		if err := json.Unmarshal(output, &volume); err != nil {
-			time.Sleep(5 * time.Second)
-			continue
+		state := string(stateOutput)
+
+		if state == "Error" {
+			// Fetch .status.messages for the error text (best effort: a lookup failure is ignored)
+			errCmd := exec.Command("kubectl", "get", "backups.longhorn.io", backupName,
+				"-n", "longhorn-system", "-o", "jsonpath={.status.messages}")
+			tools.WithKubeconfig(errCmd, kubeconfigPath)
+			errOutput, _ := errCmd.Output()
+			return "", fmt.Errorf("backup failed: %s", string(errOutput))
 		}
 
-		if backupStatus, ok := volume["backupStatus"].([]any); ok {
-			for _, status := range backupStatus {
-				if s, ok := status.(map[string]any); ok {
-					if id, _ := s["id"].(string); id == backupID {
-						if state, _ := s["state"].(string); state == "Completed" {
-							if backupURL, ok := s["backupURL"].(string); ok && backupURL != "" {
-								return backupURL, nil
-							}
-							return l.getBackupURL(volumeName, backupID)
-						}
-						if errorMsg, _ := s["error"].(string); errorMsg != "" {
-							return "", fmt.Errorf("backup failed: %s", errorMsg)
-						}
-					}
-				}
+		if state == "Completed" {
+			// Fetch .status.url; an empty URL falls through and is polled again on the next pass
+			urlCmd := exec.Command("kubectl", "get", "backups.longhorn.io", backupName,
+				"-n", "longhorn-system", "-o", "jsonpath={.status.url}")
+			tools.WithKubeconfig(urlCmd, kubeconfigPath)
+
+			urlOutput, err := urlCmd.Output()
+			if err != nil {
+				return "", fmt.Errorf("backup completed but failed to get URL: %w", err)
+			}
+			backupURL := string(urlOutput)
+			if backupURL != "" {
+				return backupURL, nil
 			}
 		}
 
+		if i%12 == 0 && i > 0 {
+			slog.Info("waiting for backup to complete", "component", "longhorn", "backup", backupName, "state", state, "attempt", i)
+		}
 		time.Sleep(5 * time.Second)
 	}
-	return "", fmt.Errorf("timeout waiting for backup to complete")
-}
-
-func (l *LonghornNativeStrategy) getBackupURL(volumeName, backupID string) (string, error) {
-	return fmt.Sprintf("backup://%s/%s", volumeName, backupID), nil
+	return "", fmt.Errorf("timeout waiting for backup %s to complete", backupName)
 }

 func (l *LonghornNativeStrategy) createPVForVolume(kubeconfigPath, volumeName, size, accessMode, namespace, pvcName string) error {
@@ -592,9 +540,7 @@ spec:
 	return nil
 }

-func (l *LonghornNativeStrategy) createVolumeFromBackup(kubeconfigPath, apiURL, volumeName, backupURL, size string) error {
-	url := fmt.Sprintf("%s/v1/volumes", apiURL)
-
+func (l *LonghornNativeStrategy) createVolumeFromBackup(kubeconfigPath, volumeName, backupURL, size string) error {
 	sizeBytes := "1073741824"
 	if strings.HasSuffix(size, "Gi") {
 		var sizeInt int
@@ -603,62 +549,62 @@ func (l *LonghornNativeStrategy) createVolumeFromBackup(kubeconfigPath, apiURL,
 		}
 	}

-	payload := fmt.Sprintf(`{
-		"name": "%s",
-		"size": "%s",
-		"fromBackup": "%s",
-		"numberOfReplicas": 3
-	}`, volumeName, sizeBytes, backupURL)
+	volumeYAML := fmt.Sprintf(`apiVersion: longhorn.io/v1beta2
+kind: Volume
+metadata:
+  name: %s
+  namespace: longhorn-system
+spec:
+  size: "%s"
+  fromBackup: "%s"
+  numberOfReplicas: 3
+  frontend: blockdev
+  accessMode: rwo
+`, volumeName, sizeBytes, backupURL)

-	cmd := exec.Command("curl", "-X", "POST", url,
-		"-H", "Content-Type: application/json",
-		"-d", payload, "-s")
+	cmd := exec.Command("kubectl", "apply", "-f", "-")
+	tools.WithKubeconfig(cmd, kubeconfigPath)
+	cmd.Stdin = strings.NewReader(volumeYAML)

-	var stdout, stderr bytes.Buffer
-	cmd.Stdout = &stdout
-	cmd.Stderr = &stderr
-
-	if err := cmd.Run(); err != nil {
-		return fmt.Errorf("failed to create volume from backup: %w, stderr: %s, stdout: %s", err, stderr.String(), stdout.String())
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to create volume from backup: %w, output: %s", err, string(output))
 	}

-	return l.waitForVolume(kubeconfigPath, apiURL, volumeName)
+	return l.waitForVolume(kubeconfigPath, volumeName)
 }

-func (l *LonghornNativeStrategy) waitForVolume(_, apiURL, volumeName string) error {
+func (l *LonghornNativeStrategy) waitForVolume(kubeconfigPath, volumeName string) error {
 	maxRetries := 60
 	for i := range maxRetries {
-		url := fmt.Sprintf("%s/v1/volumes/%s", apiURL, volumeName)
-		cmd := exec.Command("curl", "-s", url)
+		cmd := exec.Command("kubectl", "get", "volumes.longhorn.io", volumeName, // query the Longhorn Volume resource by name
+			"-n", "longhorn-system", "-o", "jsonpath={.status.state},{.status.restoreInitiated},{.status.robustness}")
+		tools.WithKubeconfig(cmd, kubeconfigPath)

 		output, err := cmd.Output()
 		if err == nil {
-			var volume map[string]any
-			if err := json.Unmarshal(output, &volume); err == nil {
-				if state, _ := volume["state"].(string); state == "detached" || state == "attached" {
-					if restoreStatus, ok := volume["restoreStatus"].([]any); ok && len(restoreStatus) > 0 {
-						for _, rs := range restoreStatus {
-							if status, ok := rs.(map[string]any); ok {
-								if isRestored, _ := status["isRestored"].(bool); isRestored {
-									return nil
-								}
-							}
-						}
-					} else {
-						if robustness, _ := volume["robustness"].(string); robustness == "healthy" || robustness == "unknown" {
-							return nil
-						}
+			parts := strings.Split(string(output), ",") // the jsonpath above emits state,restoreInitiated,robustness
+			if len(parts) == 3 { // any other field count is treated as "not ready yet" and retried
+				state := parts[0]
+				restoreInitiated := parts[1]
+				robustness := parts[2]
+
+				if state == "detached" || state == "attached" {
+					if restoreInitiated == "true" { // NOTE(review): the replaced code waited for isRestored; "initiated" may be earlier than "finished" - confirm
+						return nil
+					}
+					if robustness == "healthy" || robustness == "unknown" { // reached only when restoreInitiated is not "true"
+						return nil
 					}
 				}
 			}
 		}

 		if i%12 == 0 {
-			fmt.Printf("Waiting for volume %s to be ready... (%d/%d)\n", volumeName, i, maxRetries)
+			slog.Info("waiting for volume to be ready", "component", "longhorn", "volume", volumeName, "attempt", i, "maxRetries", maxRetries) // emitted on attempts 0, 12, 24, 36, 48
 		}
 		time.Sleep(5 * time.Second)
 	}
-	return fmt.Errorf("timeout waiting for volume to be ready")
+	return fmt.Errorf("timeout waiting for volume %s to be ready", volumeName) // all maxRetries polls (5s apart) ended without a ready state
 }

 func (l *LonghornNativeStrategy) cleanupOldBackups(_, _, _ string) error {
--- a/api/internal/backup/strategies/mysql.go
+++ b/api/internal/backup/strategies/mysql.go
@@ -94,7 +94,7 @@ func (m *MySQLStrategy) Backup(plan *btypes.RecoveryPlan, dest btypes.BackupDest

 	size, err := dest.Put(key, reader)
 	if err != nil {
-		cmd.Process.Kill()
+		_ = cmd.Process.Kill()
 		return fmt.Errorf("failed to upload backup: %w", err)
 	}

--- a/api/internal/backup/strategies/postgres.go
+++ b/api/internal/backup/strategies/postgres.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"fmt"
 	"io"
+	"log/slog"
 	"os"
 	"os/exec"
 	"strings"
@@ -90,7 +91,7 @@ func (p *PostgreSQLStrategy) Backup(plan *btypes.RecoveryPlan, dest btypes.Backu

 	size, err := dest.Put(key, reader)
 	if err != nil {
-		cmd.Process.Kill()
+		_ = cmd.Process.Kill()
 		return fmt.Errorf("failed to upload backup: %w", err)
 	}

@@ -101,7 +102,7 @@ func (p *PostgreSQLStrategy) Backup(plan *btypes.RecoveryPlan, dest btypes.Backu
 	// Also backup globals (users, roles, etc)
 	globalsKey := fmt.Sprintf("postgres/%s/%s/%s-globals.sql", plan.Instance, plan.App, plan.Timestamp)
 	if err := p.backupGlobals(kubeconfigPath, dest, globalsKey); err != nil {
-		fmt.Printf("Warning: failed to backup PostgreSQL globals: %v\n", err)
+		slog.Error("postgres globals backup failed", "component", "postgres", "error", err)
 		globalsKey = ""
 	}

@@ -165,7 +166,7 @@ func (p *PostgreSQLStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.Back
 		fmt.Sprintf("DROP DATABASE IF EXISTS %s", standbyDbName))
 	tools.WithKubeconfig(dropCmd, kubeconfigPath)
 	if output, err := dropCmd.CombinedOutput(); err != nil {
-		fmt.Printf("Warning: failed to drop database %s: %v, output: %s\n", standbyDbName, err, output)
+		slog.Error("failed to drop database", "component", "postgres", "database", standbyDbName, "error", err, "output", string(output))
 	}

 	// Create standby database
@@ -184,7 +185,7 @@ func (p *PostgreSQLStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.Back
 			fmt.Sprintf("GRANT ALL PRIVILEGES ON DATABASE %s TO %s", standbyDbName, dbUser))
 		tools.WithKubeconfig(grantCmd, kubeconfigPath)
 		if output, err := grantCmd.CombinedOutput(); err != nil {
-			fmt.Printf("Warning: failed to grant privileges: %v, output: %s\n", err, output)
+			slog.Error("failed to grant privileges", "component", "postgres", "error", err, "output", string(output))
 		}
 	}

@@ -232,7 +233,7 @@ ALTER SCHEMA public OWNER TO %s;`, dbUser, dbUser, dbUser, dbUser)
 			"psql", "-U", "postgres", "-d", standbyDbName, "-c", ownershipSQL)
 		tools.WithKubeconfig(ownerCmd, kubeconfigPath)
 		if output, err := ownerCmd.CombinedOutput(); err != nil {
-			fmt.Printf("Warning: failed to transfer ownership: %v, output: %s\n", err, output)
+			slog.Error("failed to transfer ownership", "component", "postgres", "error", err, "output", string(output))
 		}
 	}

@@ -289,7 +290,7 @@ func (p *PostgreSQLStrategy) Cleanup(plan *btypes.RecoveryPlan) error {
 		"psql", "-U", "postgres", "-d", "postgres", "-c",
 		fmt.Sprintf("SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '%s' AND pid <> pg_backend_pid()", previousDbName))
 	tools.WithKubeconfig(terminateCmd, kubeconfigPath)
-	terminateCmd.CombinedOutput() // best effort
+	_, _ = terminateCmd.CombinedOutput() // best effort

 	// Drop the old database
 	dropCmd := exec.Command("kubectl", "exec", "-n", "postgres", podName, "--",
@@ -363,7 +364,7 @@ func (p *PostgreSQLStrategy) backupGlobals(kubeconfigPath string, dest btypes.Ba
 	}()

 	if _, err := dest.Put(key, reader); err != nil {
-		cmd.Process.Kill()
+		_ = cmd.Process.Kill()
 		return err
 	}

@@ -388,6 +389,11 @@ func (p *PostgreSQLStrategy) getDatabaseName(instanceName, appName string) strin
 			if dbName, ok := appConfig["dbName"].(string); ok && dbName != "" {
 				return dbName
 			}
+			if db, ok := appConfig["db"].(map[string]interface{}); ok {
+				if dbName, ok := db["name"].(string); ok && dbName != "" {
+					return dbName
+				}
+			}
 		}
 	}

@@ -415,6 +421,11 @@ func (p *PostgreSQLStrategy) getAppUser(instanceName, appName string) string {
 			if dbUsername, ok := appConfig["dbUsername"].(string); ok && dbUsername != "" {
 				return dbUsername
 			}
+			if db, ok := appConfig["db"].(map[string]interface{}); ok {
+				if dbUser, ok := db["user"].(string); ok && dbUser != "" {
+					return dbUser
+				}
+			}
 		}
 	}

--- a/api/internal/backup/strategies/postgres_test.go
+++ b/api/internal/backup/strategies/postgres_test.go
@@ -3,6 +3,7 @@ package strategies
 import (
 	"bytes"
 	"io"
+	"os"
 	"strings"
 	"testing"
 	"time"
@@ -163,11 +164,151 @@ func TestPostgreSQLStrategy_Verify(t *testing.T) {
 	}
 }

-func TestPostgreSQLStrategy_GetDatabaseInfo(t *testing.T) {
-	s := &PostgreSQLStrategy{
-		dataDir: "/test/data",
+func TestPostgreSQLStrategy_GetDatabaseName(t *testing.T) {
+	tests := []struct {
+		name     string
+		config   string
+		appName  string
+		expected string
+	}{
+		{
+			name: "flat dbName key",
+			config: `apps:
+  myapp:
+    dbName: my_database
+`,
+			appName:  "myapp",
+			expected: "my_database",
+		},
+		{
+			name: "nested db.name key",
+			config: `apps:
+  e2e-test-app:
+    namespace: e2e-test-app
+    db:
+      host: postgres
+      name: e2e_test_app
+      user: e2e_test_app
+`,
+			appName:  "e2e-test-app",
+			expected: "e2e_test_app",
+		},
+		{
+			name: "flat key takes precedence over nested",
+			config: `apps:
+  myapp:
+    dbName: flat_name
+    db:
+      name: nested_name
+`,
+			appName:  "myapp",
+			expected: "flat_name",
+		},
+		{
+			name: "no config falls back to appName",
+			config: `apps:
+  myapp:
+    namespace: myapp
+`,
+			appName:  "myapp",
+			expected: "myapp",
+		},
+		{
+			name:     "missing app falls back to appName",
+			config:   `apps: {}`,
+			appName:  "missing-app",
+			expected: "missing-app",
+		},
 	}

-	assert.NotNil(t, s)
-	assert.Equal(t, "/test/data", s.dataDir)
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			tmpDir := t.TempDir()
+			instanceDir := tmpDir + "/instances/test-instance"
+			err := os.MkdirAll(instanceDir, 0755)
+			assert.NoError(t, err)
+			err = os.WriteFile(instanceDir+"/config.yaml", []byte(tt.config), 0644)
+			assert.NoError(t, err)
+
+			s := &PostgreSQLStrategy{dataDir: tmpDir}
+			result := s.getDatabaseName("test-instance", tt.appName)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
+}
+
+func TestPostgreSQLStrategy_GetAppUser(t *testing.T) {
+	tests := []struct {
+		name     string
+		config   string
+		appName  string
+		expected string
+	}{
+		{
+			name: "flat dbUser key",
+			config: `apps:
+  myapp:
+    dbUser: my_user
+`,
+			appName:  "myapp",
+			expected: "my_user",
+		},
+		{
+			name: "flat dbUsername key",
+			config: `apps:
+  myapp:
+    dbUsername: my_username
+`,
+			appName:  "myapp",
+			expected: "my_username",
+		},
+		{
+			name: "nested db.user key",
+			config: `apps:
+  e2e-test-app:
+    namespace: e2e-test-app
+    db:
+      host: postgres
+      name: e2e_test_app
+      user: e2e_test_app
+`,
+			appName:  "e2e-test-app",
+			expected: "e2e_test_app",
+		},
+		{
+			name: "flat key takes precedence over nested",
+			config: `apps:
+  myapp:
+    dbUser: flat_user
+    db:
+      user: nested_user
+`,
+			appName:  "myapp",
+			expected: "flat_user",
+		},
+		{
+			name: "no user config falls back to appName",
+			config: `apps:
+  myapp:
+    namespace: myapp
+`,
+			appName:  "myapp",
+			expected: "myapp",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			tmpDir := t.TempDir()
+			instanceDir := tmpDir + "/instances/test-instance"
+			err := os.MkdirAll(instanceDir, 0755)
+			assert.NoError(t, err)
+			err = os.WriteFile(instanceDir+"/config.yaml", []byte(tt.config), 0644)
+			assert.NoError(t, err)
+
+			s := &PostgreSQLStrategy{dataDir: tmpDir}
+			result := s.getAppUser("test-instance", tt.appName)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
 }
--- a/api/internal/backup/types/types.go
+++ b/api/internal/backup/types/types.go
@@ -33,6 +33,7 @@ type RecoveryPlan struct {
 	App          string               `yaml:"app"          json:"app"`
 	Instance     string               `yaml:"instance"     json:"instance"`
 	Timestamp    string               `yaml:"timestamp"    json:"timestamp"`
+	Version      string               `yaml:"version"      json:"version,omitempty"`
 	Status       string               `yaml:"status"       json:"status"` // backing_up, backed_up, restoring, restored, switching, switched, cleaning_up, cleaned_up, failed
 	Error        string               `yaml:"error"        json:"error,omitempty"`
 	Source       RecoverySource       `yaml:"source"       json:"source"`
@@ -141,8 +142,8 @@ type BackupInfo struct {

 // ComponentBackup represents a single backup component (legacy, kept for compatibility)
 type ComponentBackup struct {
-	Type     string                 `json:"type"`     // "postgres", "mysql", "pvc", "config"
-	Name     string                 `json:"name"`     // Component identifier
+	Type     string                 `json:"type"` // "postgres", "mysql", "pvc", "config"
+	Name     string                 `json:"name"` // Component identifier
 	Size     int64                  `json:"size"`
 	Location string                 `json:"location"` // Path in destination
 	Metadata map[string]interface{} `json:"metadata"`
@@ -175,28 +176,28 @@ type ProgressCallback func(progress int, message string)

 // BackupConfiguration represents instance-level backup configuration
 type BackupConfiguration struct {
-	Destination  DestinationConfig  `yaml:"destination"`
-	Retention    RetentionPolicy    `yaml:"retention"`
-	Schedules    []BackupSchedule   `yaml:"schedules,omitempty"`
-	Verification VerificationConfig `yaml:"verification"`
+	Destination  DestinationConfig  `yaml:"destination"          json:"destination"`
+	Retention    RetentionPolicy    `yaml:"retention"            json:"retention"`
+	Schedules    []BackupSchedule   `yaml:"schedules,omitempty"  json:"schedules,omitempty"`
+	Verification VerificationConfig `yaml:"verification"         json:"verification"`
 }

 // BackupSchedule defines a per-app or cluster backup schedule
 type BackupSchedule struct {
-	ID         string           `yaml:"id"         json:"id"`
-	Name       string           `yaml:"name"       json:"name"`
-	TargetType string           `yaml:"targetType" json:"target_type"` // "app" or "cluster"
-	TargetName string           `yaml:"targetName" json:"target_name"`
-	Frequency  string           `yaml:"frequency"  json:"frequency"` // "daily", "weekly", "monthly"
-	Time       string           `yaml:"time"       json:"time"`      // "HH:MM" local time
-	DayOfWeek  int              `yaml:"dayOfWeek"  json:"day_of_week,omitempty"` // 0=Sun..6=Sat (weekly)
-	DayOfMonth int              `yaml:"dayOfMonth" json:"day_of_month,omitempty"` // 1-28 (monthly)
+	ID         string             `yaml:"id"         json:"id"`
+	Name       string             `yaml:"name"       json:"name"`
+	TargetType string             `yaml:"targetType" json:"target_type"` // "app" or "cluster"
+	TargetName string             `yaml:"targetName" json:"target_name"`
+	Frequency  string             `yaml:"frequency"  json:"frequency"`              // "daily", "weekly", "monthly"
+	Time       string             `yaml:"time"       json:"time"`                   // "HH:MM" local time
+	DayOfWeek  int                `yaml:"dayOfWeek"  json:"day_of_week,omitempty"`  // 0=Sun..6=Sat (weekly)
+	DayOfMonth int                `yaml:"dayOfMonth" json:"day_of_month,omitempty"` // 1-28 (monthly)
 	Retention  *ScheduleRetention `yaml:"retention,omitempty" json:"retention,omitempty"`
-	Enabled    bool             `yaml:"enabled"    json:"enabled"`
-	LastRun    *time.Time       `yaml:"lastRun,omitempty"  json:"last_run,omitempty"`
-	NextRun    *time.Time       `yaml:"nextRun,omitempty"  json:"next_run,omitempty"`
-	CreatedAt  time.Time        `yaml:"createdAt"  json:"created_at"`
-	UpdatedAt  time.Time        `yaml:"updatedAt"  json:"updated_at"`
+	Enabled    bool               `yaml:"enabled"    json:"enabled"`
+	LastRun    *time.Time         `yaml:"lastRun,omitempty"  json:"last_run,omitempty"`
+	NextRun    *time.Time         `yaml:"nextRun,omitempty"  json:"next_run,omitempty"`
+	CreatedAt  time.Time          `yaml:"createdAt"  json:"created_at"`
+	UpdatedAt  time.Time          `yaml:"updatedAt"  json:"updated_at"`
 }

 // ScheduleRetention overrides the instance-level retention for a specific schedule
@@ -207,53 +208,53 @@ type ScheduleRetention struct {

 // DestinationConfig configures where backups are stored
 type DestinationConfig struct {
-	Type  string       `yaml:"type"` // "s3", "azure", "nfs", "local"
-	S3    *S3Config    `yaml:"s3,omitempty"`
-	Azure *AzureConfig `yaml:"azure,omitempty"`
-	NFS   *NFSConfig   `yaml:"nfs,omitempty"`
-	Local *LocalConfig `yaml:"local,omitempty"`
+	Type  string       `yaml:"type"  json:"type"` // "s3", "azure", "nfs", "local"
+	S3    *S3Config    `yaml:"s3,omitempty"    json:"s3,omitempty"`
+	Azure *AzureConfig `yaml:"azure,omitempty" json:"azure,omitempty"`
+	NFS   *NFSConfig   `yaml:"nfs,omitempty"   json:"nfs,omitempty"`
+	Local *LocalConfig `yaml:"local,omitempty" json:"local,omitempty"`
 }

 // S3Config configures S3 backup destination
 type S3Config struct {
-	Bucket         string `yaml:"bucket"`
-	Region         string `yaml:"region"`
-	Endpoint       string `yaml:"endpoint,omitempty"` // For S3-compatible services
-	AccessKeyID    string `yaml:"-"`                  // Loaded from secrets.yaml
-	SecretAccessKey string `yaml:"-"`                 // Loaded from secrets.yaml
+	Bucket          string `yaml:"bucket"             json:"bucket"`
+	Region          string `yaml:"region"             json:"region"`
+	Endpoint        string `yaml:"endpoint,omitempty" json:"endpoint,omitempty"` // For S3-compatible services
+	AccessKeyID     string `yaml:"-"                  json:"-"`                  // Loaded from secrets.yaml
+	SecretAccessKey string `yaml:"-"                 json:"-"`                   // Loaded from secrets.yaml
 }

 // AzureConfig configures Azure Blob Storage destination
 type AzureConfig struct {
-	Container      string `yaml:"container"`
-	StorageAccount string `yaml:"storageAccount"`
-	AccessKey      string `yaml:"-"` // Loaded from secrets.yaml
+	Container      string `yaml:"container"       json:"container"`
+	StorageAccount string `yaml:"storageAccount"  json:"storageAccount"`
+	AccessKey      string `yaml:"-"               json:"-"` // Loaded from secrets.yaml
 }

 // NFSConfig configures NFS backup destination
 type NFSConfig struct {
-	Server       string `yaml:"server"`
-	Path         string `yaml:"path"`
-	MountPoint   string `yaml:"mountPoint,omitempty"`
-	MountOptions string `yaml:"mountOptions,omitempty"`
+	Server       string `yaml:"server"               json:"server"`
+	Path         string `yaml:"path"                 json:"path"`
+	MountPoint   string `yaml:"mountPoint,omitempty"  json:"mountPoint,omitempty"`
+	MountOptions string `yaml:"mountOptions,omitempty" json:"mountOptions,omitempty"`
 }

 // LocalConfig configures local filesystem backup destination
 type LocalConfig struct {
-	Path string `yaml:"path"`
+	Path string `yaml:"path" json:"path"`
 }

 // RetentionPolicy defines how long to keep backups
 type RetentionPolicy struct {
-	Daily   int `yaml:"daily"`
-	Weekly  int `yaml:"weekly"`
-	Monthly int `yaml:"monthly"`
-	Yearly  int `yaml:"yearly"`
+	Daily   int `yaml:"daily"   json:"daily"`
+	Weekly  int `yaml:"weekly"  json:"weekly"`
+	Monthly int `yaml:"monthly" json:"monthly"`
+	Yearly  int `yaml:"yearly"  json:"yearly"`
 }

 // VerificationConfig configures backup verification
 type VerificationConfig struct {
-	Enabled      bool   `yaml:"enabled"`
-	Schedule     string `yaml:"schedule"`     // Cron expression
-	RandomSample bool   `yaml:"randomSample"` // Test random backup each time
+	Enabled      bool   `yaml:"enabled"      json:"enabled"`
+	Schedule     string `yaml:"schedule"     json:"schedule"`     // Cron expression
+	RandomSample bool   `yaml:"randomSample" json:"randomSample"` // Test random backup each time
 }
--- a/api/internal/cluster/cluster.go
+++ b/api/internal/cluster/cluster.go
@@ -4,7 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
-	"log"
+	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -80,6 +80,8 @@ func (m *Manager) GenerateConfig(instanceName string, config *ClusterConfig) err
 		return nil
 	}

+	slog.Info("generating cluster config", "component", "cluster", "instance", instanceName, "cluster", config.ClusterName, "vip", config.VIP)
+
 	// Ensure generated directory exists
 	if err := storage.EnsureDir(generatedDir, 0755); err != nil {
 		return fmt.Errorf("failed to create generated directory: %w", err)
@@ -117,9 +119,12 @@ func (m *Manager) Bootstrap(instanceName, nodeName string) (string, error) {
 		return "", fmt.Errorf("failed to start bootstrap operation: %w", err)
 	}

+	slog.Info("starting cluster bootstrap", "component", "cluster", "instance", instanceName, "node", nodeName, "operationId", opID)
+
 	// Run bootstrap asynchronously
 	go func() {
 		if err := m.runBootstrapWithTracking(instanceName, nodeName, opID); err != nil {
+			slog.Error("cluster bootstrap failed", "component", "cluster", "instance", instanceName, "node", nodeName, "error", err)
 			_ = m.opsMgr.Update(instanceName, opID, "failed", err.Error(), 0)
 		}
 	}()
@@ -191,6 +196,7 @@ func (m *Manager) runBootstrapWithTracking(instanceName, nodeName, opID string)
 	}

 	// Mark as completed
+	slog.Info("cluster bootstrap completed", "component", "cluster", "instance", instanceName)
 	_ = m.opsMgr.Update(instanceName, opID, "completed", "Bootstrap completed successfully", 100)
 	return nil
 }
@@ -385,7 +391,7 @@ func (m *Manager) retrieveKubeconfigFromCluster(instanceName, nodeIP string, tim
 		tools.WithTalosconfig(cmdKubeconfig, talosconfigPath)

 		if output, err := cmdKubeconfig.CombinedOutput(); err == nil {
-			log.Printf("Successfully retrieved kubeconfig for instance %s", instanceName)
+			slog.Info("kubeconfig retrieved", "component", "cluster", "instance", instanceName)
 			return nil
 		} else {
 			// Check if we've exceeded deadline
@@ -424,13 +430,15 @@ func (m *Manager) RegenerateKubeconfig(instanceName string) error {
 		return fmt.Errorf("control plane VIP not configured in cluster.nodes.control.vip")
 	}

-	log.Printf("Regenerating kubeconfig for instance %s from cluster VIP %s", instanceName, vip)
+	slog.Info("regenerating kubeconfig", "component", "cluster", "instance", instanceName, "vip", vip)
 	// Use shorter timeout for manual regeneration (cluster should already be running)
 	return m.retrieveKubeconfigFromCluster(instanceName, vip, 30*time.Second)
 }

 // ConfigureEndpoints updates talosconfig to use VIP and retrieves kubeconfig
 func (m *Manager) ConfigureEndpoints(instanceName string, includeNodes bool) error {
+	slog.Info("configuring cluster endpoints", "component", "cluster", "instance", instanceName, "includeNodes", includeNodes)
+
 	configPath := tools.GetInstanceConfigPath(m.dataDir, instanceName)
 	talosconfigPath := tools.GetTalosconfigPath(m.dataDir, instanceName)

@@ -709,6 +717,8 @@ func (m *Manager) Reset(instanceName string, confirm bool) error {
 		return fmt.Errorf("reset requires confirmation")
 	}

+	slog.Info("resetting cluster", "component", "cluster", "instance", instanceName)
+
 	// This is a destructive operation
 	// Real implementation would:
 	// 1. Reset all nodes via talosctl reset
--- a/api/internal/config/manager.go
+++ b/api/internal/config/manager.go
@@ -2,7 +2,7 @@ package config

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"path/filepath"

 	"github.com/wild-cloud/wild-central/daemon/internal/network"
@@ -41,12 +41,11 @@ func (m *Manager) EnsureGlobalConfig(dataDir string) error {
 	// Detect network configuration
 	netInfo, err := network.DetectNetworkInfo()
 	if err != nil {
-		log.Printf("Warning: Could not detect network info, using empty defaults: %v", err)
+		slog.Warn("network detection failed, using defaults", "component", "config", "error", err) // non-fatal, but keep the warning severity the old message carried; Router.IP is left unset
 	} else {
 		// Set detected values
 		initialConfig.Cloud.Router.IP = netInfo.Gateway
-		log.Printf("Detected network: Gateway=%s, Interface=%s",
-			netInfo.Gateway, netInfo.PrimaryInterface)
+		slog.Info("detected network", "component", "config", "gateway", netInfo.Gateway, "interface", netInfo.PrimaryInterface)
 	}

 	// Ensure data directory exists
--- a/api/internal/config/manager_test.go
+++ b/api/internal/config/manager_test.go
@@ -13,11 +13,8 @@ import (
 // Test: NewManager creates manager successfully
 func TestNewManager(t *testing.T) {
 	m := NewManager()
-	if m == nil {
-		t.Fatal("NewManager returned nil")
-	}
-	if m.yq == nil {
-		t.Error("Manager.yq is nil")
+	if m == nil || m.yq == nil {
+		t.Fatalf("NewManager returned nil or Manager.yq is nil (manager is nil: %t)", m == nil) // report which half of the combined check failed
 	}
 }

--- a/api/internal/data/paths.go
+++ b/api/internal/data/paths.go
@@ -2,7 +2,7 @@ package data

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"os"
 	"path/filepath"
 )
@@ -42,10 +42,10 @@ func (m *Manager) Initialize() error {
 		} else {
 			dataDir = filepath.Join(cwd, "data")
 		}
-		log.Printf("Running in development mode, using data directory: %s", dataDir)
+		slog.Info("data directory configured", "component", "data", "mode", "development", "path", dataDir)
 	} else {
 		dataDir = "/var/lib/wild-central"
-		log.Printf("Running in production mode, using data directory: %s", dataDir)
+		slog.Info("data directory configured", "component", "data", "mode", "production", "path", dataDir)
 	}

 	m.dataDir = dataDir
@@ -60,7 +60,7 @@ func (m *Manager) Initialize() error {
 		}
 	}

-	log.Printf("Data directory structure initialized at: %s", dataDir)
+	slog.Info("data directory initialized", "component", "data", "path", dataDir)
 	return nil
 }

--- a/api/internal/discovery/discovery.go
+++ b/api/internal/discovery/discovery.go
@@ -3,6 +3,7 @@ package discovery
 import (
 	"encoding/json"
 	"fmt"
+	"log/slog"
 	"net"
 	"os"
 	"path/filepath"
@@ -111,6 +112,8 @@ func (m *Manager) StartDiscovery(instanceName string, ipList []string) error {
 		return err
 	}

+	slog.Info("starting node discovery", "component", "discovery", "instance", instanceName, "addresses", len(ipList))
+
 	// Start discovery in background
 	go m.runDiscovery(instanceName, ipList)

@@ -173,6 +176,8 @@ func (m *Manager) runDiscovery(instanceName string, ipList []string) {
 		_ = m.writeDiscoveryStatus(instanceName, status)
 		m.discoveryMu.Unlock()
 	}
+
+	slog.Info("node discovery completed", "component", "discovery", "instance", instanceName, "found", len(discoveredNodes))
 }

 // probeNode attempts to detect if a node is running Talos in maintenance mode
--- a/api/internal/dnsmasq/config.go
+++ b/api/internal/dnsmasq/config.go
@@ -2,7 +2,7 @@ package dnsmasq

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"os"
 	"os/exec"
 	"strconv"
@@ -39,7 +39,7 @@ func (g *ConfigGenerator) Generate(cfg *config.GlobalConfig, clouds []config.Ins
 	// Get the Wild Central IP address
 	dnsIP, err := network.GetWildCentralIP()
 	if err != nil {
-		log.Printf("Warning: Failed to detect Wild Central IP: %v", err)
+		slog.Error("failed to detect Wild Central IP", "component", "dnsmasq", "error", err)
 		// Fall back to empty string if detection fails
 		dnsIP = ""
 	}
@@ -49,7 +49,7 @@ func (g *ConfigGenerator) Generate(cfg *config.GlobalConfig, clouds []config.Ins
 		// Point cloud domains to the cluster load balancer IP
 		loadBalancerIP := cloud.Cluster.LoadBalancerIp
 		if loadBalancerIP == "" {
-			log.Printf("Warning: No load balancer IP configured for instance %s, adding commented DNS config", cloud.Cluster.Name)
+			slog.Warn("no load balancer IP configured, adding commented DNS config", "component", "dnsmasq", "instance", cloud.Cluster.Name) // was a "Warning:" log; the instance only gets commented-out placeholder entries below
 			// Add commented out entries for instances without load balancer
 			resolution_section += fmt.Sprintf("# No load balancer IP configured for instance %s\n", cloud.Cluster.Name)
 			resolution_section += fmt.Sprintf("# local=/%s/\n# address=/%s/<load-balancer-ip>\n", cloud.Cloud.InternalDomain, cloud.Cloud.InternalDomain)
@@ -92,7 +92,7 @@ log-dhcp
 func (g *ConfigGenerator) WriteConfig(cfg *config.GlobalConfig, clouds []config.InstanceConfig, configPath string) error {
 	configContent := g.Generate(cfg, clouds)

-	log.Printf("Writing dnsmasq config to: %s", configPath)
+	slog.Info("writing dnsmasq config", "component", "dnsmasq", "path", configPath)

 	if err := os.WriteFile(configPath, []byte(configContent), 0644); err != nil {
 		return fmt.Errorf("writing dnsmasq config: %w", err)
@@ -109,6 +109,7 @@ func (g *ConfigGenerator) RestartService() error {
 	if err != nil {
 		return fmt.Errorf("failed to restart dnsmasq: %w (output: %s)", err, string(output))
 	}
+	slog.Info("dnsmasq service restarted", "component", "dnsmasq")
 	return nil
 }

@@ -127,7 +128,7 @@ func (g *ConfigGenerator) GetStatus() (*ServiceStatus, error) {
 	// Get the Wild Central IP address
 	dnsIP, err := network.GetWildCentralIP()
 	if err != nil {
-		log.Printf("Warning: Failed to detect Wild Central IP: %v", err)
+		slog.Error("failed to detect Wild Central IP", "component", "dnsmasq", "error", err)
 		dnsIP = ""
 	}

@@ -201,7 +202,7 @@ func (g *ConfigGenerator) UpdateConfig(cfg *config.GlobalConfig, instances []con
 	configContent := g.Generate(cfg, instances)

 	// Write config
-	log.Printf("Writing dnsmasq config to: %s", g.configPath)
+	slog.Info("writing dnsmasq config", "component", "dnsmasq", "path", g.configPath)
 	if err := os.WriteFile(g.configPath, []byte(configContent), 0644); err != nil {
 		return fmt.Errorf("writing dnsmasq config: %w", err)
 	}
@@ -234,12 +235,12 @@ func (g *ConfigGenerator) ConfigureSystemDNS() error {
 		return fmt.Errorf("failed to write resolved.conf: %w", err)
 	}

-	log.Printf("Configured systemd-resolved to use DNS at %s", dnsIP)
+	slog.Info("configured systemd-resolved", "component", "dnsmasq", "dnsIP", dnsIP)

 	// Restart systemd-resolved to apply changes (via polkit)
 	cmd := exec.Command("systemctl", "restart", "systemd-resolved")
 	if output, err := cmd.CombinedOutput(); err != nil {
-		log.Printf("Warning: Failed to restart systemd-resolved: %v (output: %s)", err, string(output))
+		slog.Error("failed to restart systemd-resolved", "component", "dnsmasq", "error", err, "output", string(output))
 		// Don't return error - the config was written successfully
 	}

--- a/api/internal/dnsmasq/config_modular.go
+++ b/api/internal/dnsmasq/config_modular.go
@@ -2,7 +2,7 @@ package dnsmasq

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -22,7 +22,7 @@ func (g *ConfigGenerator) GenerateMainConfig(cfg *config.GlobalConfig) string {
 	// Get the Wild Central IP address
 	dnsIP, err := network.GetWildCentralIP()
 	if err != nil {
-		log.Printf("Warning: Failed to detect Wild Central IP: %v", err)
+		slog.Error("failed to detect Wild Central IP", "component", "dnsmasq", "error", err)
 		// Fall back to empty string if detection fails
 		dnsIP = ""
 	}
@@ -60,25 +60,25 @@ log-dhcp
 func (g *ConfigGenerator) GenerateInstanceConfig(instance config.InstanceConfig) string {
 	var sb strings.Builder

-	sb.WriteString(fmt.Sprintf("# DNS configuration for instance: %s\n", instance.Cluster.Name))
-	sb.WriteString(fmt.Sprintf("# Generated by Wild Cloud\n\n"))
+	fmt.Fprintf(&sb, "# DNS configuration for instance: %s\n", instance.Cluster.Name)
+	sb.WriteString("# Generated by Wild Cloud\n\n")

 	loadBalancerIP := instance.Cluster.LoadBalancerIp
 	if loadBalancerIP == "" {
-		sb.WriteString(fmt.Sprintf("# WARNING: No load balancer IP configured for this instance\n"))
-		sb.WriteString(fmt.Sprintf("# DNS entries are commented out until load balancer IP is configured\n\n"))
-		sb.WriteString(fmt.Sprintf("# local=/%s/\n", instance.Cloud.InternalDomain))
-		sb.WriteString(fmt.Sprintf("# address=/%s/<load-balancer-ip>\n\n", instance.Cloud.InternalDomain))
-		sb.WriteString(fmt.Sprintf("# address=/%s/<load-balancer-ip>\n", instance.Cloud.Domain))
+		sb.WriteString("# WARNING: No load balancer IP configured for this instance\n")
+		sb.WriteString("# DNS entries are commented out until load balancer IP is configured\n\n")
+		fmt.Fprintf(&sb, "# local=/%s/\n", instance.Cloud.InternalDomain)
+		fmt.Fprintf(&sb, "# address=/%s/<load-balancer-ip>\n\n", instance.Cloud.InternalDomain)
+		fmt.Fprintf(&sb, "# address=/%s/<load-balancer-ip>\n", instance.Cloud.Domain)
 	} else {
 		// Internal domain (.internal.cloud.example.tld) - local only, no external DNS
-		sb.WriteString(fmt.Sprintf("# Internal domain (LAN-only)\n"))
-		sb.WriteString(fmt.Sprintf("local=/%s/\n", instance.Cloud.InternalDomain))
-		sb.WriteString(fmt.Sprintf("address=/%s/%s\n\n", instance.Cloud.InternalDomain, loadBalancerIP))
+		sb.WriteString("# Internal domain (LAN-only)\n")
+		fmt.Fprintf(&sb, "local=/%s/\n", instance.Cloud.InternalDomain)
+		fmt.Fprintf(&sb, "address=/%s/%s\n\n", instance.Cloud.InternalDomain, loadBalancerIP)

 		// External domain (cloud.example.tld) - resolve to load balancer IP
-		sb.WriteString(fmt.Sprintf("# Public domain (resolved locally to avoid external DNS)\n"))
-		sb.WriteString(fmt.Sprintf("address=/%s/%s\n", instance.Cloud.Domain, loadBalancerIP))
+		sb.WriteString("# Public domain (resolved locally to avoid external DNS)\n")
+		fmt.Fprintf(&sb, "address=/%s/%s\n", instance.Cloud.Domain, loadBalancerIP)
 	}

 	return sb.String()
@@ -129,7 +129,7 @@ func (g *ConfigGenerator) WriteInstanceConfig(instanceName string, instance conf
 		return fmt.Errorf("installing instance config: %w", err)
 	}

-	log.Printf("Successfully wrote instance DNS config: %s", instanceFile)
+	slog.Info("wrote instance DNS config", "component", "dnsmasq", "path", instanceFile)
 	return nil
 }

@@ -151,7 +151,9 @@ func (g *ConfigGenerator) ValidateWithInstance(instanceConfigPath string) error
 	tempMainConfig := filepath.Join(tempDir, "main.conf")
 	// Modify the conf-dir line to point to our temp instance dir
 	tempInstanceDir := filepath.Join(tempDir, "instances")
-	os.MkdirAll(tempInstanceDir, 0755)
+	if err := os.MkdirAll(tempInstanceDir, 0755); err != nil {
+		return fmt.Errorf("creating temp instance dir: %w", err)
+	}

 	modifiedContent := strings.ReplaceAll(
 		string(mainContent),
@@ -184,7 +186,7 @@ func (g *ConfigGenerator) RemoveInstanceConfig(instanceName string) error {

 	// Check if file exists
 	if _, err := os.Stat(instanceFile); os.IsNotExist(err) {
-		log.Printf("Instance DNS config does not exist: %s", instanceFile)
+		slog.Info("instance DNS config does not exist", "component", "dnsmasq", "path", instanceFile)
 		return nil // Not an error, already removed
 	}

@@ -193,7 +195,7 @@ func (g *ConfigGenerator) RemoveInstanceConfig(instanceName string) error {
 		return fmt.Errorf("removing instance config: %w", err)
 	}

-	log.Printf("Removed instance DNS config: %s", instanceFile)
+	slog.Info("removed instance DNS config", "component", "dnsmasq", "path", instanceFile)
 	return nil
 }

@@ -205,16 +207,16 @@ func (g *ConfigGenerator) ReloadService() error {
 	_, err := cmd.CombinedOutput()
 	if err != nil {
 		// If reload fails, try restart as fallback
-		log.Printf("Reload failed, attempting restart: %v", err)
+		slog.Error("reload failed, attempting restart", "component", "dnsmasq", "error", err)
 		return g.RestartService()
 	}
-	log.Printf("Successfully reloaded dnsmasq service")
+	slog.Info("dnsmasq service reloaded", "component", "dnsmasq")
 	return nil
 }

 // UpdateToModularConfig migrates from monolithic to modular configuration
 func (g *ConfigGenerator) UpdateToModularConfig(cfg *config.GlobalConfig, instanceNames []string, instances []config.InstanceConfig) error {
-	log.Printf("Migrating to modular dnsmasq configuration...")
+	slog.Info("migrating to modular configuration", "component", "dnsmasq")

 	// Ensure instance directory exists
 	if err := os.MkdirAll(instanceConfigDir, 0755); err != nil {
@@ -225,7 +227,7 @@ func (g *ConfigGenerator) UpdateToModularConfig(cfg *config.GlobalConfig, instan
 	for i, instance := range instances {
 		instanceName := instanceNames[i]
 		if err := g.WriteInstanceConfig(instanceName, instance); err != nil {
-			log.Printf("Warning: Failed to write instance config for %s: %v", instanceName, err)
+			slog.Error("failed to write instance config", "component", "dnsmasq", "instance", instanceName, "error", err)
 			// Continue with other instances
 		}
 	}
@@ -255,21 +257,21 @@ func (g *ConfigGenerator) UpdateToModularConfig(cfg *config.GlobalConfig, instan
 	// Install new config
 	if err := os.Rename(tempFile, g.configPath); err != nil {
 		// Try to restore backup
-		os.Rename(backupFile, g.configPath)
+		_ = os.Rename(backupFile, g.configPath)
 		return fmt.Errorf("installing new config: %w", err)
 	}

 	// Reload dnsmasq
 	if err := g.ReloadService(); err != nil {
 		// Try to restore backup and reload
-		log.Printf("Reload failed, attempting to restore backup...")
+		slog.Error("reload failed, restoring backup", "component", "dnsmasq")
 		os.Remove(g.configPath)
-		os.Rename(backupFile, g.configPath)
-		g.ReloadService()
+		_ = os.Rename(backupFile, g.configPath)
+		_ = g.ReloadService()
 		return fmt.Errorf("reloading with new config: %w", err)
 	}

-	log.Printf("Successfully migrated to modular dnsmasq configuration")
+	slog.Info("migrated to modular configuration", "component", "dnsmasq")
 	return nil
 }

@@ -286,6 +288,6 @@ func (g *ConfigGenerator) UpdateInstanceDNS(instanceName string, instance config
 		return fmt.Errorf("reloading dnsmasq: %w", err)
 	}

-	log.Printf("Successfully updated DNS for instance: %s", instanceName)
+	slog.Info("DNS updated for instance", "component", "dnsmasq", "instance", instanceName)
 	return nil
-}
+}
--- a/api/internal/factory/factory.go
+++ b/api/internal/factory/factory.go
@@ -269,7 +269,7 @@ func ParseVersion(v string) [3]int {
 		v = v[:idx]
 	}
 	var parts [3]int
-	fmt.Sscanf(v, "%d.%d.%d", &parts[0], &parts[1], &parts[2])
+	_, _ = fmt.Sscanf(v, "%d.%d.%d", &parts[0], &parts[1], &parts[2])
 	return parts
 }

--- a/api/internal/instance/instance.go
+++ b/api/internal/instance/instance.go
@@ -2,6 +2,7 @@ package instance

 import (
 	"fmt"
+	"log/slog"
 	"os"
 	"path/filepath"

@@ -74,6 +75,8 @@ func (m *Manager) CreateInstance(name string) error {
 		return nil
 	}

+	slog.Info("creating instance", "component", "instance", "name", name)
+
 	// Acquire lock for instance creation
 	lockPath := tools.GetInstancesLockPath(m.dataDir)
 	return storage.WithLock(lockPath, func() error {
@@ -118,6 +121,8 @@ func (m *Manager) DeleteInstance(name string) error {
 		return fmt.Errorf("instance %s does not exist", name)
 	}

+	slog.Info("deleting instance", "component", "instance", "name", name)
+
 	// Clear context if this is the current instance
 	currentContext, err := m.contextMgr.GetCurrentContext()
 	if err == nil && currentContext == name {
--- a/api/internal/logging/console.go
+++ b/api/internal/logging/console.go
@@ -0,0 +1,138 @@
+package logging
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"log/slog"
+	"slices"
+	"sync"
+)
+
+// ANSI color codes
+const (
+	dim    = "\033[2m"
+	red    = "\033[31m"
+	yellow = "\033[33m"
+	cyan   = "\033[36m"
+	reset  = "\033[0m"
+)
+
+// ConsoleHandler formats log output for human readability on terminals.
+// It produces compact, color-coded lines:
+//
+//	20:15:54 INF daemon started addr=:5055
+//	20:15:54 ERR backup failed component=backup error="connection refused"
+type ConsoleHandler struct {
+	w     io.Writer    // destination for the formatted lines
+	level slog.Leveler // minimum level that Enabled accepts
+	attrs []slog.Attr  // attrs added via WithAttrs/WithGroup; emitted before a record's own attrs
+	mu    *sync.Mutex  // pointer shared with derived handlers; held around each Write
+}
+
+// NewConsoleHandler creates a handler that writes human-friendly colored logs
+// to w. The minimum level is opts.Level when set, otherwise slog.LevelInfo.
+// The Leveler itself is stored (not a snapshot of its current value), so a
+// *slog.LevelVar passed in opts can change the threshold at runtime.
+func NewConsoleHandler(w io.Writer, opts *slog.HandlerOptions) *ConsoleHandler {
+	// Declared as the interface type so opts.Level can be stored unchanged.
+	var level slog.Leveler = slog.LevelInfo
+	if opts != nil && opts.Level != nil {
+		level = opts.Level
+	}
+	return &ConsoleHandler{w: w, level: level, mu: &sync.Mutex{}}
+}
+
+// Enabled reports whether level is at or above the handler's minimum level;
+// the minimum is obtained from h.level on every call.
+func (h *ConsoleHandler) Enabled(_ context.Context, level slog.Level) bool { return h.level.Level() <= level }
+
+// Handle formats r as one line (time, level badge, message, then key=value
+// attrs) and writes it to h.w in a single Write while holding h.mu. A zero
+// r.Time is omitted, and records below slog.LevelInfo are badged DBG.
+func (h *ConsoleHandler) Handle(_ context.Context, r slog.Record) error {
+	var buf []byte
+	if !r.Time.IsZero() { // the slog.Handler contract says to ignore a zero time
+		buf = append(buf, dim+r.Time.Format("15:04:05")+reset+" "...)
+	}
+	switch { // thresholds rather than equality, so in-between custom levels still get a badge
+	case r.Level >= slog.LevelError:
+		buf = append(buf, red+"ERR"+reset+" "...)
+	case r.Level >= slog.LevelWarn:
+		buf = append(buf, yellow+"WRN"+reset+" "...)
+	case r.Level >= slog.LevelInfo:
+		buf = append(buf, cyan+"INF"+reset+" "...)
+	default:
+		buf = append(buf, dim+"DBG"+reset+" "...)
+	}
+	buf = append(buf, r.Message...)
+
+	// Attrs stored on the handler come first, then the record's own attrs.
+	for _, a := range h.attrs {
+		buf = appendAttr(buf, a)
+	}
+	r.Attrs(func(a slog.Attr) bool {
+		buf = appendAttr(buf, a)
+		return true
+	})
+	buf = append(buf, '\n')
+
+	h.mu.Lock()
+	defer h.mu.Unlock()
+	_, err := h.w.Write(buf)
+	return err
+}
+
+// WithAttrs returns a derived handler carrying h's attrs followed by attrs.
+// h.attrs is cloned first so the receiver's slice is never appended to; the
+// writer, level and mutex pointer are shared with h.
+func (h *ConsoleHandler) WithAttrs(attrs []slog.Attr) slog.Handler {
+	derived := *h
+	derived.attrs = append(slices.Clone(h.attrs), attrs...)
+	return &derived
+}
+
+// WithGroup returns h unchanged for an empty name, as slog.Handler requires.
+// Otherwise it records the name as a group=<name> attr on a derived handler;
+// later keys are NOT qualified with the group name (groups are rare here).
+func (h *ConsoleHandler) WithGroup(name string) slog.Handler {
+	if name == "" {
+		return h
+	}
+	return h.WithAttrs([]slog.Attr{slog.String("group", name)})
+}
+
+// appendAttr appends " key=value" for a to buf and returns the extended slice.
+// The key and "=" are wrapped in the dim/reset codes; the value is a's
+// resolved Value rendered with String, Go-quoted when needsQuote says so.
+// A zero Attr is skipped and buf is returned unchanged.
+func appendAttr(buf []byte, a slog.Attr) []byte {
+	if a.Equal(slog.Attr{}) {
+		return buf
+	}
+
+	// Prefix: a space separator, then the dimmed key and "=".
+	buf = append(buf, " "+dim+a.Key+"="+reset...)
+
+	text := a.Value.Resolve().String()
+	if !needsQuote(text) {
+		return append(buf, text...)
+	}
+	// Appendf writes the %q form directly onto buf.
+	return fmt.Appendf(buf, "%q", text)
+}
+
+// needsQuote reports whether s must be quoted in key=value output: it is
+// empty, or has a rune <= ' ' (space/control), a double quote or a backslash.
+func needsQuote(s string) bool {
+	for _, r := range s {
+		switch {
+		case r <= ' ', r == '"', r == '\\':
+			return true
+		}
+	}
+	return s == ""
+}
+
+// Verify interface compliance at compile time.
+var _ slog.Handler = (*ConsoleHandler)(nil)
--- a/api/internal/node/node.go
+++ b/api/internal/node/node.go
@@ -3,10 +3,12 @@ package node
 import (
 	"context"
 	"fmt"
+	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
+	"sync"
 	"time"

 	"github.com/wild-cloud/wild-central/daemon/internal/config"
@@ -172,6 +174,8 @@ func (m *Manager) Get(instanceName, hostname string) (*Node, error) {

 // Add registers a new node in config.yaml
 func (m *Manager) Add(instanceName string, node *Node) error {
+	slog.Info("adding node", "component", "node", "instance", instanceName, "hostname", node.Hostname, "role", node.Role)
+
 	instancePath := m.GetInstancePath(instanceName)

 	// Validate node data
@@ -263,6 +267,8 @@ func (m *Manager) Add(instanceName string, node *Node) error {
 // Delete removes a node from config.yaml
 // If skipReset is false, the node will be reset before deletion (with 30s timeout)
 func (m *Manager) Delete(instanceName, nodeIdentifier string, skipReset bool) error {
+	slog.Info("deleting node", "component", "node", "instance", instanceName, "node", nodeIdentifier, "skipReset", skipReset)
+
 	// Get node to find hostname
 	node, err := m.Get(instanceName, nodeIdentifier)
 	if err != nil {
@@ -434,6 +440,8 @@ func (m *Manager) Apply(instanceName, nodeIdentifier string, opts ApplyOptions)
 		return fmt.Errorf("failed to update node status: %w", err)
 	}

+	slog.Info("applying node config", "component", "node", "instance", instanceName, "hostname", node.Hostname, "role", node.Role)
+
 	// Apply configuration to node
 	// Determine which IP to use and whether node is in maintenance mode
 	//
@@ -473,6 +481,7 @@ func (m *Manager) Apply(instanceName, nodeIdentifier string, opts ApplyOptions)
 		return fmt.Errorf("failed to update node status: %w", err)
 	}

+	slog.Info("node config applied", "component", "node", "instance", instanceName, "hostname", node.Hostname, "ip", node.TargetIP)
 	return nil
 }

@@ -723,8 +732,108 @@ func (m *Manager) FetchTemplates(instanceName string) error {
 	return m.extractEmbeddedTemplates(destDir)
 }

+// NodeHealth represents the health status of a node, as returned by Manager.Health.
+type NodeHealth struct {
+	Node        string                `json:"node"`        // hostname of the checked node
+	Services    []tools.ServiceStatus `json:"services"`    // as returned by talosctl GetServices
+	DmesgErrors []tools.DmesgError    `json:"dmesgErrors"` // parsed dmesg errors; Health never leaves this nil
+	Healthy     bool                  `json:"healthy"`     // overall verdict computed by Health
+}
+
+// Health checks node health by querying Talos service statuses and scanning dmesg for errors.
+//
+// The node must be applied, out of maintenance mode and have a TargetIP.
+// A GetServices failure fails the whole check. A GetDmesg failure is logged
+// and treated as "no dmesg errors", so Healthy then reflects services only.
+// A service counts against Healthy only when it is unhealthy AND carries a
+// non-empty HealthMessage.
+func (m *Manager) Health(instanceName, nodeIdentifier string) (*NodeHealth, error) {
+	node, err := m.Get(instanceName, nodeIdentifier)
+	if err != nil {
+		return nil, fmt.Errorf("node not found: %w", err)
+	}
+
+	if !node.Applied || node.Maintenance {
+		return nil, fmt.Errorf("health check requires an applied, non-maintenance node")
+	}
+	ip := node.TargetIP
+	if ip == "" {
+		return nil, fmt.Errorf("no IP address available for node %s", node.Hostname)
+	}
+
+	// Fetch services and dmesg concurrently. Each goroutine writes only its
+	// own pair of variables, and wg.Wait orders those writes before the reads.
+	var (
+		services         []tools.ServiceStatus
+		dmesgRaw         string
+		svcErr, dmesgErr error
+		wg               sync.WaitGroup
+	)
+	wg.Add(2)
+	go func() {
+		defer wg.Done()
+		services, svcErr = m.talosctl.GetServices(ip)
+	}()
+	go func() {
+		defer wg.Done()
+		dmesgRaw, dmesgErr = m.talosctl.GetDmesg(ip)
+	}()
+	wg.Wait()
+
+	if svcErr != nil {
+		return nil, fmt.Errorf("failed to get services: %w", svcErr)
+	}
+
+	// Start from an empty (non-nil) slice so the JSON field is [] rather than null.
+	dmesgErrors := []tools.DmesgError{}
+	if dmesgErr != nil {
+		// Best effort: surface the failure in the log instead of dropping it.
+		slog.Warn("failed to read dmesg", "component", "node", "instance", instanceName, "hostname", node.Hostname, "error", dmesgErr)
+	} else if parsed := tools.ParseDmesgErrors(dmesgRaw); parsed != nil {
+		dmesgErrors = parsed
+	}
+
+	// Compute overall health
+	healthy := len(dmesgErrors) == 0
+	for _, svc := range services {
+		if !svc.Healthy && svc.HealthMessage != "" {
+			healthy = false
+			break
+		}
+	}
+
+	return &NodeHealth{Node: node.Hostname, Services: services, DmesgErrors: dmesgErrors, Healthy: healthy}, nil
+}
+
+// Reboot asks talosctl to reboot the node identified by nodeIdentifier.
+// The request goes to the node's TargetIP, or to CurrentIP when TargetIP is
+// empty; having neither is an error. Start and success are logged.
+func (m *Manager) Reboot(instanceName, nodeIdentifier string) error {
+	slog.Info("rebooting node", "component", "node", "instance", instanceName, "node", nodeIdentifier)
+
+	target, lookupErr := m.Get(instanceName, nodeIdentifier)
+	if lookupErr != nil {
+		return fmt.Errorf("node not found: %w", lookupErr)
+	}
+
+	// Prefer the target address; fall back to the current one.
+	addr := target.TargetIP
+	if addr == "" {
+		if addr = target.CurrentIP; addr == "" {
+			return fmt.Errorf("no IP address available for node %s", target.Hostname)
+		}
+	}
+	if rebootErr := m.talosctl.Reboot(addr); rebootErr != nil {
+		return fmt.Errorf("failed to reboot node: %w", rebootErr)
+	}
+	slog.Info("node reboot initiated", "component", "node", "instance", instanceName, "hostname", target.Hostname, "ip", addr)
+	return nil
+}
+
 // Reset resets a node to maintenance mode
 func (m *Manager) Reset(instanceName, nodeIdentifier string) error {
+	slog.Info("resetting node", "component", "node", "instance", instanceName, "node", nodeIdentifier)
+
 	// Get node
 	node, err := m.Get(instanceName, nodeIdentifier)
 	if err != nil {
--- a/api/internal/operations/operations.go
+++ b/api/internal/operations/operations.go
@@ -3,6 +3,7 @@ package operations
 import (
 	"encoding/json"
 	"fmt"
+	"log/slog"
 	"os"
 	"path/filepath"
 	"time"
@@ -72,8 +73,8 @@ type Operation struct {
 	Progress  int               `json:"progress"`          // 0-100
 	Details   *OperationDetails `json:"details,omitempty"` // Operation-specific details
 	LogFile   string            `json:"logFile,omitempty"` // Path to output log file
-	StartedAt time.Time  `json:"started_at"`
-	EndedAt   *time.Time `json:"ended_at,omitempty"`
+	StartedAt time.Time         `json:"started_at"`
+	EndedAt   *time.Time        `json:"ended_at,omitempty"`
 }

 // GetOperationsDir returns the operations directory for an instance
@@ -115,6 +116,8 @@ func (m *Manager) Start(instanceName, opType, target string) (string, error) {
 		return "", err
 	}

+	slog.Info("operation started", "component", "operations", "id", opID, "type", opType, "target", target, "instance", instanceName)
+
 	// Broadcast SSE event if manager is available
 	m.broadcastOperationEvent("operation:started", op)

@@ -164,6 +167,18 @@ func (m *Manager) Update(instanceName, opID, status, message string, progress in
 		return err
 	}

+	// Log terminal status transitions
+	if oldStatus != status {
+		switch status {
+		case "completed":
+			slog.Info("operation completed", "component", "operations", "id", op.ID, "type", op.Type, "target", op.Target, "instance", instanceName)
+		case "failed":
+			slog.Error("operation failed", "component", "operations", "id", op.ID, "type", op.Type, "target", op.Target, "instance", instanceName, "message", message)
+		case "cancelled":
+			slog.Info("operation cancelled", "component", "operations", "id", op.ID, "type", op.Type, "target", op.Target, "instance", instanceName)
+		}
+	}
+
 	// Broadcast appropriate SSE event based on status change
 	if oldStatus != status {
 		switch status {
@@ -302,6 +317,26 @@ func (m *Manager) Delete(instanceName, opID string) error {
 	return os.Remove(opPath)
 }

+// FailOrphaned marks every "running" or "pending" operation of the instance
+// as "failed", keeping its recorded progress. Intended for API startup, after
+// a restart interrupted them. Only a List error is returned; Update errors are logged.
+func (m *Manager) FailOrphaned(instanceName string) error {
+	ops, err := m.List(instanceName)
+	if err != nil {
+		return err
+	}
+	for _, op := range ops {
+		if op.Status != "running" && op.Status != "pending" {
+			continue // not in flight, nothing to clean up
+		}
+		slog.Info("failing orphaned operation", "component", "operations", "id", op.ID, "type", op.Type, "target", op.Target, "instance", instanceName)
+		if updErr := m.Update(instanceName, op.ID, "failed", "API restarted while operation was in progress", op.Progress); updErr != nil {
+			slog.Warn("failed to mark orphaned operation as failed", "component", "operations", "id", op.ID, "error", updErr)
+		}
+	}
+	return nil
+}
+
 // Cleanup removes old completed/failed operations
 func (m *Manager) Cleanup(instanceName string, olderThan time.Duration) error {
 	ops, err := m.List(instanceName)
--- a/api/internal/operations/operations_test.go
+++ b/api/internal/operations/operations_test.go
@@ -0,0 +1,90 @@
+package operations
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// setupTestManager returns a Manager rooted at a fresh temp dir together with
+// the instance name whose operations directory has been pre-created.
+func setupTestManager(t *testing.T) (*Manager, string) {
+	t.Helper()
+
+	const instanceName = "test-cloud"
+	root := t.TempDir()
+	// Create <root>/instances/test-cloud/operations up front.
+	if err := os.MkdirAll(filepath.Join(root, "instances", instanceName, "operations"), 0755); err != nil {
+		t.Fatalf("failed to create ops dir: %v", err)
+	}
+	return NewManager(root), instanceName
+}
+
+// TestFailOrphaned verifies that FailOrphaned marks running and pending
+// operations as failed (with EndedAt set on the running one) and leaves
+// completed and already-failed operations unchanged. Setup and lookup
+// errors abort the test rather than being discarded.
+func TestFailOrphaned(t *testing.T) {
+	m, instanceName := setupTestManager(t)
+
+	// start creates an operation and, when status is non-empty, moves it to
+	// that status; an empty status leaves it as Start created it.
+	start := func(opType, target, status, message string, progress int) string {
+		t.Helper()
+
+		id, err := m.Start(instanceName, opType, target)
+		if err != nil {
+			t.Fatalf("failed to start operation: %v", err)
+		}
+		if status == "" {
+			return id
+		}
+		if err := m.Update(instanceName, id, status, message, progress); err != nil {
+			t.Fatalf("failed to set operation %s to %s: %v", id, status, err)
+		}
+		return id
+	}
+
+	// Create operations in various states
+	cases := []struct {
+		label      string // how the operation is named in failure messages
+		id         string
+		want       string // expected status after FailOrphaned
+		wantEndSet bool   // whether EndedAt must be non-nil afterwards
+	}{
+		{"running", start("backup", "myapp", "running", "Backing up", 50), "failed", true},
+		// Never updated: keeps whatever status Start assigns (expected: pending).
+		{"pending", start("restore", "myapp", "", "", 0), "failed", false},
+		{"completed", start("backup", "otherapp", "completed", "Done", 100), "completed", false},
+		{"already-failed", start("deploy", "otherapp", "failed", "Something broke", 0), "failed", false},
+	}
+
+	// Run FailOrphaned
+	if err := m.FailOrphaned(instanceName); err != nil {
+		t.Fatalf("FailOrphaned failed: %v", err)
+	}
+
+	// Reload each operation and compare against the expectation.
+	for _, tc := range cases {
+		op, err := m.GetByInstance(instanceName, tc.id)
+		if err != nil {
+			// Fatal: op is not usable after a failed lookup.
+			t.Fatalf("failed to load %s op %s: %v", tc.label, tc.id, err)
+		}
+		if op.Status != tc.want {
+			t.Errorf("expected %s op to be %s, got %s", tc.label, tc.want, op.Status)
+		}
+		if tc.wantEndSet && op.EndedAt == nil {
+			t.Errorf("expected %s op to have EndedAt set", tc.label)
+		}
+	}
+}
+
+// TestFailOrphaned_NoOperations checks FailOrphaned on an empty operations directory.
+func TestFailOrphaned_NoOperations(t *testing.T) {
+	m, instanceName := setupTestManager(t)
+	err := m.FailOrphaned(instanceName)
+	if err != nil {
+		t.Fatalf("FailOrphaned on empty dir failed: %v", err)
+	}
+}
--- a/api/internal/pxe/pxe.go
+++ b/api/internal/pxe/pxe.go
@@ -4,6 +4,7 @@ import (
 	"crypto/sha256"
 	"fmt"
 	"io"
+	"log/slog"
 	"net/http"
 	"os"
 	"path/filepath"
@@ -145,6 +146,7 @@ func (m *Manager) DownloadAsset(instanceName, assetType, version, url string) er
 		return fmt.Errorf("failed to move file: %w", err)
 	}

+	slog.Info("PXE asset downloaded", "component", "pxe", "instance", instanceName, "type", assetType, "version", version)
 	return nil
 }

--- a/api/internal/secrets/secrets_test.go
+++ b/api/internal/secrets/secrets_test.go
@@ -90,11 +90,8 @@ func TestGenerateSecret_Uniqueness(t *testing.T) {
 // Test: NewManager creates manager successfully
 func TestNewManager(t *testing.T) {
 	m := NewManager()
-	if m == nil {
-		t.Fatal("NewManager returned nil")
-	}
-	if m.yq == nil {
-		t.Error("Manager.yq is nil")
+	if m == nil || m.yq == nil {
+		t.Fatal("NewManager returned nil or Manager.yq is nil")
 	}
 }

--- a/api/internal/sse/manager.go
+++ b/api/internal/sse/manager.go
@@ -4,7 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
-	"log"
+	"log/slog"
 	"sync"
 	"time"

@@ -42,7 +42,6 @@ type EventFilters struct {
 // Manager manages all SSE connections
 type Manager struct {
 	clients      map[string]map[string]*Client // instanceName -> clientID -> Client
-	register     chan *Client
 	unregister   chan *Client
 	broadcast    chan *Event
 	mu           sync.RWMutex
@@ -53,7 +52,6 @@ type Manager struct {
 func NewManager() *Manager {
 	m := &Manager{
 		clients:      make(map[string]map[string]*Client),
-		register:     make(chan *Client, 100),
 		unregister:   make(chan *Client, 100),
 		broadcast:    make(chan *Event, 1000),
 		rateLimiters: make(map[string]*rate.Limiter),
@@ -62,19 +60,10 @@ func NewManager() *Manager {
 	return m
 }

-// run processes client registration and event broadcasting
+// run processes client unregistration and event broadcasting
 func (m *Manager) run() {
 	for {
 		select {
-		case client := <-m.register:
-			m.mu.Lock()
-			if m.clients[client.InstanceName] == nil {
-				m.clients[client.InstanceName] = make(map[string]*Client)
-			}
-			m.clients[client.InstanceName][client.ID] = client
-			m.mu.Unlock()
-			log.Printf("SSE: Client %s registered for instance %s", client.ID, client.InstanceName)
-
 		case client := <-m.unregister:
 			m.mu.Lock()
 			if clients, ok := m.clients[client.InstanceName]; ok {
@@ -85,7 +74,7 @@ func (m *Manager) run() {
 			}
 			close(client.Channel)
 			m.mu.Unlock()
-			log.Printf("SSE: Client %s unregistered", client.ID)
+			slog.Info("client unregistered", "component", "sse", "client", client.ID)

 		case event := <-m.broadcast:
 			m.mu.RLock()
@@ -102,7 +91,7 @@ func (m *Manager) run() {
 					case client.Channel <- event:
 					default:
 						// Client channel full, skip
-						log.Printf("SSE: Client %s channel full, skipping event", client.ID)
+						slog.Info("client channel full, skipping event", "component", "sse", "client", client.ID)
 					}
 				}
 			}
@@ -114,7 +103,7 @@ func (m *Manager) run() {
 					case client.Channel <- event:
 					default:
 						// Client channel full, skip
-						log.Printf("SSE: Client %s channel full, skipping event", client.ID)
+						slog.Info("client channel full, skipping event", "component", "sse", "client", client.ID)
 					}
 				}
 			}
@@ -207,7 +196,14 @@ func (m *Manager) RegisterClient(instanceName string, filters EventFilters) *Cli
 		Cancel:       cancel,
 	}

-	m.register <- client
+	m.mu.Lock()
+	if m.clients[instanceName] == nil {
+		m.clients[instanceName] = make(map[string]*Client)
+	}
+	m.clients[instanceName][client.ID] = client
+	m.mu.Unlock()
+
+	slog.Info("client registered", "component", "sse", "client", client.ID, "instance", instanceName)
 	return client
 }

@@ -230,7 +226,7 @@ func (m *Manager) Broadcast(event *Event) {
 	select {
 	case m.broadcast <- event:
 	default:
-		log.Printf("SSE: Broadcast channel full, dropping event %s", event.ID)
+		slog.Error("broadcast channel full, dropping event", "component", "sse", "event", event.ID)
 	}
 }

@@ -269,4 +265,4 @@ func generateEventID() string {
 // JSON marshals the event to JSON
 func (e *Event) JSON() ([]byte, error) {
 	return json.Marshal(e)
-}
+}
--- a/api/internal/sse/manager_test.go
+++ b/api/internal/sse/manager_test.go
@@ -349,4 +349,4 @@ func BenchmarkBroadcast(b *testing.B) {
 	for _, client := range clients {
 		manager.UnregisterClient(client)
 	}
-}
+}
--- a/api/internal/sse/watchers.go
+++ b/api/internal/sse/watchers.go
@@ -5,7 +5,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
-	"log"
+	"log/slog"
 	"os/exec"
 	"strings"
 	"sync"
@@ -120,7 +120,7 @@ func (w *KubectlWatcher) Start() error {
 	w.wg.Add(1)
 	go w.watchResource("services", w.parseServiceEvent)

-	log.Printf("SSE: Started kubectl watchers for instance %s", w.instanceName)
+	slog.Info("started kubectl watchers", "component", "sse", "instance", w.instanceName)
 	return nil
 }

@@ -148,13 +148,13 @@ func (w *KubectlWatcher) watchResource(resourceType string, parser func([]byte,

 		stdout, err := cmd.StdoutPipe()
 		if err != nil {
-			log.Printf("SSE: Failed to create stdout pipe for %s watch: %v", resourceType, err)
+			slog.Error("failed to create stdout pipe", "component", "sse", "resource", resourceType, "error", err)
 			w.handleWatchError(resourceType)
 			continue
 		}

 		if err := cmd.Start(); err != nil {
-			log.Printf("SSE: Failed to start %s watch: %v", resourceType, err)
+			slog.Error("failed to start watch", "component", "sse", "resource", resourceType, "error", err)
 			w.handleWatchError(resourceType)
 			continue
 		}
@@ -170,14 +170,14 @@ func (w *KubectlWatcher) watchResource(resourceType string, parser func([]byte,
 		}

 		if err := scanner.Err(); err != nil {
-			log.Printf("SSE: %s watch scanner error: %v", resourceType, err)
+			slog.Error("watch scanner error", "component", "sse", "resource", resourceType, "error", err)
 		}

-		cmd.Wait()
+		_ = cmd.Wait()

 		// If context not cancelled, restart after a delay
 		if w.ctx.Err() == nil {
-			log.Printf("SSE: Restarting %s watcher for instance %s", resourceType, w.instanceName)
+			slog.Info("restarting watcher", "component", "sse", "resource", resourceType, "instance", w.instanceName)
 			time.Sleep(5 * time.Second)
 		}
 	}
@@ -186,7 +186,7 @@ func (w *KubectlWatcher) watchResource(resourceType string, parser func([]byte,
 // parsePodEvent parses pod watch events
 func (w *KubectlWatcher) parsePodEvent(data []byte, resourceType string) {
 	var event struct {
-		Type   string `json:"type"`   // ADDED, MODIFIED, DELETED
+		Type   string `json:"type"` // ADDED, MODIFIED, DELETED
 		Object struct {
 			Metadata struct {
 				Name      string            `json:"name"`
@@ -503,7 +503,7 @@ func (w *KubectlWatcher) handleWatchError(resourceType string) {
 func (w *KubectlWatcher) Stop() {
 	w.cancel()
 	w.wg.Wait()
-	log.Printf("SSE: Stopped kubectl watchers for instance %s", w.instanceName)
+	slog.Info("stopped kubectl watchers", "component", "sse", "instance", w.instanceName)
 }

 // TalosWatcher watches Talos events using talosctl
@@ -532,7 +532,7 @@ func NewTalosWatcher(instanceName, talosconfig, nodeIP string, manager *Manager)
 // Start begins watching Talos events
 func (w *TalosWatcher) Start() error {
 	go w.watchEvents()
-	log.Printf("SSE: Started talos watcher for instance %s", w.instanceName)
+	slog.Info("started talos watcher", "component", "sse", "instance", w.instanceName)
 	return nil
 }

@@ -557,13 +557,13 @@ func (w *TalosWatcher) watchEvents() {

 		stdout, err := cmd.StdoutPipe()
 		if err != nil {
-			log.Printf("SSE: Failed to create stdout pipe for Talos events: %v", err)
+			slog.Error("failed to create stdout pipe for talos events", "component", "sse", "error", err)
 			time.Sleep(10 * time.Second)
 			continue
 		}

 		if err := cmd.Start(); err != nil {
-			log.Printf("SSE: Failed to start Talos event watch: %v", err)
+			slog.Error("failed to start talos event watch", "component", "sse", "error", err)
 			time.Sleep(10 * time.Second)
 			continue
 		}
@@ -599,11 +599,11 @@ func (w *TalosWatcher) watchEvents() {
 			}
 		}

-		cmd.Wait()
+		_ = cmd.Wait()

 		// If context not cancelled, restart after a delay
 		if w.ctx.Err() == nil {
-			log.Printf("SSE: Restarting talos watcher for instance %s", w.instanceName)
+			slog.Info("restarting talos watcher", "component", "sse", "instance", w.instanceName)
 			time.Sleep(10 * time.Second)
 		}
 	}
@@ -612,5 +612,5 @@ func (w *TalosWatcher) watchEvents() {
 // Stop stops the watcher
 func (w *TalosWatcher) Stop() {
 	w.cancel()
-	log.Printf("SSE: Stopped talos watcher for instance %s", w.instanceName)
-}
+	slog.Info("stopped talos watcher", "component", "sse", "instance", w.instanceName)
+}
--- a/api/internal/sse/watchers_test.go
+++ b/api/internal/sse/watchers_test.go
@@ -431,4 +431,4 @@ func BenchmarkJSONParsing(b *testing.B) {
 	for i := 0; i < b.N; i++ {
 		watcher.parsePodEvent([]byte(podJSON), "test-instance")
 	}
-}
+}
--- a/api/internal/tools/kubectl_test.go
+++ b/api/internal/tools/kubectl_test.go
@@ -25,8 +25,7 @@ func TestNewKubectl(t *testing.T) {
 			k := NewKubectl(tt.kubeconfigPath)
 			if k == nil {
 				t.Fatal("NewKubectl() returned nil")
-			}
-			if k.kubeconfigPath != tt.kubeconfigPath {
+			} else if k.kubeconfigPath != tt.kubeconfigPath {
 				t.Errorf("kubeconfigPath = %q, want %q", k.kubeconfigPath, tt.kubeconfigPath)
 			}
 		})
@@ -209,9 +208,8 @@ func TestKubectlGetDeployment(t *testing.T) {
 			if err == nil {
 				if depInfo == nil {
 					t.Fatal("GetDeployment() returned nil without error")
-				}
-				// Desired should be non-negative
-				if depInfo.Desired < 0 {
+				} else if depInfo.Desired < 0 {
+					// Desired should be non-negative
 					t.Errorf("Desired = %d, should be non-negative", depInfo.Desired)
 				}
 			}
@@ -244,19 +242,19 @@ func TestKubectlGetReplicas(t *testing.T) {
 			if err == nil {
 				if replicaInfo == nil {
 					t.Fatal("GetReplicas() returned nil without error")
-				}
-				// All values should be non-negative
-				if replicaInfo.Desired < 0 {
-					t.Error("Desired < 0")
-				}
-				if replicaInfo.Current < 0 {
-					t.Error("Current < 0")
-				}
-				if replicaInfo.Ready < 0 {
-					t.Error("Ready < 0")
-				}
-				if replicaInfo.Available < 0 {
-					t.Error("Available < 0")
+				} else {
+					if replicaInfo.Desired < 0 {
+						t.Error("Desired < 0")
+					}
+					if replicaInfo.Current < 0 {
+						t.Error("Current < 0")
+					}
+					if replicaInfo.Ready < 0 {
+						t.Error("Ready < 0")
+					}
+					if replicaInfo.Available < 0 {
+						t.Error("Available < 0")
+					}
 				}
 			}
 		})
@@ -775,4 +773,3 @@ func TestKubectlGetPodsByLabel(t *testing.T) {
 		})
 	}
 }
-
--- a/api/internal/tools/talosctl.go
+++ b/api/internal/tools/talosctl.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"os/exec"
+	"regexp"
 	"runtime"
 	"strings"
 	"time"
@@ -385,6 +386,30 @@ func (t *Talosctl) Upgrade(nodeIP, image string, preserve bool) error {
 	return nil
 }

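+// Reboot reboots a node without wiping state. Connection errors in the command output are treated as success because the node is expected to drop the connection while rebooting.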
+func (t *Talosctl) Reboot(nodeIP string) error {
+	args := t.buildArgs([]string{
+		"reboot",
+		"--nodes", nodeIP,
+	})
+
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, "talosctl", args...)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		outputStr := string(output)
+		// Connection errors are expected — the node is rebooting
+		if strings.Contains(outputStr, "connection refused") || strings.Contains(outputStr, "Unavailable") || strings.Contains(outputStr, "EOF") {
+			return nil
+		}
+		return fmt.Errorf("talosctl reboot failed: %w\nOutput: %s", err, outputStr)
+	}
+
+	return nil
+}
+
 // Rollback reverts a node to its previous Talos version.
 // Talos uses an A/B image scheme, so rollback restores the previous boot image.
 func (t *Talosctl) Rollback(nodeIP string) error {
@@ -462,3 +487,172 @@ func GetClientInfo() (*ClientInfo, error) {
 		Arch:    arch,
 	}, nil
 }
+
+// ServiceStatus represents the health status of a Talos service
+type ServiceStatus struct {
+	ID            string `json:"id"`
+	State         string `json:"state"`
+	Healthy       bool   `json:"healthy"`
+	HealthMessage string `json:"healthMessage"`
+}
+
+// DmesgError represents a critical error found in kernel messages
+type DmesgError struct {
+	Severity  string `json:"severity"`
+	Message   string `json:"message"`
+	Timestamp string `json:"timestamp"`
+}
+
+// GetServices queries Talos service statuses from a node
+func (t *Talosctl) GetServices(nodeIP string) ([]ServiceStatus, error) {
+	args := t.buildArgs([]string{
+		"service",
+		"--nodes", nodeIP,
+	})
+
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, "talosctl", args...)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		return nil, fmt.Errorf("talosctl service failed: %w\nOutput: %s", err, string(output))
+	}
+
+	return ParseServiceOutput(string(output)), nil
+}
+
+// ParseServiceOutput parses the tabular output of `talosctl service`
+func ParseServiceOutput(output string) []ServiceStatus {
+	var services []ServiceStatus
+	lines := strings.Split(output, "\n")
+
+	for _, line := range lines {
+		// Skip header and empty lines
+		if line == "" || strings.HasPrefix(line, "NODE") {
+			continue
+		}
+
+		fields := strings.Fields(line)
+		// Format: NODE SERVICE STATE HEALTH <duration> ago <last event...>
+		// Minimum: node + service + state + health + duration + "ago" = 6 fields
+		if len(fields) < 6 {
+			continue
+		}
+
+		id := fields[1]
+		state := fields[2]
+		health := fields[3]
+
+		// Extract the last event (everything after the LAST CHANGE column).
+		// Fields 4 and 5 are the duration and the literal "ago" (e.g. "42m14s ago"); the event text starts at field 6.
+		var healthMessage string
+		if len(fields) > 6 {
+			healthMessage = strings.Join(fields[6:], " ")
+		}
+
+		services = append(services, ServiceStatus{
+			ID:            id,
+			State:         state,
+			Healthy:       health == "OK",
+			HealthMessage: healthMessage,
+		})
+	}
+
+	return services
+}
+
+// GetDmesg retrieves kernel messages from a node
+func (t *Talosctl) GetDmesg(nodeIP string) (string, error) {
+	args := t.buildArgs([]string{
+		"dmesg",
+		"--nodes", nodeIP,
+	})
+
+	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, "talosctl", args...)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		return "", fmt.Errorf("talosctl dmesg failed: %w\nOutput: %s", err, string(output))
+	}
+
+	return string(output), nil
+}
+
+// dmesgErrorPatterns lists substrings, matched case-insensitively, that indicate hardware issues in dmesg output.
+var dmesgErrorPatterns = []string{
+	"I/O error",
+	"Medium Error",
+	"failed command:",
+	"auto reallocate failed",
+	"memory error",
+	"machine check",
+	"ECC error",
+}
+
+var ataErrorRegex = regexp.MustCompile(`(?i)ata\d+.*error`)
+
+// ParseDmesgErrors scans dmesg output for critical hardware errors, reporting each distinct message once.
+func ParseDmesgErrors(raw string) []DmesgError {
+	var errors []DmesgError
+	seen := make(map[string]bool)
+
+	for _, line := range strings.Split(raw, "\n") {
+		if line == "" {
+			continue
+		}
+
+		matched := false
+		lower := strings.ToLower(line)
+		for _, pattern := range dmesgErrorPatterns {
+			if strings.Contains(lower, strings.ToLower(pattern)) {
+				matched = true
+				break
+			}
+		}
+		if !matched && ataErrorRegex.MatchString(line) {
+			matched = true
+		}
+		if !matched {
+			continue
+		}
+
+		// Extract timestamp: format is "IP: facility: level: [TIMESTAMP]: message"
+		timestamp, message := parseDmesgLine(line)
+
+		// Deduplicate identical messages
+		if seen[message] {
+			continue
+		}
+		seen[message] = true
+
+		errors = append(errors, DmesgError{
+			Severity:  "error",
+			Message:   message,
+			Timestamp: timestamp,
+		})
+	}
+
+	return errors
+}
+
+// parseDmesgLine extracts the timestamp and message from a talosctl dmesg line; if no "[...]:" timestamp is found, the whole line is returned as the message.
+// Format: "192.168.8.32: kern:     err: [2026-05-25T07:12:06.034Z]: I/O error, dev sdb..."
+func parseDmesgLine(line string) (timestamp, message string) {
+	// Find timestamp in brackets
+	start := strings.Index(line, "[")
+	end := strings.Index(line, "]:")
+	if start >= 0 && end > start {
+		timestamp = line[start+1 : end]
+		// Message is everything after "]: "
+		if end+2 < len(line) {
+			message = strings.TrimSpace(line[end+2:])
+		}
+	}
+	if message == "" {
+		message = line
+	}
+	return
+}
--- a/api/internal/tools/talosctl_test.go
+++ b/api/internal/tools/talosctl_test.go
@@ -11,8 +11,7 @@ func TestNewTalosctl(t *testing.T) {
 		tc := NewTalosctl()
 		if tc == nil {
 			t.Fatal("NewTalosctl() returned nil")
-		}
-		if tc.talosconfigPath != "" {
+		} else if tc.talosconfigPath != "" {
 			t.Error("talosconfigPath should be empty for NewTalosctl()")
 		}
 	})
@@ -22,8 +21,7 @@ func TestNewTalosctl(t *testing.T) {
 		tc := NewTalosconfigWithConfig(configPath)
 		if tc == nil {
 			t.Fatal("NewTalosconfigWithConfig() returned nil")
-		}
-		if tc.talosconfigPath != configPath {
+		} else if tc.talosconfigPath != configPath {
 			t.Errorf("talosconfigPath = %q, want %q", tc.talosconfigPath, configPath)
 		}
 	})
@@ -433,9 +431,9 @@ Server:
 			want: "v1.11.5",
 		},
 		{
-			name: "fallback to Talos line when no Tag present",
+			name:   "fallback to Talos line when no Tag present",
 			output: `Talos v1.12.0`,
-			want: "v1.12.0",
+			want:   "v1.12.0",
 		},
 	}

@@ -619,6 +617,183 @@ func TestGetClientInfo(t *testing.T) {
 	}
 }

+func TestParseServiceOutput(t *testing.T) {
+	tests := []struct {
+		name     string
+		output   string
+		wantLen  int
+		checkSvc func(t *testing.T, services []ServiceStatus)
+	}{
+		{
+			name: "healthy node",
+			output: `NODE           SERVICE      STATE     HEALTH   LAST CHANGE      LAST EVENT
+192.168.8.33   apid         Running   OK       172h15m25s ago   Health check successful
+192.168.8.33   etcd         Running   OK       172h14m56s ago   Health check successful
+192.168.8.33   kubelet      Running   OK       172h15m16s ago   Health check successful`,
+			wantLen: 3,
+			checkSvc: func(t *testing.T, services []ServiceStatus) {
+				for _, svc := range services {
+					if !svc.Healthy {
+						t.Errorf("service %s should be healthy", svc.ID)
+					}
+					if svc.State != "Running" {
+						t.Errorf("service %s state = %q, want Running", svc.ID, svc.State)
+					}
+				}
+			},
+		},
+		{
+			name: "unhealthy etcd",
+			output: `NODE           SERVICE      STATE     HEALTH   LAST CHANGE   LAST EVENT
+192.168.8.32   etcd         Running   Fail     42m14s ago    Health check failed: context deadline exceeded
+192.168.8.32   kubelet      Running   OK       37m42s ago    Health check successful`,
+			wantLen: 2,
+			checkSvc: func(t *testing.T, services []ServiceStatus) {
+				for _, svc := range services {
+					if svc.ID == "etcd" {
+						if svc.Healthy {
+							t.Error("etcd should be unhealthy")
+						}
+						if svc.HealthMessage != "Health check failed: context deadline exceeded" {
+							t.Errorf("etcd health message = %q", svc.HealthMessage)
+						}
+					}
+					if svc.ID == "kubelet" && !svc.Healthy {
+						t.Error("kubelet should be healthy")
+					}
+				}
+			},
+		},
+		{
+			name: "services with unknown health",
+			output: `NODE           SERVICE      STATE     HEALTH   LAST CHANGE      LAST EVENT
+192.168.8.32   dashboard    Running   ?        42m47s ago       Process Process(["/sbin/dashboard"]) started with PID 2237`,
+			wantLen: 1,
+			checkSvc: func(t *testing.T, services []ServiceStatus) {
+				if services[0].Healthy {
+					t.Error("service with ? health should not be marked healthy")
+				}
+			},
+		},
+		{
+			name:    "empty output",
+			output:  "",
+			wantLen: 0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			services := ParseServiceOutput(tt.output)
+			if len(services) != tt.wantLen {
+				t.Errorf("ParseServiceOutput() returned %d services, want %d", len(services), tt.wantLen)
+				return
+			}
+			if tt.checkSvc != nil {
+				tt.checkSvc(t, services)
+			}
+		})
+	}
+}
+
+func TestParseDmesgErrors(t *testing.T) {
+	tests := []struct {
+		name    string
+		input   string
+		wantLen int
+		check   func(t *testing.T, errors []DmesgError)
+	}{
+		{
+			name: "disk I/O errors",
+			input: `192.168.8.32: kern:     err: [2026-05-25T07:12:06.034Z]: I/O error, dev sdb, sector 4873848
+192.168.8.32: kern:    info: [2026-05-25T07:12:06.040Z]: sd 1:0:0:0: [sdb] Sense Key : Medium Error [current]
+192.168.8.32: kern:     err: [2026-05-25T07:12:10.886Z]: ata1.00: failed command: READ FPDMA QUEUED
+192.168.8.32: kern:    info: [2026-05-25T07:12:14.072Z]: sd 1:0:0:0: Add. Sense: Unrecovered read error - auto reallocate failed`,
+			wantLen: 4,
+			check: func(t *testing.T, errors []DmesgError) {
+				if errors[0].Timestamp != "2026-05-25T07:12:06.034Z" {
+					t.Errorf("timestamp = %q", errors[0].Timestamp)
+				}
+				if errors[0].Severity != "error" {
+					t.Errorf("severity = %q, want error", errors[0].Severity)
+				}
+			},
+		},
+		{
+			name: "ata error pattern",
+			input: `192.168.8.32: kern:     err: [2026-05-25T07:12:06.034Z]: ata1.00: error: { UNC }`,
+			wantLen: 1,
+		},
+		{
+			name: "no errors in normal output",
+			input: `192.168.8.32: kern:    info: [2026-05-25T07:11:00.000Z]: Linux version 6.18.24-talos
+192.168.8.32: kern:    info: [2026-05-25T07:11:00.100Z]: Command line: init_on_alloc=1
+192.168.8.32: kern:    info: [2026-05-25T07:11:01.000Z]: sdb: sdb1 sdb2 sdb3 sdb4`,
+			wantLen: 0,
+		},
+		{
+			name:    "empty input",
+			input:   "",
+			wantLen: 0,
+		},
+		{
+			name: "deduplicates identical messages",
+			input: `192.168.8.32: kern:     err: [2026-05-25T07:12:06.034Z]: I/O error, dev sdb, sector 4873848
+192.168.8.32: kern:     err: [2026-05-25T07:12:10.034Z]: I/O error, dev sdb, sector 4873848`,
+			wantLen: 1,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			errors := ParseDmesgErrors(tt.input)
+			if len(errors) != tt.wantLen {
+				t.Errorf("ParseDmesgErrors() returned %d errors, want %d", len(errors), tt.wantLen)
+				for _, e := range errors {
+					t.Logf("  error: %s", e.Message)
+				}
+				return
+			}
+			if tt.check != nil {
+				tt.check(t, errors)
+			}
+		})
+	}
+}
+
+func TestParseDmesgLine(t *testing.T) {
+	tests := []struct {
+		name          string
+		line          string
+		wantTimestamp string
+		wantMessage   string
+	}{
+		{
+			name:          "standard talos dmesg format",
+			line:          "192.168.8.32: kern:     err: [2026-05-25T07:12:06.034Z]: I/O error, dev sdb, sector 4873848",
+			wantTimestamp: "2026-05-25T07:12:06.034Z",
+			wantMessage:   "I/O error, dev sdb, sector 4873848",
+		},
+		{
+			name:        "line without brackets",
+			line:        "some plain log line",
+			wantMessage: "some plain log line",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ts, msg := parseDmesgLine(tt.line)
+			if ts != tt.wantTimestamp {
+				t.Errorf("timestamp = %q, want %q", ts, tt.wantTimestamp)
+			}
+			if msg != tt.wantMessage {
+				t.Errorf("message = %q, want %q", msg, tt.wantMessage)
+			}
+		})
+	}
+}
+
 // Helper function for interface filtering test
 func containsAny(s string, substrs []string) bool {
 	for _, substr := range substrs {
--- a/api/internal/tools/yq_test.go
+++ b/api/internal/tools/yq_test.go
@@ -12,8 +12,7 @@ func TestNewYQ(t *testing.T) {
 		yq := NewYQ()
 		if yq == nil {
 			t.Fatal("NewYQ() returned nil")
-		}
-		if yq.yqPath == "" {
+		} else if yq.yqPath == "" {
 			t.Error("yqPath should not be empty")
 		}
 	})
--- a/api/main.go
+++ b/api/main.go
@@ -2,7 +2,7 @@ package main

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"net/http"
 	"os"
 	"os/signal"
@@ -16,6 +16,8 @@ import (
 	v1 "github.com/wild-cloud/wild-central/daemon/internal/api/v1"
 	"github.com/wild-cloud/wild-central/daemon/internal/backup"
 	"github.com/wild-cloud/wild-central/daemon/internal/instance"
+	"github.com/wild-cloud/wild-central/daemon/internal/logging"
+	"github.com/wild-cloud/wild-central/daemon/internal/operations"
 )

 var startTime time.Time
@@ -33,6 +35,11 @@ func splitAndTrim(s string, sep string) []string {
 }

 func main() {
+	// Initialize structured logging
+	slog.SetDefault(slog.New(logging.NewConsoleHandler(os.Stderr, &slog.HandlerOptions{
+		Level: slog.LevelInfo,
+	})))
+
 	// Record start time
 	startTime = time.Now()

@@ -45,27 +52,31 @@ func main() {
 	// Get apps directory from environment or use default
 	appsDir := os.Getenv("WILD_DIRECTORY")
 	if appsDir == "" {
-		// Default apps directory
 		appsDir = "/opt/wild-cloud/apps"
-		log.Printf("WILD_DIRECTORY not set, using default apps directory: %s", appsDir)
-	} else {
-		// If WILD_DIRECTORY is set, use it as-is for backward compatibility
-		// (it might point to the old directory structure with apps/ subdirectory)
-		log.Printf("Using WILD_DIRECTORY for apps: %s", appsDir)
 	}
+	slog.Info("configured directories", "dataDir", dataDir, "appsDir", appsDir)

 	// Create API handler with all dependencies
 	api, err := v1.NewAPI(dataDir, appsDir)
 	if err != nil {
-		log.Fatalf("Failed to initialize API: %v", err)
+		slog.Error("failed to initialize API", "error", err)
+		os.Exit(1)
+	}
+
+	// Fail any operations left running from a previous API process
+	instanceMgr := instance.NewManager(dataDir)
+	opsMgr := operations.NewManager(dataDir)
+	if instances, err := instanceMgr.ListInstances(); err == nil {
+		for _, name := range instances {
+			if err := opsMgr.FailOrphaned(name); err != nil {
+				slog.Warn("failed to clean orphaned operations", "instance", name, "error", err)
+			}
+		}
 	}

 	// Start central status SSE broadcaster
 	api.StartCentralStatusBroadcaster(startTime)
-	log.Println("Central status broadcaster started")
-
-	// Start backup scheduler
-	instanceMgr := instance.NewManager(dataDir)
+	slog.Info("central status broadcaster started")
 	scheduler := backup.NewScheduler(dataDir, instanceMgr)
 	scheduler.Start()

@@ -89,9 +100,8 @@ func main() {
 	var allowedOrigins []string

 	if corsOrigins := os.Getenv("WILD_CORS_ORIGINS"); corsOrigins != "" {
-		// Use explicitly configured origins
 		allowedOrigins = splitAndTrim(corsOrigins, ",")
-		log.Printf("CORS configured with explicit origins: %v", allowedOrigins)
+		slog.Info("CORS configured with explicit origins", "origins", allowedOrigins)
 	} else {
 		// Auto-detect origins based on hostname
 		allowedOrigins = []string{
@@ -116,7 +126,7 @@ func main() {
 				fmt.Sprintf("http://%s:5173", hostname),
 				fmt.Sprintf("http://%s:5174", hostname),
 			)
-			log.Printf("Added hostname-based CORS origins for: %s", hostname)
+			slog.Info("added hostname-based CORS origins", "hostname", hostname)
 		}

 		// Add development server ports
@@ -129,7 +139,7 @@ func main() {
 			"http://127.0.0.1:3000",
 		)

-		log.Printf("CORS configured with auto-detected origins: %v", allowedOrigins)
+		slog.Info("CORS configured with auto-detected origins", "count", len(allowedOrigins))
 	}

 	corsHandler := cors.New(cors.Options{
@@ -163,9 +173,7 @@ func main() {
 	port := 5055

 	addr := fmt.Sprintf("%s:%d", host, port)
-	log.Printf("Starting wild-central daemon on %s", addr)
-	log.Printf("Data directory: %s", dataDir)
-	log.Printf("Apps directory: %s", appsDir)
+	slog.Info("daemon started", "addr", addr)

 	// Set up signal handling for graceful shutdown
 	sigChan := make(chan os.Signal, 1)
@@ -174,13 +182,14 @@ func main() {
 	// Start HTTP server in goroutine
 	go func() {
 		if err := http.ListenAndServe(addr, handler); err != nil {
-			log.Fatal("Server failed to start:", err)
+			slog.Error("server failed to start", "error", err)
+			os.Exit(1)
 		}
 	}()

 	// Wait for shutdown signal
 	<-sigChan
-	log.Println("Shutting down gracefully...")
+	slog.Info("shutting down")
 	scheduler.Stop()
-	log.Println("Shutdown complete")
+	slog.Info("shutdown complete")
 }
--- a/api/test/e2e/lib.sh
+++ b/api/test/e2e/lib.sh
@@ -160,6 +160,19 @@ api_put() {
  rm -f "$tmpfile"
 }

+# Makes a PATCH request. Sets HTTP_CODE and RESP globals.
+api_patch() {
+  local path="$1"
+  local body="$2"
+  local tmpfile
+  tmpfile=$(mktemp)
+  HTTP_CODE=$(curl -s -w '%{http_code}' -o "$tmpfile" \
+    -X PATCH -H "Content-Type: application/json" \
+    -d "$body" "${API_URL}${path}")
+  RESP=$(cat "$tmpfile")
+  rm -f "$tmpfile"
+}
+
 # Makes a DELETE request. Sets HTTP_CODE and RESP globals.
 api_delete() {
  local path="$1"
--- a/api/test/e2e/tests/05-config-and-drift.sh
+++ b/api/test/e2e/tests/05-config-and-drift.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+# Test: Config changes and drift detection
+# Verifies: PATCH config, compilation drift detected, compile clears drift, deploy succeeds
+# Idempotent: restores original config at end
+# Note: Uses db.name (not storage) because PVC storage can only expand, never shrink
+
+APP_CONFIG_PATH="/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/config"
+APP_ENHANCED_PATH="/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/enhanced"
+DB_INIT_FILE="${DATA_DIR}/instances/${INSTANCE}/apps/${APP_NAME}/db-init-job.yaml"
+
+# --- Read current config and capture original db.name ---
+
+test_start "Config: Read current config"
+api_get "$APP_CONFIG_PATH"
+assert_http "200" "GET app config should return 200"
+
+ORIGINAL_DB_NAME=$(echo "$RESP" | jq -r '.db.name // empty' 2>/dev/null)
+if [[ -z "$ORIGINAL_DB_NAME" ]]; then
+  ORIGINAL_DB_NAME="e2e_test_app"
+fi
+
+# --- PATCH config: change db.name ---
+
+test_start "Config: PATCH db.name to e2e_drift_test"
+api_patch "$APP_CONFIG_PATH" '{"config":{"db":{"name":"e2e_drift_test"}}}'
+assert_http "200" "PATCH config should return 200"
+
+test_start "Config: Verify config changed"
+api_get "$APP_CONFIG_PATH"
+NEW_DB_NAME=$(echo "$RESP" | jq -r '.db.name // empty' 2>/dev/null)
+assert_eq "$NEW_DB_NAME" "e2e_drift_test" "db.name should be e2e_drift_test after PATCH"
+
+# --- Check drift: config changed but not recompiled ---
+
+test_start "Config: Drift detected after config change"
+api_get "$APP_ENHANCED_PATH"
+COMP_DRIFTED=$(echo "$RESP" | jq -r '.drift.compilation.drifted // false' 2>/dev/null)
+assert_eq "$COMP_DRIFTED" "true" "Compilation drift should be detected"
+
+# --- Compile to clear compilation drift ---
+
+test_start "Config: Compile clears compilation drift"
+api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
+assert_http "200" "Compile should return 200"
+
+test_start "Config: Verify no compilation drift after compile"
+api_get "$APP_ENHANCED_PATH"
+COMP_DRIFTED=$(echo "$RESP" | jq -r '.drift.compilation.drifted // false' 2>/dev/null)
+assert_eq "$COMP_DRIFTED" "false" "Compilation drift should be cleared after compile"
+
+# --- Verify compiled db-init-job.yaml has new db name ---
+
+test_start "Config: Compiled db-init-job.yaml has e2e_drift_test"
+if grep -q "e2e_drift_test" "$DB_INIT_FILE" 2>/dev/null; then
+  test_pass
+else
+  test_fail "db-init-job.yaml should contain e2e_drift_test after compile"
+fi
+
+# --- Cleanup: restore original db.name, recompile, deploy ---
+
+echo "  Restoring original db.name (${ORIGINAL_DB_NAME})..."
+api_patch "$APP_CONFIG_PATH" "{\"config\":{\"db\":{\"name\":\"${ORIGINAL_DB_NAME}\"}}}"
+api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
+
+test_start "Config: Deploy with restored config"
+if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Deploy failed when restoring original config"
+fi
+
+test_start "Config: Pods ready after restore"
+if wait_for_pods "$APP_NAME" 120; then
+  test_pass
+else
+  test_fail "Pods not ready after config restore deploy"
+fi
--- a/api/test/e2e/tests/06-fetch-and-update.sh
+++ b/api/test/e2e/tests/06-fetch-and-update.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Test: Fetch from Wild Directory and redeploy
+# Verifies: fetch re-copies package from source, deploy succeeds after fetch
+# Idempotent: leaves app in same state as before
+
+MANIFEST_PATH="${DATA_DIR}/instances/${INSTANCE}/apps/${APP_NAME}/manifest.yaml"
+PACKAGE_DIR="${DATA_DIR}/instances/${INSTANCE}/apps/${APP_NAME}/.package"
+
+# --- Record current version ---
+
+test_start "Fetch: Record current version"
+CURRENT_VERSION=$(grep '^version:' "$MANIFEST_PATH" 2>/dev/null | head -1 | awk '{print $2}')
+assert_not_empty "$CURRENT_VERSION" "Should have a current version in manifest"
+
+# --- Fetch from Wild Directory ---
+
+test_start "Fetch: Re-fetch from source"
+api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/fetch"
+assert_http "200" "Fetch should return 200"
+
+# --- Verify .package directory exists (source backup) ---
+
+test_start "Fetch: .package directory exists after fetch"
+if [[ -d "$PACKAGE_DIR" ]]; then
+  test_pass
+else
+  test_fail ".package directory should exist after fetch"
+fi
+
+# --- Verify manifest version still present ---
+
+test_start "Fetch: Version preserved after fetch"
+AFTER_VERSION=$(grep '^version:' "$MANIFEST_PATH" 2>/dev/null | head -1 | awk '{print $2}')
+assert_not_empty "$AFTER_VERSION" "Version should still be present after fetch"
+
+# --- Check source drift is cleared ---
+
+test_start "Fetch: No source drift after fetch"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/enhanced"
+SRC_DRIFTED=$(echo "$RESP" | jq -r '.drift.source.drifted // false' 2>/dev/null)
+assert_eq "$SRC_DRIFTED" "false" "Source drift should be false after fresh fetch"
+
+# --- Deploy after fetch ---
+
+test_start "Fetch: Deploy after fetch"
+if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Deploy failed after fetch"
+fi
+
+test_start "Fetch: Pods ready after deploy"
+if wait_for_pods "$APP_NAME" 120; then
+  test_pass
+else
+  test_fail "Pods not ready after fetch+deploy"
+fi
+
+# --- Verify status OK ---
+
+test_start "Fetch: Status OK after fetch+deploy"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/status"
+assert_http "200" "Status should return 200 after fetch+deploy"
--- a/api/test/e2e/tests/07-app-dependencies.sh
+++ b/api/test/e2e/tests/07-app-dependencies.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+# Test: App dependencies — add with explicit requiredAppMappings, verify resolution
+# Verifies: dependency mappings resolve correctly, secrets from deps are present
+# Idempotent: deletes and re-adds app, leaves it deployed
+
+MANIFEST_PATH="${DATA_DIR}/instances/${INSTANCE}/apps/${APP_NAME}/manifest.yaml"
+
+# --- Delete existing app to test fresh add with mappings ---
+
+echo "  Deleting ${APP_NAME} to test dependency add..."
+start_async_delete_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}" "$DELETE_TIMEOUT" || true
+
+# Wait for namespace to fully terminate
+WAIT=0
+while (( WAIT < 60 )); do
+  NS_STATUS=$($KC get namespace "$APP_NAME" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
+  if [[ "$NS_STATUS" == "NotFound" ]]; then
+    break
+  fi
+  sleep 5
+  WAIT=$((WAIT + 5))
+done
+
+# --- Add with explicit dependency mapping ---
+
+test_start "Deps: Add app with requiredAppMappings"
+api_post "/api/v1/instances/${INSTANCE}/apps" \
+  "{\"name\":\"${APP_NAME}\",\"requiredAppMappings\":{\"postgres\":\"postgres\"}}"
+assert_http_one_of "200 201" "Add with mappings should succeed"
+
+# --- Verify manifest has installedAs ---
+
+test_start "Deps: Manifest has installedAs for postgres"
+if grep -q "installedAs: postgres" "$MANIFEST_PATH" 2>/dev/null; then
+  test_pass
+else
+  test_fail "manifest.yaml should have installedAs: postgres"
+fi
+
+# --- Verify config has a non-empty db.host (expected to come from the postgres dependency) ---
+
+test_start "Deps: Config has db.host"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/config"
+DB_HOST=$(echo "$RESP" | jq -r '.db.host // empty' 2>/dev/null)
+assert_not_empty "$DB_HOST" "db.host should be set from postgres dependency"
+
+# --- Deploy ---
+
+test_start "Deps: Deploy app with dependencies"
+if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Deploy failed with dependency mappings"
+fi
+
+test_start "Deps: Pods ready"
+if wait_for_pods "$APP_NAME" 120; then
+  test_pass
+else
+  test_fail "Pods not ready after dep deploy"
+fi
+
+# --- Verify K8s secret has postgres.password from dependency ---
+
+test_start "Deps: K8s secret has postgres.password key"
+SECRET_KEYS=$($KC get secret "${APP_NAME}-secrets" -n "$APP_NAME" -o jsonpath='{.data}' 2>/dev/null)
+if echo "$SECRET_KEYS" | grep -q "postgres.password" 2>/dev/null; then
+  test_pass
+else
+  test_fail "K8s secret should contain postgres.password from dependency"
+fi
--- a/api/test/e2e/tests/08-secrets-rotation.sh
+++ b/api/test/e2e/tests/08-secrets-rotation.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+# Test: Secrets rotation — change a secret, redeploy, verify in cluster
+# Verifies: PUT secrets, redeploy applies new secret to K8s
+# Idempotent: restores original secret at end
+
+SECRETS_PATH="/api/v1/instances/${INSTANCE}/secrets"
+
+# --- Read current secrets ---
+
+test_start "Secrets: Read raw secrets"
+api_get "${SECRETS_PATH}?raw=true"
+assert_http "200" "GET raw secrets should return 200"
+
+ORIGINAL_SECRETS="$RESP"
+ORIGINAL_PASSWORD=$(echo "$RESP" | jq -r '.apps."e2e-test-app".dbPassword // empty' 2>/dev/null)
+
+test_start "Secrets: Has dbPassword for e2e-test-app"
+assert_not_empty "$ORIGINAL_PASSWORD" "Should have a dbPassword for e2e-test-app"
+
+# --- Generate and set new password ---
+
+NEW_PASSWORD="e2e-rotated-$(date +%s)"
+
+test_start "Secrets: Rotate dbPassword"
+# Build modified secrets document with jq
+MODIFIED_SECRETS=$(echo "$ORIGINAL_SECRETS" | jq --arg pw "$NEW_PASSWORD" \
+  '.apps."e2e-test-app".dbPassword = $pw')
+api_put "${SECRETS_PATH}" "$MODIFIED_SECRETS"
+assert_http "200" "PUT secrets should return 200"
+
+# --- Verify secret stored ---
+
+test_start "Secrets: Verify new password stored"
+api_get "${SECRETS_PATH}?raw=true"
+STORED_PASSWORD=$(echo "$RESP" | jq -r '.apps."e2e-test-app".dbPassword // empty' 2>/dev/null)
+assert_eq "$STORED_PASSWORD" "$NEW_PASSWORD" "Stored password should match rotated value"
+
+# --- Compile and deploy to push new secret to cluster ---
+
+api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
+
+test_start "Secrets: Deploy after rotation"
+if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Deploy failed after secret rotation"
+fi
+
+wait_for_pods "$APP_NAME" 120 || true
+
+# --- Verify K8s secret updated ---
+
+test_start "Secrets: K8s secret has rotated password"
+K8S_PASSWORD=$($KC get secret "${APP_NAME}-secrets" -n "$APP_NAME" \
+  -o jsonpath='{.data.dbPassword}' 2>/dev/null | base64 -d 2>/dev/null)
+assert_eq "$K8S_PASSWORD" "$NEW_PASSWORD" "K8s secret should have the rotated password"
+
+# --- Cleanup: restore original secrets ---
+
+echo "  Restoring original secrets..."
+api_put "${SECRETS_PATH}" "$ORIGINAL_SECRETS"
+api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
+
+test_start "Secrets: Deploy with restored secrets"
+if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Deploy failed when restoring original secrets"
+fi
+
+wait_for_pods "$APP_NAME" 120 || true
--- a/api/test/e2e/tests/09-dep-config-propagation.sh
+++ b/api/test/e2e/tests/09-dep-config-propagation.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+# Test: Dependency config propagation — change db.name, recompile, verify in manifests
+# Verifies: config change propagates to compiled templates after compile
+# Idempotent: restores original config at end
+
+APP_CONFIG_PATH="/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/config"
+
+# --- Read current db.name ---
+
+test_start "DepConfig: Read current db.name"
+api_get "$APP_CONFIG_PATH"
+assert_http "200" "GET app config should return 200"
+
+ORIGINAL_DB_NAME=$(echo "$RESP" | jq -r '.db.name // empty' 2>/dev/null)
+if [[ -z "$ORIGINAL_DB_NAME" ]]; then
+  ORIGINAL_DB_NAME="e2e_test_app"
+fi
+
+# --- PATCH db.name to a new value ---
+
+test_start "DepConfig: PATCH db.name"
+api_patch "$APP_CONFIG_PATH" '{"config":{"db":{"name":"e2e_test_app_v2"}}}'
+assert_http "200" "PATCH db.name should return 200"
+
+test_start "DepConfig: Verify db.name changed"
+api_get "$APP_CONFIG_PATH"
+NEW_DB_NAME=$(echo "$RESP" | jq -r '.db.name // empty' 2>/dev/null)
+assert_eq "$NEW_DB_NAME" "e2e_test_app_v2" "db.name should be e2e_test_app_v2"
+
+# --- Compile ---
+
+test_start "DepConfig: Compile after config change"
+api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
+assert_http "200" "Compile should return 200"
+
+# --- Verify compiled db-init-job.yaml has new db name ---
+
+test_start "DepConfig: Compiled db-init-job.yaml has e2e_test_app_v2"
+DB_INIT_FILE="${DATA_DIR}/instances/${INSTANCE}/apps/${APP_NAME}/db-init-job.yaml"
+if grep -q "e2e_test_app_v2" "$DB_INIT_FILE" 2>/dev/null; then
+  test_pass
+else
+  test_fail "db-init-job.yaml should contain e2e_test_app_v2 after compile"
+fi
+
+# --- Check no compilation drift (we just compiled) ---
+
+test_start "DepConfig: No compilation drift after compile"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/enhanced"
+COMP_DRIFTED=$(echo "$RESP" | jq -r '.drift.compilation.drifted // false' 2>/dev/null)
+assert_eq "$COMP_DRIFTED" "false" "No compilation drift expected right after compile"
+
+# --- Cleanup: restore original db.name, recompile, deploy ---
+
+echo "  Restoring original db.name (${ORIGINAL_DB_NAME})..."
+api_patch "$APP_CONFIG_PATH" "{\"config\":{\"db\":{\"name\":\"${ORIGINAL_DB_NAME}\"}}}"
+api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
+
+test_start "DepConfig: Deploy with restored config"
+if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Deploy failed when restoring original config"
+fi
+
+wait_for_pods "$APP_NAME" 120 || true
--- a/api/test/e2e/tests/10-delete-and-readd.sh
+++ b/api/test/e2e/tests/10-delete-and-readd.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# Test: Delete and re-add round trip
+# Verifies: full lifecycle — delete, verify gone, re-add, deploy, verify working
+# Idempotent: leaves app deployed for subsequent tests
+
+# --- Verify app is currently deployed ---
+
+test_start "DeleteReadd: App is deployed"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/status"
+assert_http "200" "App should be deployed before delete test"
+
+# --- Delete ---
+
+test_start "DeleteReadd: Delete app"
+if start_async_delete_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}" "$DELETE_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Delete failed"
+fi
+
+# --- Wait for namespace gone ---
+
+echo "  Waiting for namespace cleanup..."
+WAIT=0
+while (( WAIT < 60 )); do
+  NS_STATUS=$($KC get namespace "$APP_NAME" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
+  if [[ "$NS_STATUS" == "NotFound" ]]; then
+    break
+  fi
+  sleep 5
+  WAIT=$((WAIT + 5))
+done
+
+# --- Verify app is gone ---
+
+test_start "DeleteReadd: App gone after delete"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/status"
+if [[ "$HTTP_CODE" == "404" || "$HTTP_CODE" == "500" ]]; then
+  test_pass
+elif [[ "$HTTP_CODE" == "200" ]]; then
+  APP_STATUS=$(echo "$RESP" | jq -r '.status // empty' 2>/dev/null)
+  if [[ "$APP_STATUS" == "not-added" || "$APP_STATUS" == "not-deployed" ]]; then
+    test_pass
+  else
+    test_fail "App still appears as deployed after delete (status: ${APP_STATUS})"
+  fi
+else
+  test_fail "Unexpected HTTP ${HTTP_CODE}"
+fi
+
+# --- Re-add ---
+
+test_start "DeleteReadd: Re-add app"
+api_post "/api/v1/instances/${INSTANCE}/apps" "{\"name\":\"${APP_NAME}\"}"
+assert_http_one_of "200 201" "Re-add should succeed"
+
+# --- Verify config written ---
+
+test_start "DeleteReadd: Config exists after re-add"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/config"
+APP_NAMESPACE=$(echo "$RESP" | jq -r '.namespace // empty' 2>/dev/null)
+assert_eq "$APP_NAMESPACE" "e2e-test-app" "Config namespace should be set after re-add"
+
+# --- Deploy ---
+
+test_start "DeleteReadd: Deploy after re-add"
+if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Deploy failed after re-add"
+fi
+
+test_start "DeleteReadd: Pods ready after re-add deploy"
+if wait_for_pods "$APP_NAME" 120; then
+  test_pass
+else
+  test_fail "Pods not ready after re-add deploy"
+fi
+
+# --- Verify status OK ---
+
+test_start "DeleteReadd: Status OK after re-add"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/status"
+assert_http "200" "Status should return 200 after re-add"
--- a/api/test/e2e/tests/11-edge-cases.sh
+++ b/api/test/e2e/tests/11-edge-cases.sh
--- a/api/test/e2e/tests/12-cli.sh
+++ b/api/test/e2e/tests/12-cli.sh
--- a/api/test/e2e/tests/13-cleanup.sh
+++ b/api/test/e2e/tests/13-cleanup.sh
--- a/api/test/e2e/tests/14-upgrade.sh
+++ b/api/test/e2e/tests/14-upgrade.sh
--- a/cli/cmd/node.go
+++ b/cli/cmd/node.go
@@ -544,6 +544,110 @@ var nodeDeleteCmd = &cobra.Command{
 	},
 }

+// nodeHealthCmd prints a health report for one node: the overall verdict,
+// one line per service, and any dmesg errors present in the API response.
+var nodeHealthCmd = &cobra.Command{
+	Use:   "health <hostname>",
+	Short: "Check node health",
+	Long: `Check the health of a node by querying Talos service statuses
+and scanning kernel messages for hardware errors.
+
+Examples:
+  wild node health control-1
+  wild node health worker-2`,
+	Args: cobra.ExactArgs(1),
+	RunE: func(cmd *cobra.Command, args []string) error {
+		hostname := args[0]
+
+		inst, err := getInstanceName()
+		if err != nil {
+			return err
+		}
+
+		healthPath := fmt.Sprintf("/api/v1/instances/%s/nodes/%s/health", inst, hostname)
+		resp, err := apiClient.Get(healthPath)
+		if err != nil {
+			return err
+		}
+
+		// Overall verdict. A missing or non-boolean "healthy" field is
+		// reported as unhealthy.
+		if nodeOK, _ := resp.Data["healthy"].(bool); nodeOK {
+			fmt.Printf("Node: %s — HEALTHY\n", hostname)
+		} else {
+			fmt.Printf("Node: %s — UNHEALTHY\n", hostname)
+		}
+
+		// Per-service lines. Entries that are not maps are skipped.
+		services, _ := resp.Data["services"].([]interface{})
+		if len(services) > 0 {
+			fmt.Println("\nServices:")
+		}
+		for _, raw := range services {
+			svc, isMap := raw.(map[string]interface{})
+			if !isMap {
+				continue
+			}
+			id, _ := svc["id"].(string)
+			state, _ := svc["state"].(string)
+			svcHealthy, _ := svc["healthy"].(bool)
+			msg, _ := svc["healthMessage"].(string)
+
+			// OK when healthy; FAIL when unhealthy with a message; "?" when
+			// unhealthy without one.
+			var status string
+			switch {
+			case svcHealthy:
+				status = "OK"
+			case msg != "":
+				status = "FAIL"
+			default:
+				status = "?"
+			}
+
+			if msg == "" {
+				fmt.Printf("  %-14s %-10s %s\n", id, state, status)
+				continue
+			}
+			fmt.Printf("  %-14s %-10s %-6s %s\n", id, state, status, msg)
+		}
+
+		// Dmesg error entries, prefixed with their timestamp when one is set.
+		dmesgEntries, _ := resp.Data["dmesgErrors"].([]interface{})
+		if len(dmesgEntries) > 0 {
+			fmt.Printf("\nDmesg Errors (%d):\n", len(dmesgEntries))
+		}
+		for _, raw := range dmesgEntries {
+			entry, isMap := raw.(map[string]interface{})
+			if !isMap {
+				continue
+			}
+			ts, _ := entry["timestamp"].(string)
+			msg, _ := entry["message"].(string)
+			if ts == "" {
+				fmt.Printf("  %s\n", msg)
+				continue
+			}
+			fmt.Printf("  [%s] %s\n", ts, msg)
+		}
+
+		return nil
+	},
+}
+
+// nodeRebootCmd posts a reboot request for the named node to the API and
+// prints a confirmation once the call returns without error.
+var nodeRebootCmd = &cobra.Command{
+	Use:   "reboot <hostname>",
+	Short: "Reboot a node",
+	Long: `Reboot a node without wiping state. The node will restart and
+rejoin the cluster automatically. Running workloads on this node will be interrupted.
+
+Examples:
+  wild node reboot control-1
+  wild node reboot worker-2`,
+	Args: cobra.ExactArgs(1),
+	RunE: func(cmd *cobra.Command, args []string) error {
+		hostname := args[0]
+
+		inst, err := getInstanceName()
+		if err != nil {
+			return err
+		}
+
+		// The response body is not used; only the error matters here.
+		rebootPath := fmt.Sprintf("/api/v1/instances/%s/nodes/%s/reboot", inst, hostname)
+		if _, err := apiClient.Post(rebootPath, nil); err != nil {
+			return err
+		}
+
+		fmt.Printf("Reboot initiated for %s\n", hostname)
+		return nil
+	},
+}
+
 var nodeUpgradeCmd = &cobra.Command{
 	Use:   "upgrade <hostname> <version>",
 	Short: "Upgrade a node to a new Talos version",
@@ -626,6 +730,8 @@ func init() {
 	nodeCmd.AddCommand(nodeUpdateCmd)
 	nodeCmd.AddCommand(nodeFetchTemplatesCmd)
 	nodeCmd.AddCommand(nodeDeleteCmd)
+	nodeCmd.AddCommand(nodeHealthCmd)
+	nodeCmd.AddCommand(nodeRebootCmd)
 	nodeCmd.AddCommand(nodeUpgradeCmd)
 	nodeCmd.AddCommand(nodeRollbackCmd)

--- a/docs/guides/cluster-networking-health.md
+++ b/docs/guides/cluster-networking-health.md
@@ -4,33 +4,101 @@ Verifying every item on this list confirms the full networking stack is function

 ## Node Layer

-1. **All nodes Ready** — no cordons, no taints (e.g. `maintenance:NoExecute`)
+1. **All nodes Ready** — no cordons, no taints (e.g., `maintenance:NoExecute`)
+   ```bash
+   kubectl get nodes
+   wild node list
+   ```
+
 2. **Flannel pods running on every node** — stale VXLAN tunnels break cross-node pod traffic
+   ```bash
+   kubectl get pods -n kube-system -l app=flannel -o wide
+   ```
+
 3. **Cross-node pod connectivity** — pods on each worker can reach pods on every other node

 ## Service Routing

 4. **kube-proxy pods running on every node** — nftables rules route ClusterIP traffic to pod endpoints
+   ```bash
+   kubectl get pods -n kube-system -l k8s-app=kube-proxy -o wide
+   ```
+
 5. **CoreDNS pods running and resolving** — both cluster-internal names (`*.svc.cluster.local`) and external names
+   ```bash
+   kubectl get pods -n kube-system -l k8s-app=kube-dns
+   ```
+
 6. **CoreDNS upstream reachability** — Talos DNS proxy at `169.254.116.108` responding from all nodes

 ## Load Balancing

 7. **MetalLB speakers running on all nodes** — L2 ARP announcements for LoadBalancer IPs
+   ```bash
+   kubectl get pods -n metallb-system -l component=speaker -o wide
+   ```
+
 8. **MetalLB ServiceL2Status resources valid** — `status.node` matches actual pod placement (stale entries block announcements)
+   ```bash
+   kubectl get servicel2statuses.metallb.io -n metallb-system
+   ```
+
 9. **LoadBalancer IPs reachable** — Traefik LB IP responds from LAN
+   ```bash
+   kubectl get svc -n traefik
+   curl -k https://<traefik-lb-ip>
+   ```

 ## Ingress & Security

 10. **Traefik ingress routing** — forwards to backend services, TLS termination working
+    ```bash
+    kubectl get pods -n traefik
+    kubectl logs -n traefik -l app=traefik | tail -20
+    ```
+
 11. **CrowdSec LAPI running** — can reach `api.crowdsec.net` (depends on CoreDNS external resolution)
+    ```bash
+    kubectl get pods -n crowdsec
+    ```
+
 12. **CrowdSec bouncer registered with LAPI** — unregistered bouncer blocks all forwardAuth requests
+    ```bash
+    wild service logs crowdsec | grep bouncer
+    ```

 ## Storage

 13. **Longhorn managers running on all workers** — enables volume replica scheduling and rebuilds
+    ```bash
+    kubectl get pods -n longhorn-system -l app=longhorn-manager -o wide
+    ```
+
 14. **Longhorn volume replicas healthy** — all volumes at target replica count across nodes
+    ```bash
+    kubectl get volumes.longhorn.io -n longhorn-system
+    ```
+
+## External DNS & Certificates
+
+15. **ExternalDNS pod running** — creating and updating DNS records at Cloudflare
+    ```bash
+    kubectl get pods -n externaldns
+    ```
+
+16. **cert-manager pods running** — issuing and renewing TLS certificates
+    ```bash
+    kubectl get pods -n cert-manager
+    kubectl get certificates -n cert-manager
+    ```

 ## LAN DNS

-15. **dnsmasq on Wild Central** — resolves LAN-local domains to correct LoadBalancer IPs (hairpin NAT)
+17. **dnsmasq on Wild Central** — resolves LAN-local domains to correct LoadBalancer IPs (hairpin NAT)
+    ```bash
+    wild dns status
+    ```
+
+## Quick Full Check
+
+Run `wild cluster health` for an automated check of the most critical items. For a comprehensive check, walk through each item above.
--- a/docs/guides/disaster-recovery.md
+++ b/docs/guides/disaster-recovery.md
@@ -0,0 +1,246 @@
+# Disaster Recovery
+
+This guide covers recovering a Wild Cloud cluster after catastrophic failure — hardware death, corrupted storage, or any scenario where you need to rebuild from scratch.
+
+## What You Need
+
+To rebuild a cluster you need two things:
+
+1. **Cluster config backup** — The tar.gz archive from Wild Cloud's cluster config backup feature, containing kubeconfig, talosconfig, config.yaml, secrets.yaml, and Talos node configs.
+2. **App backups** — The per-app backup archives (database dumps, PVC snapshots, config files) stored at your backup destination (S3, NFS, or local).
+
+If your instance data directory was a git repository (recommended), you also have the full history of compiled manifests and config.yaml in git. The git repo alone is enough to redeploy apps — but without secrets.yaml and kubeconfig, you can't authenticate to the cluster or decrypt app secrets.
+
+## Recovery Scenarios
+
+### Scenario 1: Wild Central Device Failure (Cluster Intact)
+
+The Raspberry Pi or server running Wild Central died, but the Kubernetes cluster nodes are still running.
+
+**Steps:**
+
+1. **Set up a new Wild Central device**:
+   ```bash
+   sudo dpkg -i wild-cloud-central_*.deb
+   sudo systemctl enable wild-cloud-central
+   ```
+
+2. **Restore your data directory** from git (for manifests and config) plus your cluster config backup (for secrets and credentials):
+   ```bash
+   # Clone instance data from git
+   git clone https://your-git-server/wild-cloud-data.git /var/lib/wild-central
+
+   # Extract cluster config backup over the top
+   # This restores kubeconfig, secrets.yaml, talosconfig, etc.
+   tar -xzf cluster-config-backup.tar.gz -C /var/lib/wild-central/instances/your-instance/
+   ```
+
+3. **Start Wild Central**:
+   ```bash
+   sudo systemctl start wild-cloud-central
+   ```
+
+4. **Verify connectivity**:
+   ```bash
+   wild instance use your-instance
+   wild cluster status
+   ```
+
+The cluster is still running — your apps are live. Wild Central is just the management plane.
+
+### Scenario 2: Single Node Failure (Cluster Degraded)
+
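+One or more nodes died, but the cluster is still operational: either only worker nodes were lost (they are replaceable), or the control plane still has etcd quorum (for example, at least 2 of 3 control plane nodes are running).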
+
+**Steps:**
+
+1. **Check cluster health** from Wild Central:
+   ```bash
+   talosctl --talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
+     health --nodes <surviving-node-ip>
+   ```
+
+2. **Remove the dead node** from the cluster:
+   ```bash
+   # Remove from Kubernetes
+   kubectl --kubeconfig /var/lib/wild-central/instances/your-instance/kubeconfig \
+     delete node <dead-node-name>
+
+   # Remove from etcd (if control plane node)
+   talosctl --talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
+     etcd remove-member <dead-node-name> --nodes <surviving-node-ip>
+   ```
+
+3. **PXE boot a replacement node** using Wild Central's PXE service, or manually install Talos Linux on the new hardware.
+
+4. **Add the new node** through the Wild Cloud web UI or CLI:
+   ```bash
+   wild node add --role worker --ip <new-node-ip>
+   ```
+
+5. **Verify workloads reschedule** to the new node:
+   ```bash
+   kubectl get pods --all-namespaces -o wide
+   ```
+
+### Scenario 3: Total Cluster Loss (Rebuild from Scratch)
+
+All nodes are gone. You need to rebuild everything.
+
+**Prerequisites:**
+- New hardware (or repaired existing hardware) with network boot capability or Talos Linux installed
+- Your cluster config backup (tar.gz with kubeconfig, talosconfig, secrets.yaml, Talos configs)
+- Access to your backup destination (S3 bucket, NFS share, etc.)
+- Your instance data git repo (if available — contains compiled manifests)
+
+**Steps:**
+
+1. **Set up Wild Central** on a fresh device:
+   ```bash
+   sudo dpkg -i wild-cloud-central_*.deb
+   ```
+
+2. **Restore your data directory**:
+   ```bash
+   # If you have a git repo:
+   git clone https://your-git-server/wild-cloud-data.git /var/lib/wild-central
+
+   # Extract cluster config over the top:
+   tar -xzf cluster-config-backup.tar.gz -C /var/lib/wild-central/instances/your-instance/
+   ```
+
+   If you don't have a git repo, just extract the cluster config backup into a fresh instance directory. You'll re-add apps from the Wild Directory.
+
+3. **Bootstrap new Talos nodes** using the restored Talos configs:
+   ```bash
+   # Apply control plane config to the first node
+   talosctl apply-config \
+     --talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
+     --nodes <node-ip> \
+     --file /var/lib/wild-central/instances/your-instance/talos/generated/controlplane.yaml \
+     --insecure
+   ```
+
+   The restored `controlplane.yaml` and `worker.yaml` contain your cluster's identity (cluster name, secrets, certificates). Using them ensures the new cluster has the same identity as the old one.
+
+4. **Bootstrap the cluster**:
+   ```bash
+   talosctl bootstrap \
+     --talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
+     --nodes <first-control-plane-ip>
+   ```
+
+5. **Wait for the cluster to be healthy**:
+   ```bash
+   talosctl health \
+     --talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
+     --nodes <first-control-plane-ip>
+   ```
+
+6. **Update kubeconfig** (the new cluster may issue a fresh kubeconfig):
+   ```bash
+   talosctl kubeconfig \
+     --talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
+     --nodes <first-control-plane-ip> \
+     /var/lib/wild-central/instances/your-instance/kubeconfig
+   ```
+
+7. **Deploy infrastructure services first** (order matters):
+   ```bash
+   wild instance use your-instance
+   wild service install metallb
+   wild service install traefik
+   wild service install cert-manager
+   wild service install external-dns
+   wild service install longhorn    # If using Longhorn for PVCs
+   ```
+
+8. **Deploy apps** (dependencies first, then apps):
+   ```bash
+   # Deploy database services first
+   wild app deploy pg
+   wild app deploy redis
+
+   # Then deploy apps
+   wild app deploy gitea
+   wild app deploy immich
+   # ... etc
+   ```
+
+   If your git repo has compiled manifests, these deploys apply the exact same manifests that were running before. If not, you'll need to re-add apps from the Wild Directory first:
+   ```bash
+   wild app add gitea
+   wild app deploy gitea
+   ```
+
+9. **Restore app data from backups**:
+   ```bash
+   # Restore each app's data (database + PVC) from the backup destination
+   # Use the Web UI: navigate to Backups > [app] > Restore
+   # Or via CLI:
+   wild restore gitea --auto
+   wild restore immich --auto
+   ```
+
+   The `--auto` flag runs the full blue-green restore cycle: restore to standby, switch traffic, then clean up the old namespace. For more control, run each phase separately — see [Restoring Backups](restoring-backups.md).
+
+10. **Verify everything is working**:
+    ```bash
+    wild app status gitea
+    wild app status immich
+    kubectl get pods --all-namespaces
+    ```
+
+## Cluster Config Backup
+
+The cluster config backup feature archives the credentials and secrets that are NOT tracked in git, together with `config.yaml`, so that a single archive contains everything needed to regain access to the cluster.
+
+### What Gets Backed Up
+
+| File | Purpose |
+|------|---------|
+| `kubeconfig` | Kubernetes API credentials |
+| `config.yaml` | Full instance configuration |
+| `secrets.yaml` | App secrets (database passwords, API keys) |
+| `talos/generated/talosconfig` | Talos API credentials |
+| `talos/generated/controlplane.yaml` | Control plane node config |
+| `talos/generated/worker.yaml` | Worker node config |
+| `talos/generated/secrets.yaml` | Talos bootstrap secrets (cluster identity) |
+
+### Creating Cluster Config Backups
+
+**Web UI:** Navigate to Backups, click "Backup" on the "Cluster Config" row.
+
+**API:**
+```bash
+# Via API
+curl -X POST http://localhost:5055/api/v1/instances/your-instance/backup/cluster
+```
+
+**Scheduled:** Create a backup schedule with target type "cluster" to automatically back up cluster config on a recurring basis. See [Making Backups](making-backups.md) for scheduling details.
+
+### Downloading a Cluster Config Backup
+
+Cluster config backups are stored at your configured backup destination under the key `cluster-config/{instance}/{timestamp}.tar.gz`. To retrieve one:
+
+- **S3/Azure:** Download from the bucket/container using your cloud provider's CLI
+- **NFS:** Navigate to the NFS mount point and find the archive
+- **Local:** Find it at `{data-dir}/instances/{instance}/backups/cluster-config/...`
+
+Store a copy of the latest cluster config backup in a secure offsite location (encrypted USB drive, password manager, separate cloud storage). If your primary backup destination is on the cluster itself, a total cluster loss takes the backups with it.
+
+## Prevention Checklist
+
+- [ ] **Cluster config backups** are scheduled and running
+- [ ] **App backups** are scheduled for all critical apps
+- [ ] **Backup destination** is offsite or on separate infrastructure from the cluster
+- [ ] **Instance data directory** is pushed to a git remote (excludes secrets.yaml)
+- [ ] **Cluster config backup archive** is stored in a second location (not just on the cluster)
+- [ ] **Test a restore** periodically — backups are worthless if restore doesn't work
+
+## Related Guides
+
+- [Making Backups](making-backups.md) — Setting up backup destinations and schedules
+- [Restoring Backups](restoring-backups.md) — Blue-green restore process in detail
+- [Upgrade Talos](upgrade-talos.md) — Talos node upgrade and rollback
+- [Troubleshoot Cluster](troubleshoot-cluster.md) — Diagnosing cluster issues after recovery
--- a/docs/guides/making-backups.md
+++ b/docs/guides/making-backups.md
@@ -1,265 +1,250 @@
 # Making Backups

-This guide covers how to create backups of your wild-cloud infrastructure using the integrated backup system.
+This guide covers how to create backups of your Wild Cloud applications and cluster configuration.

 ## Overview

-The wild-cloud backup system creates encrypted, deduplicated snapshots using restic. It backs up three main components:
+Wild Cloud's backup system creates backups using native tools for each data type:

- **Applications**: Database dumps and persistent volume data
- **Cluster**: Kubernetes resources and etcd state
- **Configuration**: Wild-cloud repository and settings
+- **PostgreSQL databases**: `pg_dump` in custom compressed format
+- **MySQL databases**: `mysqldump` with gzip compression
+- **Persistent volumes**: Longhorn native backup API
+- **Configuration**: tar.gz archives of manifests, config, and secrets
+
+Backups are stored at a configured destination (S3, Azure Blob, NFS, or local filesystem) and tracked via recovery plans that coordinate the full backup-restore lifecycle.

 ## Prerequisites

 Before making backups, ensure you have:

-1. **Environment configured**: Run `source env.sh` to load backup configuration
-2. **Restic repository**: Backup repository configured in `config.yaml`
-3. **Backup password**: Set in wild-cloud secrets
-4. **Staging directory**: Configured path for temporary backup files
+1. **A backup destination configured** — S3 bucket, Azure container, NFS share, or local path
+2. **Longhorn backup target** configured if backing up persistent volumes
+3. **kubectl access** to your cluster

-## Backup Components
+## Configuring Backup Destination

-### Applications (`wild-app-backup`)
+### Web UI

-Backs up individual applications including:
- **Database dumps**: PostgreSQL/MySQL databases in compressed custom format
- **PVC data**: Application files streamed directly for restic deduplication
- **Auto-discovery**: Finds databases and PVCs based on app manifest.yaml
+Navigate to **Backups** and click **Settings** to configure your backup destination and retention policy.

-### Cluster Resources (`wild-backup --cluster-only`)
+### CLI

-Backs up cluster-wide resources:
- **Kubernetes resources**: All pods, services, deployments, secrets, configmaps
- **Storage definitions**: PersistentVolumes, PVCs, StorageClasses  
- **etcd snapshot**: Complete cluster state for disaster recovery
+Backup configuration is stored in your instance's `config.yaml` under the `backup:` section. Credentials are stored in `secrets.yaml`.

-### Configuration (`wild-backup --home-only`)
+Example configuration:

-Backs up wild-cloud configuration:
- **Repository contents**: All app definitions, manifests, configurations
- **Settings**: Wild-cloud configuration files and customizations
+```yaml
+# config.yaml
+backup:
+  destination:
+    type: "s3"  # "s3", "azure", "nfs", or "local"
+    s3:
+      bucket: "my-backups"
+      region: "us-east-1"
+      endpoint: "minio.example.com"  # Optional, for S3-compatible services
+  retention:
+    daily: 7
+    weekly: 4
+    monthly: 6
+    yearly: 1
+```
+
+```yaml
+# secrets.yaml
+backup:
+  s3:
+    accessKeyId: "..."
+    secretAccessKey: "..."
+```
+
+### Supported Destinations
+
+| Destination | Config Fields | Notes |
+|-------------|--------------|-------|
+| **local** | `path` | Default: `instances/{instance}/backups` |
+| **s3** | `bucket`, `region`, `endpoint`, `accessKeyId`, `secretAccessKey` | Supports S3-compatible services like MinIO |
+| **azure** | `container`, `storageAccount`, `accessKey` | Azure Blob Storage |
+| **nfs** | `server`, `path`, `mountPoint`, `mountOptions` | Auto-recovers stale mounts |

 ## Making Backups

-### Full System Backup (Recommended)
+### Single App Backup

-Create a complete backup of everything:
+**Web UI:** Navigate to **Backups > [app]** and click **Backup Now**.
+
+**CLI:**
+```bash
+# Backup a single app
+wild backup start gitea
+
+# Shorthand
+wild backup gitea
+```
+
+### All Apps Backup
+
+**CLI:**
+```bash
+# Backup all deployed apps
+wild backup all
+```
+
+### Cluster Config Backup
+
+Cluster config backups archive the credentials and secrets not tracked in git — kubeconfig, talosconfig, config.yaml, secrets.yaml, and Talos generated configs.
+
+**Web UI:** Navigate to **Backups** and click **Backup** on the Cluster Config row.
+
+**CLI / API:**
+```bash
+curl -X POST http://localhost:5055/api/v1/instances/{instance}/backup/cluster
+```
+
+## What Gets Backed Up
+
+### Application Backups
+
+The backup system auto-discovers what to back up based on each app's manifest:
+
+| Component | Tool | Format | Storage Key |
+|-----------|------|--------|-------------|
+| PostgreSQL database | `pg_dump` | Custom binary (compression level 9) + globals SQL | `postgres/{instance}/{app}/{timestamp}.dump` |
+| MySQL database | `mysqldump` | Gzip-compressed SQL | `mysql/{instance}/{app}/{timestamp}.sql.gz` |
+| Persistent volumes | Longhorn native API | Longhorn backup format | Stored in Longhorn backup target |
+| App config & manifests | tar + gzip | tar.gz archive | `config/{instance}/{app}/{timestamp}.tar.gz` |
+
+Cache volumes (names containing `-cache` or `-tmp`) and cache databases (Redis, Memcached) are automatically excluded.
+
+### Cluster Config Backups
+
+| File | Purpose |
+|------|---------|
+| `kubeconfig` | Kubernetes API credentials |
+| `config.yaml` | Full instance configuration |
+| `secrets.yaml` | App secrets (database passwords, API keys) |
+| `talos/generated/talosconfig` | Talos API credentials |
+| `talos/generated/controlplane.yaml` | Control plane node config |
+| `talos/generated/worker.yaml` | Worker node config |
+| `talos/generated/secrets.yaml` | Talos bootstrap secrets (cluster identity) |
+
+**Storage key:** `cluster-config/{instance}/{timestamp}.tar.gz`
+
+## Discovering Backup Resources
+
+Before backing up for the first time, you can discover what persistent data an app has:

 ```bash
-# Backup all components (apps + cluster + config)
-wild-backup
+wild backup discover gitea
 ```

-This is equivalent to:
+This analyzes the app's manifest and kustomize resources to find databases and PVCs, showing what will be backed up and what will be skipped.
+
+## Scheduled Backups
+
+### Creating a Schedule
+
+**Web UI:** Navigate to **Backups > [app]** and click **Schedule**.
+
+**CLI:**
 ```bash
-wild-backup --home --apps --cluster
+# Daily backup at 2 AM
+wild backup schedule create gitea --frequency daily --time 02:00
+
+# Weekly backup on Sunday at 3 AM
+wild backup schedule create gitea --frequency weekly --time 03:00 --day-of-week 0
+
+# Monthly backup on the 1st at midnight
+wild backup schedule create gitea --frequency monthly --time 00:00 --day-of-month 1
 ```

-### Selective Backups
-
-#### Applications Only
-```bash
-# All applications
-wild-backup --apps-only
-
-# Single application  
-wild-app-backup discourse
-
-# Multiple applications
-wild-app-backup discourse gitea immich
-```
-
-#### Cluster Only
-```bash
-# Kubernetes resources + etcd
-wild-backup --cluster-only
-```
-
-#### Configuration Only
-```bash  
-# Wild-cloud repository
-wild-backup --home-only
-```
-
-### Excluding Components
-
-Skip specific components:
+### Managing Schedules

 ```bash
-# Skip config, backup apps + cluster
-wild-backup --no-home
+# List all schedules
+wild backup schedule list

-# Skip applications, backup config + cluster  
-wild-backup --no-apps
+# Enable/disable a schedule
+wild backup schedule enable <schedule-id>
+wild backup schedule disable <schedule-id>

-# Skip cluster resources, backup config + apps
-wild-backup --no-cluster
+# Manually trigger a schedule
+wild backup schedule run <schedule-id>
+
+# Delete a schedule
+wild backup schedule delete <schedule-id>
 ```

-## Backup Process Details
+Retention is enforced automatically after each scheduled backup completes.

-### Application Backup Process
-
-1. **Discovery**: Parses `manifest.yaml` to find database and PVC dependencies
-2. **Database backup**: Creates compressed custom-format dumps
-3. **PVC backup**: Streams files directly to staging for restic deduplication  
-4. **Staging**: Organizes files in clean directory structure
-5. **Upload**: Creates individual restic snapshots per application
-
-### Cluster Backup Process
-
-1. **Resource export**: Exports all Kubernetes resources to YAML
-2. **etcd snapshot**: Creates point-in-time etcd backup via talosctl
-3. **Upload**: Creates single restic snapshot for cluster state
-
-### Restic Snapshots
-
-Each backup creates tagged restic snapshots:
+## Listing and Verifying Backups

 ```bash
-# View all snapshots
-restic snapshots
+# List backups for an app
+wild backup list gitea

-# Filter by component
-restic snapshots --tag discourse    # Specific app
-restic snapshots --tag cluster      # Cluster resources
-restic snapshots --tag wc-home      # Wild-cloud config
+# Verify a backup can be restored
+wild backup verify gitea
+
+# Verify a specific backup
+wild backup verify gitea 20250314T021530Z
 ```

-## Where Backup Files Are Staged
-
-Before uploading to your restic repository, backup files are organized in a staging directory. This temporary area lets you see exactly what's being backed up and helps with deduplication.
-
-Here's what the staging area looks like:
-
-```
-backup-staging/
-├── apps/
-│   ├── discourse/
-│   │   ├── database_20250816T120000Z.dump
-│   │   ├── globals_20250816T120000Z.sql  
-│   │   └── discourse/
-│   │       └── data/         # All the actual files
-│   ├── gitea/
-│   │   ├── database_20250816T120000Z.dump
-│   │   └── gitea-data/
-│   │       └── data/         # Git repositories, etc.
-│   └── immich/
-│       ├── database_20250816T120000Z.dump
-│       └── immich-data/
-│           └── upload/       # Photos and videos
-└── cluster/
-    ├── all-resources.yaml    # All running services
-    ├── secrets.yaml          # Passwords and certificates
-    ├── configmaps.yaml       # Configuration data
-    └── etcd-snapshot.db      # Complete cluster state
-```
-
-This staging approach means you can examine backup contents before they're uploaded, and restic can efficiently deduplicate files that haven't changed.
-
-## Advanced Usage
-
-### Custom Backup Scripts
-
-Applications can provide custom backup logic:
+## Deleting Backups

 ```bash
-# Create apps/myapp/backup.sh for custom behavior
-chmod +x apps/myapp/backup.sh
+# Delete a specific backup
+wild backup delete gitea 20250314T021530Z

-# wild-app-backup will use custom script if present
-wild-app-backup myapp
+# Skip confirmation
+wild backup delete gitea 20250314T021530Z --yes
 ```

-### Monitoring Backup Status
+## Backup Health

+Check the overall health of your backup system:
+
+**Web UI:** The **Backups** page shows a health summary across all apps — backup count, last backup time, scheduled status, and total size.
+
+**API:**
 ```bash
-# Check recent snapshots
-restic snapshots | head -20
-
-# Check specific app backups
-restic snapshots --tag discourse
-
-# Verify backup integrity
-restic check
+curl http://localhost:5055/api/v1/instances/{instance}/backup/health
 ```

-### Backup Automation
+## Recovery Plans

-Set up automated backups with cron:
+Each backup creates a recovery plan (`recovery-plan.yaml`) that tracks the backup's contents and coordinates restore operations. The plan records what strategies were used, where data is stored, and the current lifecycle status.

-```bash
-# Daily full backup at 2 AM
-0 2 * * * cd /data/repos/payne-cloud && source env.sh && wild-backup
-
-# Hourly app backups during business hours  
-0 9-17 * * * cd /data/repos/payne-cloud && source env.sh && wild-backup --apps-only
-```
-
-## Performance Considerations
-
-### Large PVCs (like Immich photos)
-
-The streaming backup approach provides:
- **First backup**: Full transfer time (all files processed)
- **Subsequent backups**: Only changed files processed (dramatically faster)
- **Storage efficiency**: Restic deduplication reduces storage usage
-
-### Network Usage
-
- **Database dumps**: Compressed at source, efficient transfer
- **PVC data**: Uncompressed transfer, but restic handles deduplication
- **etcd snapshots**: Small files, minimal impact
+Plan statuses progress through: `backing_up` -> `backed_up` -> (restore phases when used).

 ## Troubleshooting

-### Common Issues
+### "No databases or PVCs found"
+- The app has no database dependencies in its `manifest.yaml`
+- No PVCs with matching labels exist in the app namespace
+- Run `wild backup discover <app>` to see what's detected

-**"No databases or PVCs found"**
- App has no `manifest.yaml` with database dependencies
- No PVCs with matching labels in app namespace
- Create custom `backup.sh` script for special cases
+### Longhorn backup fails
+- Verify Longhorn backup target is configured (`kubectl get settings -n longhorn-system backup-target`)
+- Check Longhorn manager pods are running on all worker nodes
+- Ensure sufficient storage at the backup target

-**"kubectl not found"** 
- Ensure kubectl is installed and configured
- Check cluster connectivity with `kubectl get nodes`
+### Database dump fails
+- Verify the database pod is running: `kubectl get pods -n postgres`
+- Check that the database name in `config.yaml` matches the actual database

-**"Staging directory not set"**
- Configure `cloud.backup.staging` in `config.yaml`
- Ensure directory exists and is writable
-
-**"Could not create etcd backup"**
- Ensure `talosctl` is installed for Talos clusters
- Check control plane node connectivity
- Verify etcd pods are accessible in kube-system namespace
-
-### Backup Verification
-
-Always verify backups periodically:
-
-```bash
-# Check restic repository integrity
-restic check
-
-# List recent snapshots
-restic snapshots --compact
-
-# Test restore to different directory
-restic restore latest --target /tmp/restore-test
-```
+### Scheduled backups not running
+- Verify the schedule is enabled: `wild backup schedule list`
+- Check the Wild Central API is running: `wild daemon status`

 ## Security Notes

- **Encryption**: All backups are encrypted with your backup password
- **Secrets**: Kubernetes secrets are included in cluster backups
- **Access control**: Secure your backup repository and passwords
- **Network**: Consider bandwidth usage for large initial backups
+- **Encryption**: S3 and Azure destinations support server-side encryption. Configure bucket/container encryption policies at your cloud provider.
+- **Secrets**: Database credentials and API keys are included in cluster config backups. Store these backups securely.
+- **Access control**: Restrict access to your backup destination. Cluster config backups contain everything needed to access your cluster.

 ## Next Steps

- [Restoring Backups](restoring-backups.md) - Learn how to restore from backups
- Configure automated backup schedules
- Set up backup monitoring and alerting
- Test disaster recovery procedures
+- [Restoring Backups](restoring-backups.md) — Learn how to restore from backups using blue-green deployment
+- [Disaster Recovery](disaster-recovery.md) — Full cluster rebuild procedures
+- Set up scheduled backups for all critical apps
+- Store cluster config backups in a second location (not on the cluster itself)
--- a/docs/guides/monitoring.md
+++ b/docs/guides/monitoring.md
@@ -1,50 +1,209 @@
 # System Health Monitoring

-## Basic Monitoring
+This guide covers how to monitor the health of your Wild Cloud cluster, nodes, and applications.

-Check system health with:
+## Dashboard Overview
+
+The Wild Cloud web app dashboard provides an at-a-glance view of your cluster:
+
+- Cluster health status with individual health checks
+- Node count and status (control plane and worker)
+- Kubernetes and Talos versions
+- Running operations summary
+- Active app count
+
+Navigate to your instance's **Dashboard** page for this overview.
+
+## Cluster Health
+
+### Web UI
+
+The **Dashboard** page runs automated health checks covering:
+
+- Control plane readiness
+- Worker node readiness
+- etcd health
+- Networking health
+- Storage health
+
+Each check shows pass/fail status with detailed messages.
+
+### CLI
+
+```bash
+# Quick cluster health check
+wild cluster health
+
+# Cluster status overview
+wild cluster status
+
+# Check overall system health
+wild health
+```
+
+### API
+
+```bash
+# Detailed health checks
+curl http://localhost:5055/api/v1/instances/{instance}/cluster/health
+
+# Cluster status
+curl http://localhost:5055/api/v1/instances/{instance}/cluster/status
+```
+
+## Node Monitoring
+
+### Web UI
+
+The **Cluster** page shows all nodes with:
+
+- Status indicators (Ready, NotReady, maintenance)
+- Role (control plane / worker)
+- Hardware info (CPU, memory, storage)
+- Talos version
+- Current and target IP addresses
+
+### CLI
+
+```bash
+# List all nodes with status
+wild node list
+
+# Detailed node info
+wild node show <hostname>
+```
+
+### kubectl

 ```bash
 # Node resource usage
 kubectl top nodes

-# Pod resource usage
-kubectl top pods -A
-
-# Persistent volume claims
-kubectl get pvc -A
+# Node status and conditions
+kubectl get nodes -o wide
+kubectl describe node <node-name>
 ```

-## Advanced Monitoring (Future Implementation)
+## Application Monitoring

-Consider implementing:
+### Web UI

-1. **Prometheus + Grafana** for comprehensive monitoring:
-   ```bash
-   # Placeholder for future implementation
-   helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
-   helm install prometheus prometheus-community/kube-prometheus-stack --namespace monitoring --create-namespace
-   ```
+The **Apps > Installed** page shows all deployed apps with real-time status (running, unhealthy, no-pods, error). Click an app for detailed information including pod status, resource usage, and logs.

-2. **Loki** for log aggregation:
-   ```bash
-   # Placeholder for future implementation
-   helm repo add grafana https://grafana.github.io/helm-charts
-   helm install loki grafana/loki-stack --namespace logging --create-namespace
-   ```
+### CLI

-## Additional Resources
+```bash
+# List deployed apps with status
+wild app list-deployed

-This document will be expanded in the future with:
+# Detailed app status
+wild app status <app>

- Detailed backup and restore procedures
- Monitoring setup instructions
- Comprehensive security hardening guide
- Automated maintenance scripts
+# View app logs
+wild service logs <app> --follow
+```

-For now, refer to the following external resources:
+### kubectl

- [K3s Documentation](https://docs.k3s.io/)
- [Kubernetes Troubleshooting Guide](https://kubernetes.io/docs/tasks/debug/)
- [Velero Backup Documentation](https://velero.io/docs/latest/)
- [Kubernetes Security Best Practices](https://kubernetes.io/docs/concepts/security/)
+```bash
+# Pod resource usage across all namespaces
+kubectl top pods -A
+
+# Pods not in Running/Completed state
+kubectl get pods -A | grep -v "Running\|Completed"
+
+# Events for a specific app
+kubectl get events -n <app-namespace> --sort-by='.lastTimestamp'
+```
+
+## Backup Health
+
+### Web UI
+
+The **Backups** page shows a health summary across all apps:
+
+- Total backup count and size
+- Last backup time for each app
+- Whether scheduled backups are configured
+- Failed backup indicators
+
+### API
+
+```bash
+# Backup health overview
+curl http://localhost:5055/api/v1/instances/{instance}/backup/health
+```
+
+## Operations Monitoring
+
+Long-running operations (deployments, backups, restores, node upgrades) are tracked by the operations system.
+
+### Web UI
+
+The **Operations** page shows all operations with filtering by status (running, completed, failed) and real-time progress updates.
+
+### CLI
+
+```bash
+# List recent operations
+wild operation list
+
+# Check a specific operation
+wild operation get <operation-id>
+```
+
+## Storage Monitoring
+
+### Persistent Volumes
+
+```bash
+# Check PVC status and capacity across all namespaces
+kubectl get pvc -A
+
+# Longhorn volume health
+kubectl get volumes.longhorn.io -n longhorn-system
+```
+
+### Longhorn Dashboard
+
+If Headlamp is installed, you can use it as a Kubernetes dashboard to inspect Longhorn volume resources in detail. Alternatively, access the Longhorn UI directly if its ingress is configured.
+
+## DNS Health
+
+```bash
+# Check dnsmasq status on Wild Central
+wild dns status
+
+# View current DNS configuration
+wild dns config
+
+# Test internal DNS resolution
+kubectl run -i --tty --rm debug --image=busybox --restart=Never -- \
+  nslookup kubernetes.default.svc.cluster.local
+```
+
+## Key Health Indicators
+
+| Component | Healthy Sign | Warning Sign |
+|-----------|-------------|--------------|
+| Nodes | All Ready, no unexpected taints | NotReady, cordoned, or unexpectedly tainted |
+| Pods | Running/Completed | CrashLoopBackOff, Pending, Evicted |
+| PVCs | Bound | Pending, Lost |
+| Longhorn volumes | Healthy, target replica count | Degraded, faulted, rebuilding |
+| Backups | Recent, scheduled | No recent backup, failed |
+| etcd | Healthy cluster members | Member unreachable, high latency |
+| MetalLB | All speakers running | Missing speakers, stale L2 status |
+
+## Setting Up Alerts
+
+Wild Cloud does not currently include a built-in alerting system. For production environments, consider:
+
+1. **Backup scheduling** with verification to catch backup failures early
+2. **Periodic health checks** via `wild cluster health` in a cron job
+3. **External monitoring** pointing at your app URLs for uptime checks
+
+## Next Steps
+
+- [Cluster Networking Health](cluster-networking-health.md) — Detailed networking health checklist
+- [Troubleshoot Cluster](troubleshoot-cluster.md) — When health checks fail
+- [Making Backups](making-backups.md) — Set up backup schedules
--- a/docs/guides/restoring-backups.md
+++ b/docs/guides/restoring-backups.md
@@ -1,294 +1,277 @@
 # Restoring Backups

-This guide will walk you through restoring your applications and cluster from wild-cloud backups. Hopefully you'll never need this, but when you do, it's critical that the process works smoothly.
+This guide covers how to restore applications from Wild Cloud backups. The restore system uses a blue-green deployment model — data is restored to a standby namespace so you can verify it before switching traffic.

-## Understanding Restore Types
+## Understanding the Blue-Green Restore

-Your wild-cloud backup system can restore different types of data depending on what you need to recover:
+Wild Cloud restores follow a three-phase process:

-**Application restores** bring back individual applications by restoring their database contents and file storage. This is what you'll use most often - maybe you accidentally deleted something in Discourse, or Gitea got corrupted, or you want to roll back Immich to before a bad update.
+```
+Restore → Switch → Cleanup
+```

-**Cluster restores** are for disaster recovery scenarios where you need to rebuild your entire Kubernetes cluster from scratch. This includes restoring all the cluster's configuration and even its internal state.
+1. **Restore**: Creates a standby namespace with restored data. Your active app keeps running untouched.
+2. **Switch**: Redirects traffic from the active deployment to the standby. The standby becomes the new active.
+3. **Cleanup**: Removes the previous active deployment and resources.

-**Configuration restores** bring back your wild-cloud repository and settings, which contain all the "recipes" for how your infrastructure should be set up.
+This means restores are safe — if something goes wrong, your active app is still running.

-## Before You Start Restoring
+## Before You Start

-Make sure you have everything needed to perform restores. You need to be in your wild-cloud directory with the environment loaded (`source env.sh`). Your backup repository and password should be configured and working - you can test this by running `restic snapshots` to see your available backups.
+Make sure you have:
+- kubectl access to your cluster
+- Backup destination accessible (same destination where backups were stored)
+- The app deployed (or at least added) to your instance

-Most importantly, make sure you have kubectl access to your cluster, since restores involve creating temporary pods and manipulating storage.
+List available backups first:
+
+```bash
+wild backup list gitea
+```

 ## Restoring Applications

-### Basic Application Restore
+### Full Restore (Automatic)

-The most common restore scenario is bringing back a single application. To restore the latest backup of an app:
+The simplest approach runs all three phases automatically:

 ```bash
-wild-app-restore discourse
+wild restore gitea --auto
 ```

-This restores both the database and all file storage for the discourse app. The restore system automatically figures out what the app needs based on its manifest file and what was backed up.
+This restores the latest backup, switches traffic, and cleans up the old deployment in one operation.

-If you want to restore from a specific backup instead of the latest:
+### Full Restore from Specific Backup

 ```bash
-wild-app-restore discourse abc123
+wild restore gitea 20250314T021530Z --auto
 ```

-Where `abc123` is the snapshot ID from `restic snapshots --tag discourse`.
+### Step-by-Step Restore (Recommended for Critical Apps)

-### Partial Restores
+For production apps, run each phase separately so you can verify between steps:

-Sometimes you only need to restore part of an application. Maybe the database is fine but the files got corrupted, or vice versa.
-
-To restore only the database:
-```bash
-wild-app-restore discourse --db-only
-```
-
-To restore only the file storage:
-```bash
-wild-app-restore discourse --pvc-only
-```
-
-To restore without database roles and permissions (if they're causing conflicts):
-```bash
-wild-app-restore discourse --skip-globals
-```
-
-### Finding Available Backups
-
-To see what backups are available for an app:
-```bash
-wild-app-restore discourse --list
-```
-
-This shows recent snapshots with their IDs, timestamps, and what was included.
-
-## How Application Restores Work
-
-Understanding what happens during a restore can help when things don't go as expected.
-
-### Database Restoration
-
-When restoring a database, the system first downloads the backup files from your restic repository. It then prepares the database by creating any needed roles, disconnecting existing users, and dropping/recreating the database to ensure a clean restore.
-
-For PostgreSQL databases, it uses `pg_restore` with parallel processing to speed up large database imports. For MySQL, it uses standard mysql import commands. The system also handles database ownership and permissions automatically.
-
-### File Storage Restoration
-
-File storage (PVC) restoration is more complex because it involves safely replacing files that might be actively used by running applications.
-
-First, the system creates a safety snapshot using Longhorn. This means if something goes wrong during the restore, you can get back to where you started. Then it scales your application down to zero replicas so no pods are using the storage.
-
-Next, it creates a temporary utility pod with the PVC mounted and copies all the backup files into place, preserving file permissions and structure. Once the data is restored and verified, it removes the utility pod and scales your application back up.
-
-If everything worked correctly, the safety snapshot is automatically deleted. If something went wrong, the safety snapshot is preserved so you can recover manually.
-
-## Cluster Disaster Recovery
-
-Cluster restoration is much less common but critical when you need to rebuild your entire infrastructure.
-
-### Restoring Kubernetes Resources
-
-To restore all cluster resources from a backup:
+**Step 1: Restore to standby**

 ```bash
-# Download cluster backup
-restic restore --tag cluster latest --target ./restore/
-
-# Apply all resources
-kubectl apply -f restore/cluster/all-resources.yaml
+wild restore gitea
 ```

-You can also restore specific types of resources:
-```bash
-kubectl apply -f restore/cluster/secrets.yaml
-kubectl apply -f restore/cluster/configmaps.yaml
-```
+This creates a standby namespace (e.g., `gitea-green`) with the restored database and files. Your active app continues running in its current namespace.

-### Restoring etcd State
+**Step 2: Verify the standby**

-**Warning: This is extremely dangerous and will affect your entire cluster.**
-
-etcd restoration should only be done when rebuilding a cluster from scratch. For Talos clusters:
+Check that the restored app is working:

 ```bash
-talosctl --nodes <control-plane-ip> etcd restore --from ./restore/cluster/etcd-snapshot.db
+# Check pods in the standby namespace
+kubectl get pods -n gitea-green
+
+# Check logs
+kubectl logs -n gitea-green deploy/gitea
+
+# View the recovery plan
+wild restore plan gitea
 ```

-This command stops etcd, replaces its data with the backup, and restarts the cluster. Expect significant downtime while the cluster rebuilds itself.
+**Step 3: Switch traffic**

-## Common Disaster Recovery Scenarios
+```bash
+wild restore switch gitea
+```
+
+This updates the active deployment color in `config.yaml` and redirects traffic to the standby namespace.
+
+**Step 4: Clean up**
+
+```bash
+wild restore cleanup gitea
+```
+
+This removes the previous active namespace and resources.
+
+### Web UI
+
+Navigate to **Backups > [app]**, select a backup, and click **Restore**. The UI tracks recovery plan progress through each phase.
+
+## Partial Restores
+
+Restore only specific components:
+
+```bash
+# Database only
+wild restore gitea --components postgres
+
+# Persistent volumes only
+wild restore gitea --components pvc
+
+# Config/manifests only (skip data)
+wild restore gitea --skip-data
+
+# Multiple specific components
+wild restore gitea --components postgres,pvc
+```
+
+## How Each Component Is Restored
+
+### PostgreSQL Databases
+
+The restore creates a standby database named `{dbName}_{standbyColor}` (e.g., `gitea_green`):
+
+1. Downloads the `.dump` file from the backup destination
+2. Creates the standby database and user
+3. Runs `pg_restore` with the dump file
+4. Deploys the app to the standby namespace with kustomize patches that rewrite database connection strings to point to the standby database
+
+### MySQL Databases
+
+Similar to PostgreSQL — creates a standby database, imports the gzip-compressed SQL dump, and patches connection strings.
+
+### Persistent Volumes (Longhorn)
+
+1. Triggers a Longhorn restore from the native backup, creating new volumes with standby naming
+2. Generates kustomize patches that bind standby PVCs to the restored volumes via `spec.volumeName`
+3. Cache/temp volumes (names containing `-cache` or `-tmp`) are skipped
+
+### Configuration
+
+Extracts the tar.gz archive containing manifests, kustomization, and app-specific config/secrets to the standby app directory.
+
+## Viewing Recovery Plans
+
+Each restore operation creates a recovery plan that tracks progress across all phases:
+
+```bash
+wild restore plan gitea
+```
+
+The plan shows:
+- Current status (restoring, restored, switching, switched, cleaning_up, cleaned_up, or failed)
+- Which strategies ran (postgres, longhorn-native, config)
+- Per-strategy status and details
+- Timestamps for each phase
+
+## Common Restore Scenarios
+
+### Rolling Back After a Bad Update
+
+```bash
+# List available backups
+wild backup list gitea
+
+# Restore from before the problematic update
+wild restore gitea 20250310T020000Z --auto
+```

 ### Complete Application Loss

-When an entire application is gone (namespace deleted, pods corrupted, etc.):
+If an app's namespace was deleted or corrupted:

 ```bash
-# Make sure the namespace exists
-kubectl create namespace discourse --dry-run=client -o yaml | kubectl apply -f -
+# Make sure the app is added to the instance
+wild app add gitea

-# Apply the application manifests if needed
-kubectl apply -f apps/discourse/
+# Deploy the app (creates namespace and base resources)
+wild app deploy gitea

-# Restore the application data
-wild-app-restore discourse
+# Restore data from backup
+wild restore gitea --auto
 ```

-### Complete Cluster Rebuild
+### Database-Only Recovery

-When rebuilding a cluster from scratch:
+If the app is running but the database is corrupted:

-First, build your new cluster infrastructure and install wild-cloud components. Then configure backup access so you can reach your backup repository.
-
-Restore cluster state:
 ```bash
-restic restore --tag cluster latest --target ./restore/
-# Apply etcd snapshot using appropriate method for your cluster type
+# Restore only the database to standby
+wild restore gitea --components postgres
+
+# Verify the restored database
+kubectl exec -n postgres deploy/postgres -- \
+  psql -U postgres -d gitea_green -c "SELECT count(*) FROM repository;"
+
+# Switch to the restored database
+wild restore switch gitea
+
+# Clean up
+wild restore cleanup gitea
 ```

-Finally, restore all applications:
-```bash
-# See what applications are backed up
-wild-app-restore --list
+### Cross-Cluster Migration

-# Restore each application individually
-wild-app-restore discourse
-wild-app-restore gitea
-wild-app-restore immich
+On the source cluster:
+```bash
+wild backup gitea
 ```

-### Rolling Back After Bad Changes
-
-Sometimes you need to undo recent changes to an application:
-
+On the target cluster:
 ```bash
-# See available snapshots
-wild-app-restore discourse --list
-
-# Restore from before the problematic changes
-wild-app-restore discourse abc123
-```
-
-## Cross-Cluster Migration
-
-You can use backups to move applications between clusters:
-
-On the source cluster, create a fresh backup:
-```bash
-wild-app-backup discourse
-```
-
-On the target cluster, deploy the application manifests:
-```bash
-kubectl apply -f apps/discourse/
-```
-
-Then restore the data:
-```bash
-wild-app-restore discourse
-```
-
-## Verifying Successful Restores
-
-After any restore, verify that everything is working correctly.
-
-For databases, check that you can connect and see expected data:
-```bash
-kubectl exec -n postgres deploy/postgres-deployment -- \
-  psql -U postgres -d discourse -c "SELECT count(*) FROM posts;"
-```
-
-For file storage, check that files exist and applications can start:
-```bash
-kubectl get pods -n discourse
-kubectl logs -n discourse deployment/discourse
-```
-
-For web applications, test that you can access them:
-```bash
-curl -f https://discourse.example.com/latest.json
+wild app add gitea
+wild app deploy gitea
+wild restore gitea --auto
 ```

 ## When Things Go Wrong

-### No Snapshots Found
+### Restore Fails Mid-Way

-If the restore system can't find backups for an application, check that snapshots exist:
-```bash
-restic snapshots --tag discourse
-```
+If the restore phase fails, your active app is untouched. The standby namespace may contain partial data. You can:
+- Fix the issue and retry: `wild restore gitea`
+- Check what went wrong: `wild restore plan gitea`
+- Clean up the failed standby manually: `kubectl delete namespace gitea-green`

-Make sure you're using the correct app name and that backups were actually created successfully.
+### Switch Fails

-### Database Restore Failures
+If the switch phase fails, the standby is fully populated and ready. You can:
+- Retry the switch: `wild restore switch gitea`
+- Inspect both namespaces and manually update config if needed

-Database restores can fail if the target database isn't accessible or if there are permission issues. Check that your postgres or mysql pods are running and that you can connect to them manually.
+### App Won't Start After Restore

-Review the restore error messages carefully - they usually indicate whether the problem is with the backup file, database connectivity, or permissions.
-
-### PVC Restore Failures
-
-If PVC restoration fails, check that you have sufficient disk space and that the PVC isn't being used by other pods. The error messages will usually indicate what went wrong.
-
-Most importantly, remember that safety snapshots are preserved when PVC restores fail. You can see them with:
-```bash
-kubectl get snapshot.longhorn.io -n longhorn-system -l app=wild-app-restore
-```
-
-These snapshots let you recover to the pre-restore state if needed.
-
-### Application Won't Start After Restore
-
-If pods fail to start after restoration, check file permissions and ownership. Sometimes the restoration process doesn't perfectly preserve the exact permissions that the application expects.
-
-You can also try scaling the application to zero and back to one, which sometimes resolves transient issues:
-```bash
-kubectl scale deployment/discourse -n discourse --replicas=0
-kubectl scale deployment/discourse -n discourse --replicas=1
-```
-
-## Manual Recovery
-
-When automated restore fails, you can always fall back to manual extraction and restoration:
+Check file permissions and ownership in the restored PVCs. Try scaling to zero and back:

 ```bash
-# Extract backup files to local directory
-restic restore --tag discourse latest --target ./manual-restore/
-
-# Manually copy database dump to postgres pod
-kubectl cp ./manual-restore/discourse/database_*.dump \
-  postgres/postgres-deployment-xxx:/tmp/
-
-# Manually restore database
-kubectl exec -n postgres deploy/postgres-deployment -- \
-  pg_restore -U postgres -d discourse /tmp/database_*.dump
+kubectl scale deployment/gitea -n gitea-green --replicas=0
+kubectl scale deployment/gitea -n gitea-green --replicas=1
 ```

-For file restoration, you'd need to create a utility pod and manually copy files into the PVC.
+### No Backups Found
+
+```bash
+# List all backups for the app
+wild backup list gitea
+
+# Check backup destination is configured
+wild config get backup.destination
+```
+
+## Verifying a Successful Restore
+
+After any restore, verify:
+
+```bash
+# Check pods are running
+kubectl get pods -n gitea
+
+# Check logs for errors
+kubectl logs -n gitea deploy/gitea
+
+# Test database connectivity
+kubectl exec -n postgres deploy/postgres -- \
+  psql -U postgres -d gitea -c "SELECT 1;"
+
+# Test web access
+curl -f https://gitea.example.com/
+```

 ## Best Practices

-Test your restore procedures regularly in a non-production environment. It's much better to discover issues with your backup system during a planned test than during an actual emergency.
+- **Test restores regularly** in a test environment. Backups are worthless if you can't restore from them.
+- **Use step-by-step restore** for production apps so you can verify before switching traffic.
+- **Monitor after restore** — watch the app more closely than usual for a few days.
+- **Communicate with users** before performing restores that involve downtime.

-Always communicate with users before performing restores, especially if they involve downtime. Document any manual steps you had to take so you can improve the automated process.
+## Next Steps

-After any significant restore, monitor your applications more closely than usual for a few days. Sometimes problems don't surface immediately.
-
-## Security and Access Control
-
-Restore operations are powerful and can be destructive. Make sure only trusted administrators can perform restores, and consider requiring approval or coordination before major restoration operations.
-
-Be aware that cluster restores include all secrets, so they potentially expose passwords, API keys, and certificates. Ensure your backup repository is properly secured.
-
-Remember that Longhorn safety snapshots are preserved when things go wrong. These snapshots may contain sensitive data, so clean them up appropriately once you've resolved any issues.
-
-## What's Next
-
-The best way to get comfortable with restore operations is to practice them in a safe environment. Set up a test cluster and practice restoring applications and data.
-
-Consider creating runbooks for your most likely disaster scenarios, including the specific commands and verification steps for your infrastructure.
-
-Read the [Making Backups](making-backups.md) guide to ensure you're creating the backups you'll need for successful recovery.
+- [Making Backups](making-backups.md) — Set up backup schedules and destinations
+- [Disaster Recovery](disaster-recovery.md) — Full cluster rebuild from backups
--- a/docs/guides/troubleshoot-cluster.md
+++ b/docs/guides/troubleshoot-cluster.md
@@ -1,19 +1,136 @@
-# Troubleshoot Wild Cloud Cluster issues
+# Troubleshoot Wild Cloud Cluster Issues
+
+## Quick Health Check
+
+```bash
+# Wild Cloud cluster health (runs multiple checks)
+wild cluster health
+
+# Cluster status overview
+wild cluster status
+```
+
+The web app **Dashboard** also shows health check results with pass/fail details.

 ## General Troubleshooting Steps

-1. **Check Node Status**:
-   ```bash
-   kubectl get nodes
-   kubectl describe node <node-name>
-   ```
+### 1. Check Node Status

-1. **Check Component Status**:
-   ```bash
-   # Check all pods across all namespaces
-   kubectl get pods -A
-   
-   # Look for pods that aren't Running or Ready
-   kubectl get pods -A | grep -v "Running\|Completed"
-   ```
+```bash
+# Kubernetes node status
+kubectl get nodes -o wide

+# Detailed node info (look for conditions, taints, capacity)
+kubectl describe node <node-name>
+
+# Talos node health (from Wild Central)
+talosctl --talosconfig <talosconfig-path> health --nodes <node-ip>
+```
+
+### 2. Check Pod Status
+
+```bash
+# All pods across all namespaces
+kubectl get pods -A
+
+# Pods that aren't Running or Completed
+kubectl get pods -A | grep -v "Running\|Completed"
+
+# Recent events (often reveals scheduling or resource issues)
+kubectl get events -A --sort-by='.lastTimestamp' | head -30
+```
+
+### 3. Check Control Plane Components
+
+On Talos clusters, control plane components run as static pods:
+
+```bash
+# Check control plane pods
+kubectl get pods -n kube-system
+
+# Check etcd health
+talosctl --talosconfig <talosconfig-path> etcd status --nodes <control-plane-ip>
+
+# Check Talos services
+talosctl --talosconfig <talosconfig-path> services --nodes <node-ip>
+```
+
+### 4. Check Resource Pressure
+
+```bash
+# Node resource usage
+kubectl top nodes
+
+# Pod resource usage
+kubectl top pods -A --sort-by=memory
+```
+
+## Common Issues
+
+### Node Not Ready
+
+```bash
+# Check node conditions
+kubectl describe node <node-name> | grep -A5 "Conditions:"
+
+# Check Talos logs for the node
+talosctl --talosconfig <talosconfig-path> logs kubelet --nodes <node-ip> | tail -50
+```
+
+Common causes: network connectivity loss, disk pressure, memory pressure, kubelet crash.
+
+### Pods Stuck in Pending
+
+```bash
+# Check why the pod can't be scheduled
+kubectl describe pod <pod-name> -n <namespace>
+```
+
+Common causes: insufficient resources, node affinity/taint mismatch, PVC not bound.
+
+### Pods in CrashLoopBackOff
+
+```bash
+# Check container logs
+kubectl logs <pod-name> -n <namespace> --previous
+
+# Check events for the pod
+kubectl describe pod <pod-name> -n <namespace>
+```
+
+Common causes: missing config/secrets, database not reachable, permission errors.
+
+### etcd Issues
+
+```bash
+# Check etcd members
+talosctl --talosconfig <talosconfig-path> etcd members --nodes <control-plane-ip>
+
+# Check etcd health
+talosctl --talosconfig <talosconfig-path> etcd status --nodes <control-plane-ip>
+
+# If etcd has a stale member (node was replaced)
+talosctl --talosconfig <talosconfig-path> etcd remove-member <stale-node-name> --nodes <healthy-node-ip>
+```
+
+### Lost Connectivity to Cluster
+
+If `kubectl` and `talosctl` can't reach the cluster:
+
+```bash
+# Check if the VIP is responding
+ping <control-plane-vip>
+
+# Try reaching individual node IPs directly
+talosctl --talosconfig <talosconfig-path> version --nodes <node-ip>
+
+# Regenerate kubeconfig if needed
+wild cluster kubeconfig --generate
+```
+
+## Related Guides
+
+- [Cluster Networking Health](cluster-networking-health.md) — Full networking stack checklist
+- [Troubleshoot DNS](troubleshoot-dns.md) — DNS resolution issues
+- [Troubleshoot Service Connectivity](troubleshoot-service-connectivity.md) — Inter-service communication
+- [Disaster Recovery](disaster-recovery.md) — Rebuilding from scratch
--- a/docs/guides/troubleshoot-dns.md
+++ b/docs/guides/troubleshoot-dns.md
@@ -1,20 +1,98 @@
 # Troubleshoot DNS

-If DNS resolution isn't working properly:
+Wild Cloud uses two DNS layers: **CoreDNS** inside the cluster for service discovery, and **dnsmasq** on Wild Central for LAN-local domain resolution.

-1. Check CoreDNS status:
-   ```bash
-   kubectl get pods -n kube-system -l k8s-app=kube-dns
-   kubectl logs -l k8s-app=kube-dns -n kube-system
-   ```
+## Cluster DNS (CoreDNS)

-2. Verify CoreDNS configuration:
-   ```bash
-   kubectl get configmap -n kube-system coredns -o yaml
-   ```
+If pods can't resolve service names or external domains:

-3. Test DNS resolution from inside the cluster:
-   ```bash
-   kubectl run -i --tty --rm debug --image=busybox --restart=Never -- nslookup kubernetes.default
-   ```
+### 1. Check CoreDNS Status

+```bash
+kubectl get pods -n kube-system -l k8s-app=kube-dns
+kubectl logs -l k8s-app=kube-dns -n kube-system
+```
+
+### 2. Verify CoreDNS Configuration
+
+```bash
+kubectl get configmap -n kube-system coredns -o yaml
+```
+
+### 3. Test DNS Resolution from Inside the Cluster
+
+```bash
+# Test cluster-internal DNS
+kubectl run -i --tty --rm debug --image=busybox --restart=Never -- \
+  nslookup kubernetes.default.svc.cluster.local
+
+# Test external DNS resolution
+kubectl run -i --tty --rm debug --image=busybox --restart=Never -- \
+  nslookup google.com
+```
+
+### 4. Check Upstream DNS Reachability
+
+Talos uses a DNS proxy at `169.254.116.108`. If CoreDNS can't resolve external names:
+
+```bash
+# List the upstream DNS resolvers configured on the node
+talosctl --talosconfig <talosconfig-path> get resolvers --nodes <node-ip>
+```
+
+## LAN DNS (dnsmasq on Wild Central)
+
+If devices on your LAN can't resolve Wild Cloud domains (e.g., `gitea.cloud.example.com`):
+
+### 1. Check dnsmasq Status
+
+```bash
+wild dns status
+```
+
+### 2. View Current Configuration
+
+```bash
+wild dns config
+```
+
+### 3. Test LAN DNS Resolution
+
+From a device on the LAN:
+```bash
+# Query Wild Central directly
+nslookup gitea.cloud.example.com <wild-central-ip>
+
+# Compare with public DNS
+nslookup gitea.cloud.example.com 8.8.8.8
+```
+
+### 4. Restart dnsmasq
+
+```bash
+wild dns restart
+```
+
+### 5. Regenerate Configuration
+
+If DNS entries are missing or stale:
+```bash
+# Preview changes
+wild dns update --dry-run
+
+# Apply
+wild dns update
+```
+
+## Common Issues
+
+**LAN devices can't resolve Wild Cloud domains**: Ensure your router is configured to use Wild Central's IP as its DNS server.
+
+**Pods can resolve cluster services but not external domains**: Check the CoreDNS upstream forwarder configuration and the Talos DNS proxy health.
+
+**DNS works but only after a long delay**: Check for timeouts in the CoreDNS forwarder chain, and verify that the external resolver configured in `cluster.internalDns.externalResolver` is reachable.
+
+## Related Guides
+
+- [Troubleshoot Service Visibility](troubleshoot-visibility.md) — Full external access troubleshooting
+- [Cluster Networking Health](cluster-networking-health.md) — DNS checks are items #5–6 on the checklist
--- a/docs/guides/troubleshoot-service-connectivity.md
+++ b/docs/guides/troubleshoot-service-connectivity.md
@@ -1,18 +1,67 @@
 # Troubleshoot Service Connectivity

-If services can't communicate:
+If services within the cluster can't communicate with each other:

-1. Check network policies:
-   ```bash
-   kubectl get networkpolicies -A
-   ```
+## 1. Check Network Policies

-2. Verify service endpoints:
-   ```bash
-   kubectl get endpoints -n <namespace>
-   ```
+```bash
+kubectl get networkpolicies -A
+```

-3. Test connectivity from within the cluster:
-   ```bash
-   kubectl run -i --tty --rm debug --image=busybox --restart=Never -- wget -O- <service-name>.<namespace>
-   ```
+Wild Cloud doesn't create restrictive network policies by default, but CrowdSec or custom policies may be blocking traffic.
+
+## 2. Verify Service Endpoints
+
+```bash
+# Check that the service has endpoints
+kubectl get endpoints -n <namespace>
+
+# A service with no endpoints means no pods match its selector
+kubectl describe svc <service-name> -n <namespace>
+```
+
+## 3. Test Connectivity from Within the Cluster
+
+```bash
+# Start a debug pod
+kubectl run -i --tty --rm debug --image=busybox --restart=Never -- sh
+
+# Inside the pod:
+# Test DNS resolution
+nslookup <service-name>.<namespace>.svc.cluster.local
+
+# Test HTTP connectivity
+wget -O- http://<service-name>.<namespace>.svc.cluster.local:<port>
+```
+
+## 4. Check Cross-Node Connectivity
+
+If services on different nodes can't communicate:
+
+```bash
+# Verify Flannel (CNI) pods are running on every node
+kubectl get pods -n kube-system -l app=flannel -o wide
+
+# Check for stale VXLAN tunnels
+talosctl --talosconfig <talosconfig-path> get links --nodes <node-ip> | grep flannel
+```
+
+## 5. Check kube-proxy
+
+```bash
+# Verify kube-proxy is running on all nodes
+kubectl get pods -n kube-system -l k8s-app=kube-proxy -o wide
+```
+
+## Common Issues
+
+**App can't reach its database**: Check that the database pod is running, the service name matches what the app expects, and the database namespace is correct.
+
+**Intermittent connectivity failures**: Often caused by a Flannel pod crash or stale routing. Restart the Flannel pod on the affected node.
+
+**CrowdSec blocking legitimate traffic**: Check CrowdSec decisions and bouncer status. See the CrowdSec service logs: `wild service logs crowdsec`.
+
+## Related Guides
+
+- [Cluster Networking Health](cluster-networking-health.md) — Full networking stack checklist
+- [Troubleshoot DNS](troubleshoot-dns.md) — If the issue is DNS-related
--- a/docs/guides/troubleshoot-tls-certificates.md
+++ b/docs/guides/troubleshoot-tls-certificates.md
@@ -1,24 +1,96 @@
 # Troubleshoot TLS Certificates

-If services show invalid certificates:
+Wild Cloud uses cert-manager with Let's Encrypt for TLS certificates. Two shared wildcard certificates are issued and copied to app namespaces during deployment:

-1. Check certificate status:
-   ```bash
-   kubectl get certificates -A
-   ```
+- `wildcard-wild-cloud-tls` — public domain (e.g., `*.cloud.example.com`)
+- `wildcard-internal-wild-cloud-tls` — internal domain (e.g., `*.internal.cloud.example.com`)

-2. Examine certificate details:
-   ```bash
-   kubectl describe certificate <cert-name> -n <namespace>
-   ```
+## 1. Check Certificate Status

-3. Check for cert-manager issues:
-   ```bash
-   kubectl get pods -n cert-manager
-   kubectl logs -l app=cert-manager -n cert-manager
-   ```
+```bash
+kubectl get certificates -A
+```

-4. Verify the Cloudflare API token is correctly set up:
-   ```bash
-   kubectl get secret cloudflare-api-token -n internal
-   ```
+Look for `True` in the `READY` column. If it shows `False`, describe the certificate for details:
+
+```bash
+kubectl describe certificate <cert-name> -n cert-manager
+```
+
+## 2. Check cert-manager Pods
+
+```bash
+kubectl get pods -n cert-manager
+kubectl logs -l app=cert-manager -n cert-manager | tail -50
+```
+
+## 3. Check Certificate Orders and Challenges
+
+```bash
+# Check pending orders
+kubectl get orders -A
+
+# Check active challenges
+kubectl get challenges -A
+
+# Describe a failing challenge for details
+kubectl describe challenge <challenge-name> -n cert-manager
+```
+
+## 4. Verify the Cloudflare API Token
+
+cert-manager uses DNS-01 validation via Cloudflare. Verify the token is present:
+
+```bash
+kubectl get secret cloudflare-api-token -n cert-manager
+```
+
+If the token is missing or invalid, check your secrets and redeploy cert-manager:
+
+```bash
+wild service install cert-manager
+```
+
+## 5. Check the ClusterIssuer
+
+```bash
+kubectl get clusterissuers
+kubectl describe clusterissuer letsencrypt-prod
+```
+
+Look for `Status: True` on the Ready condition. If using staging for testing:
+
+```bash
+kubectl describe clusterissuer letsencrypt-staging
+```
+
+## 6. Force Certificate Renewal
+
+If a certificate is stuck, delete it and let cert-manager re-issue:
+
+```bash
+kubectl delete certificate <cert-name> -n cert-manager
+```
+
+cert-manager will automatically create a new certificate request.
+
+## 7. Repair Certificates Script
+
+cert-manager includes a repair script for bulk certificate issues:
+
+```bash
+# If cert-manager was installed via Wild Cloud
+kubectl exec -n cert-manager deploy/cert-manager -- /scripts/repair-certificates.sh
+```
+
+## Common Issues
+
+**Challenge fails with NXDOMAIN**: ExternalDNS hasn't created the DNS record yet, or Cloudflare zone ID is wrong. Check `cluster.certManager.cloudflare.zoneID` in config.
+
+**Rate limited by Let's Encrypt**: Production rate limits are 50 certificates per domain per week. Switch to `letsencrypt-staging` for testing.
+
+**Certificate exists but app shows invalid cert**: The wildcard secret may not have been copied to the app namespace. Redeploy the app: `wild app deploy <app>`.
+
+## Related Guides
+
+- [Troubleshoot Service Visibility](troubleshoot-visibility.md) — TLS is one layer of the visibility stack
--- a/docs/guides/troubleshoot-visibility.md
+++ b/docs/guides/troubleshoot-visibility.md
@@ -6,10 +6,10 @@ This guide covers common issues with accessing services from outside the cluster

 External access to your services might fail for several reasons:

-1. **DNS Resolution Issues** - Domain names not resolving to the correct IP address
-2. **Network Connectivity Issues** - Traffic can't reach the cluster's external IP
-3. **TLS Certificate Issues** - Invalid or missing certificates
-4. **Ingress/Service Configuration Issues** - Incorrectly configured routing
+1. **DNS Resolution Issues** — Domain names not resolving to the correct IP address
+2. **Network Connectivity Issues** — Traffic can't reach the cluster's external IP
+3. **TLS Certificate Issues** — Invalid or missing certificates
+4. **Ingress/Service Configuration Issues** — Incorrectly configured routing

 ## Diagnostic Steps

@@ -18,8 +18,8 @@ External access to your services might fail for several reasons:
 **Symptoms:**

 - Browser shows "site cannot be reached" or "server IP address could not be found"
- `ping` or `nslookup` commands fail for your domain
- Your service DNS records don't appear in CloudFlare or your DNS provider
+- `nslookup` fails for your domain
+- DNS records don't appear in Cloudflare

 **Checks:**

@@ -31,35 +31,34 @@ nslookup yourservice.yourdomain.com
 kubectl get pods -n externaldns

 # Check ExternalDNS logs for errors
-kubectl logs -n externaldns -l app=external-dns  < /dev/null |  grep -i error
+kubectl logs -n externaldns -l app=external-dns | grep -i error
 kubectl logs -n externaldns -l app=external-dns | grep -i "your-service-name"

-# Check if CloudFlare API token is configured correctly
+# Verify the Cloudflare API token secret exists
 kubectl get secret cloudflare-api-token -n externaldns
 ```

 **Common Issues:**

-a) **ExternalDNS Not Running**: The ExternalDNS pod is not running or has errors.
+a) **ExternalDNS Not Running**: The pod is not running or has errors.

-b) **Cloudflare API Token Issues**: The API token is invalid, expired, or doesn't have the right permissions.
+b) **Cloudflare API Token Issues**: Token is invalid, expired, or lacks permissions.

-c) **Domain Filter Mismatch**: ExternalDNS is configured with a `--domain-filter` that doesn't match your domain.
+c) **Domain Filter Mismatch**: ExternalDNS `--domain-filter` doesn't match your domain.

-d) **Annotations Missing**: Service or Ingress is missing the required ExternalDNS annotations.
+d) **Annotations Missing**: Ingress is missing the required ExternalDNS annotations.

 **Solutions:**

 ```bash
-# 1. Recreate CloudFlare API token secret
+# 1. Recreate Cloudflare API token secret
 kubectl create secret generic cloudflare-api-token \
  --namespace externaldns \
  --from-literal=api-token="your-api-token" \
  --dry-run=client -o yaml | kubectl apply -f -

-# 2. Check and set proper annotations on your Ingress:
-kubectl annotate ingress your-ingress -n your-namespace \
-  external-dns.alpha.kubernetes.io/hostname=your-service.your-domain.com
+# 2. Verify ingress annotations
+kubectl get ingress -n <app-namespace> -o yaml | grep external-dns

 # 3. Restart ExternalDNS
 kubectl rollout restart deployment -n externaldns external-dns
@@ -82,26 +81,31 @@ kubectl get pods -n metallb-system
 # Check MetalLB IP address pool
 kubectl get ipaddresspools.metallb.io -n metallb-system

-# Verify the service has an external IP
-kubectl get svc -n your-namespace your-service
+# Verify the Traefik service has an external IP
+kubectl get svc -n traefik
 ```

 **Common Issues:**

-a) **MetalLB Configuration**: The IP pool doesn't match your network or is exhausted.
+a) **MetalLB Configuration**: IP pool doesn't match your network or is exhausted.

-b) **Firewall Issues**: Firewall is blocking traffic to your cluster's external IP.
+b) **MetalLB L2 Announcements**: Stale ServiceL2Status entries blocking ARP announcements.

-c) **Router Configuration**: NAT or port forwarding issues if using a router.
+c) **Firewall Issues**: Firewall blocking traffic to the cluster's load balancer IP.
+
+d) **Router Configuration**: NAT or port forwarding issues.

 **Solutions:**

 ```bash
-# 1. Check and update MetalLB configuration
-kubectl apply -f infrastructure_setup/metallb/metallb-pool.yaml
+# Check MetalLB L2 advertisement status
+kubectl get servicel2statuses.metallb.io -n metallb-system

-# 2. Check service external IP assignment
-kubectl describe svc -n your-namespace your-service
+# Verify MetalLB speaker pods are running on all nodes
+kubectl get pods -n metallb-system -l component=speaker -o wide
+
+# Reinstall MetalLB if configuration is wrong
+wild service install metallb
 ```

 ### 3. Check TLS Certificates
@@ -110,40 +114,42 @@ kubectl describe svc -n your-namespace your-service

 - Browser shows certificate errors
 - "Your connection is not private" warnings
- Cert-manager logs show errors
+- cert-manager logs show errors

 **Checks:**

 ```bash
 # Check certificate status
-kubectl get certificates -A
+kubectl get certificates -n cert-manager

 # Check cert-manager logs
-kubectl logs -n cert-manager -l app=cert-manager
+kubectl logs -n cert-manager -l app=cert-manager | tail -30

-# Check if your ingress is using the correct certificate
-kubectl get ingress -n your-namespace your-ingress -o yaml
+# Check if the ingress is using the correct TLS secret
+kubectl get ingress -n <app-namespace> -o yaml | grep secretName
 ```

 **Common Issues:**

-a) **Certificate Issuance Failures**: DNS validation or HTTP validation failing.
+a) **Certificate Issuance Failures**: DNS-01 validation failing (Cloudflare token or zone ID wrong).

-b) **Wrong Secret Referenced**: Ingress is referencing a non-existent certificate secret.
+b) **Wrong Secret Referenced**: Ingress referencing a non-existent secret.

-c) **Expired Certificate**: Certificate has expired and wasn't renewed.
+c) **Secret Not Copied**: Wildcard TLS secret not copied to the app namespace during deploy.

 **Solutions:**

 ```bash
-# 1. Check and recreate certificates
-kubectl apply -f infrastructure_setup/cert-manager/wildcard-certificate.yaml
+# Force re-issue certificates
+kubectl delete certificate wildcard-wild-cloud-tls -n cert-manager
+# cert-manager will automatically re-create it

-# 2. Update ingress to use correct secret
-kubectl patch ingress your-ingress -n your-namespace --type=json \
-  -p='[{"op": "replace", "path": "/spec/tls/0/secretName", "value": "correct-secret-name"}]'
+# Redeploy the app to copy TLS secrets to its namespace
+wild app deploy <app>
 ```

+See [Troubleshoot TLS Certificates](troubleshoot-tls-certificates.md) for detailed cert debugging.
+
 ### 4. Check Ingress Configuration

 **Symptoms:**
@@ -156,18 +162,18 @@ kubectl patch ingress your-ingress -n your-namespace --type=json \

 ```bash
 # Check ingress status
-kubectl get ingress -n your-namespace
+kubectl get ingress -n <app-namespace>

 # Check Traefik logs
-kubectl logs -n kube-system -l app.kubernetes.io/name=traefik
+kubectl logs -n traefik -l app=traefik | tail -30

-# Check ingress configuration
-kubectl describe ingress -n your-namespace your-ingress
+# Check ingress details
+kubectl describe ingress -n <app-namespace> <ingress-name>
 ```

 **Common Issues:**

-a) **Incorrect Service Targeting**: Ingress is pointing to wrong service or port.
+a) **Incorrect Service Targeting**: Ingress pointing to wrong service or port.

 b) **Traefik Configuration**: IngressClass or middleware issues.

@@ -176,71 +182,86 @@ c) **Path Configuration**: Incorrect path prefixes or regex.
 **Solutions:**

 ```bash
-# 1. Verify ingress configuration
-kubectl edit ingress -n your-namespace your-ingress
+# Verify the referenced service exists and has endpoints
+kubectl get svc -n <app-namespace>
+kubectl get endpoints -n <app-namespace>

-# 2. Check that the referenced service exists
-kubectl get svc -n your-namespace
-
-# 3. Restart Traefik if needed
-kubectl rollout restart deployment -n kube-system traefik
+# Restart Traefik
+kubectl rollout restart deployment -n traefik traefik
 ```

 ## Advanced Diagnostics

-For more complex issues, you can use port-forwarding to test services directly:
+### Port-Forward to Test Directly
+
+Bypass ingress and test the service directly:

 ```bash
-# Port-forward the service directly
-kubectl port-forward -n your-namespace svc/your-service 8080:80
+# Port-forward the service
+kubectl port-forward -n <app-namespace> svc/<service-name> 8080:80

-# Then test locally
+# Test locally
 curl http://localhost:8080
 ```

-You can also deploy a debug pod to test connectivity from inside the cluster:
+### Debug Pod for In-Cluster Testing

 ```bash
 # Start a debug pod
 kubectl run -i --tty --rm debug --image=busybox --restart=Never -- sh

 # Inside the pod, test DNS and connectivity
-nslookup your-service.your-namespace.svc.cluster.local
-wget -O- http://your-service.your-namespace.svc.cluster.local
+nslookup <service-name>.<namespace>.svc.cluster.local
+wget -O- http://<service-name>.<namespace>.svc.cluster.local
 ```

+### LAN DNS (dnsmasq)
+
+If services are reachable from outside the LAN but not from within:
+
+```bash
+# Check dnsmasq status on Wild Central
+wild dns status
+
+# Verify dnsmasq resolves your domain correctly
+nslookup yourservice.yourdomain.com <wild-central-ip>
+
+# Regenerate dnsmasq config if entries are stale
+wild dns update
+```
+
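+This is typically a hairpin NAT issue: many routers won't route LAN traffic addressed to their own public IP back into the LAN. dnsmasq on Wild Central avoids this by resolving Wild Cloud domains directly to the cluster's load balancer IP, so internal devices don't need to go through the router.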
+
 ## ExternalDNS Specifics

-ExternalDNS can be particularly troublesome. Here are specific debugging steps:
-
-1. **Check Log Level**: Set `--log-level=debug` for more detailed logs
+1. **Check Log Level**: Set `--log-level=debug` for detailed logs
 2. **Check Domain Filter**: Ensure `--domain-filter` includes your domain
-3. **Check Provider**: Ensure `--provider=cloudflare` (or your DNS provider)
-4. **Verify API Permissions**: CloudFlare token needs Zone.Zone and Zone.DNS permissions
+3. **Check Provider**: Ensure `--provider=cloudflare`
+4. **Verify API Permissions**: Cloudflare token needs Zone:Zone:Read and Zone:DNS:Edit permissions
 5. **Check TXT Records**: ExternalDNS uses TXT records for ownership tracking

 ```bash
-# Restart with verbose logging
-kubectl set env deployment/external-dns -n externaldns -- --log-level=debug
-
 # Check for specific domain errors
 kubectl logs -n externaldns -l app=external-dns | grep -i yourservice.yourdomain.com
 ```

-## CloudFlare Specific Issues
+## Cloudflare Specific Issues

-When using CloudFlare, additional issues may arise:
+1. **API Rate Limiting**: Cloudflare may rate limit frequent API calls
+2. **DNS Propagation**: Changes may take time to propagate through Cloudflare's network
+3. **Proxied Records**: The `external-dns.alpha.kubernetes.io/cloudflare-proxied` annotation controls whether Cloudflare proxies traffic
+4. **API Token Permissions**: Token must have Zone:Zone:Read and Zone:DNS:Edit permissions
+5. **Zone Detection**: If using subdomains, ensure the parent domain is in the domain filter

-1. **API Rate Limiting**: CloudFlare may rate limit frequent API calls
-2. **DNS Propagation**: Changes may take time to propagate through CloudFlare's CDN
-3. **Proxied Records**: The `external-dns.alpha.kubernetes.io/cloudflare-proxied` annotation controls whether CloudFlare proxies traffic
-4. **Access Restrictions**: CloudFlare Access or Page Rules may restrict access
-5. **API Token Permissions**: The token must have Zone:Zone:Read and Zone:DNS:Edit permissions
-6. **Zone Detection**: If using subdomains, ensure the parent domain is included in the domain filter
-
-Check CloudFlare dashboard for:
+Check the Cloudflare dashboard for:

 - DNS record existence
 - API access logs
 - DNS settings including proxy status
- Any error messages or rate limit warnings
+- Rate limit warnings
+
+## Related Guides
+
+- [Troubleshoot DNS](troubleshoot-dns.md) — Cluster and LAN DNS issues
+- [Troubleshoot TLS Certificates](troubleshoot-tls-certificates.md) — Certificate-specific debugging
+- [Cluster Networking Health](cluster-networking-health.md) — Full networking stack checklist
--- a/docs/guides/upgrade-applications.md
+++ b/docs/guides/upgrade-applications.md
@@ -1,3 +1,173 @@
 # Upgrade Applications

-TBD
+This guide covers upgrading Wild Cloud applications to newer versions from the Wild Directory.
+
+## Check for Available Updates
+
+### Web UI
+
+The **Apps > Installed** page shows update indicators when a newer version is available in the Wild Directory. Click an app to see the current and available versions.
+
+### CLI
+
+```bash
+# Show the upgrade plan for a specific app
+wild app upgrade-plan gitea
+```
+
+The upgrade plan shows:
+- Current installed version
+- Target version in the Wild Directory
+- Whether the upgrade is direct or requires waypoints
+- Whether a backup is recommended or required
+
+## Simple Upgrade (No Breaking Changes)
+
+Most app updates are straightforward — a new container image tag with compatible config:
+
+```bash
+# Update the app from the Wild Directory
+wild app update gitea
+```
+
+This will:
+1. Fetch the latest version from the Wild Directory
+2. Merge any new defaultConfig fields with your existing config
+3. Recompile templates
+4. Deploy the updated manifests
+
+### Step-by-Step (Review Before Deploy)
+
+For more control:
+
+```bash
+# Check what will change
+wild app upgrade-plan gitea
+
+# Fetch updated files without deploying
+wild app update gitea --no-deploy
+
+# Review the changes in your instance data directory
+# (e.g., diff the compiled manifests)
+
+# Deploy when ready
+wild app deploy gitea
+```
+
+## Waypoint Upgrades (Breaking Changes)
+
+Some apps require stepping through intermediate versions due to database schema changes or incompatible config formats. The upgrade system handles this automatically.
+
+### How Waypoints Work
+
+If an app defines upgrade routing rules (in `app.yaml`), the system computes a multi-step upgrade path:
+
+```
+Current: 1.5.0 → Waypoint: 2.0.0 (slot "2") → Target: 3.0.0 (slot "3")
+```
+
+Each step may include:
+- **Pre-deploy migrations**: Database schema changes needed before the new version starts
+- **Post-deploy migrations**: Data backfills or cleanup after the new version is running
+- **Config migrations**: Automatic renaming of config keys
+
+### Running a Waypoint Upgrade
+
+```bash
+# View the full upgrade plan
+wild app upgrade-plan discourse
+
+# Run the upgrade (handles all steps automatically)
+wild app update discourse
+```
+
+The system processes each waypoint in order, running migrations at each step.
+
+## Backup Before Upgrading
+
+The upgrade plan will indicate when backups are recommended or required:
+
+- **Required**: The upgrade won't proceed without a backup. Create one first:
+  ```bash
+  wild backup gitea
+  ```
+
+- **Recommended**: The upgrade will proceed but warns you. Create a backup for safety:
+  ```bash
+  wild backup gitea
+  ```
+
+## Rolling Back After a Bad Upgrade
+
+If an upgrade causes problems, restore from your pre-upgrade backup:
+
+```bash
+# List available backups
+wild backup list gitea
+
+# Restore the pre-upgrade backup
+wild restore gitea <pre-upgrade-timestamp> --auto
+```
+
+If you didn't take a backup, you can try reverting to the previous version:
+
+```bash
+# Re-add the old version (if the Wild Directory still has it as a waypoint)
+wild app add gitea --version <old-version>
+wild app deploy gitea
+```
+
+## Infrastructure Service Upgrades
+
+Infrastructure services (MetalLB, Traefik, cert-manager, etc.) follow the same update process:
+
+```bash
+# Check status
+wild service status traefik
+
+# Update
+wild service update traefik
+```
+
+Or reinstall from the Wild Directory:
+
+```bash
+wild service install traefik --fetch
+```
+
+## Troubleshooting
+
+### App won't start after upgrade
+
+```bash
+# Check pod status and logs
+kubectl get pods -n <app>
+kubectl logs -n <app> deploy/<app>
+
+# Check events for scheduling or resource issues
+kubectl get events -n <app> --sort-by='.lastTimestamp'
+```
+
+### Database migration failed
+
+Check the migration job:
+
+```bash
+kubectl get jobs -n <app>
+kubectl logs job/<migration-job-name> -n <app>
+```
+
+Migration jobs are designed to be idempotent — you can re-run the upgrade after fixing the issue.
+
+### Config key errors after upgrade
+
+If templates reference old config keys, the upgrade may have included `configMigrations` that didn't run. Check the app's manifest for renamed keys and update your config manually:
+
+```bash
+wild config show | grep <app>
+```
+
+## Related Guides
+
+- [Making Backups](making-backups.md) — Always backup before upgrading
+- [Restoring Backups](restoring-backups.md) — Rolling back after a bad upgrade
--- a/docs/guides/upgrade-kubernetes.md
+++ b/docs/guides/upgrade-kubernetes.md
@@ -1,3 +1,66 @@
 # Upgrade Kubernetes

-TBD
+In Wild Cloud, Kubernetes is bundled with Talos Linux. Upgrading Kubernetes means upgrading Talos to a version that includes the desired Kubernetes release.
+
+## How It Works
+
+Each Talos version ships with a specific Kubernetes version. When you upgrade a node's Talos version, the Kubernetes components on that node are upgraded automatically.
+
+To find out which Kubernetes version is bundled with a Talos release, check the [Talos release notes](https://www.talos.dev/latest/introduction/what-is-new/).
+
+## Check Current Versions
+
+```bash
+# Current Kubernetes version
+wild cluster status
+
+# Current Talos and Kubernetes versions per node
+wild node list
+
+# Client and API server versions as reported by kubectl
+kubectl version
+```
+
+## Upgrade Process
+
+Since Kubernetes upgrades are part of Talos upgrades, follow the [Upgrade Talos](upgrade-talos.md) guide. The key points:
+
+1. **Identify the target Talos version** that includes the Kubernetes version you want
+2. **Upgrade worker nodes first**, one at a time
+3. **Upgrade control plane nodes last**, one at a time, verifying cluster health between each
+4. **Verify** the cluster is healthy after all nodes are upgraded
+
+```bash
+# After upgrading all nodes, verify the Kubernetes version
+wild cluster status
+kubectl version
+```
+
+## Kubernetes Version Skew
+
+Talos enforces Kubernetes version compatibility automatically. Within a multi-node cluster during a rolling upgrade:
+
+- Control plane components can differ by at most 1 minor version
+- kubelet can be up to 3 minor versions behind the API server (2 for kubelet versions older than 1.25), and must never be newer than it
+
+Upgrade nodes one at a time to stay within these bounds.
+
+## Troubleshooting
+
+### Pods stuck after Kubernetes upgrade
+
+Some workloads may need to be restarted after a Kubernetes upgrade:
+
+```bash
+# Restart all deployments in a namespace
+kubectl rollout restart deployment -n <namespace>
+```
+
+### API incompatibilities
+
+If you skip multiple Kubernetes minor versions, deprecated APIs may break manifests. Check the [Kubernetes deprecation guide](https://kubernetes.io/docs/reference/using-api/deprecation-guide/) for removed APIs.
+
+## Related Guides
+
+- [Upgrade Talos](upgrade-talos.md) — The actual upgrade procedure
+- [Troubleshoot Cluster](troubleshoot-cluster.md) — Post-upgrade issues
--- a/docs/guides/upgrade-talos.md
+++ b/docs/guides/upgrade-talos.md
@@ -1,3 +1,147 @@
 # Upgrade Talos

-TBD
+This guide covers upgrading Talos Linux on your cluster nodes. Talos upgrades update the OS and core components on each node individually.
+
+## Prerequisites
+
+- Cluster is healthy: `wild cluster health`
+- You know your current Talos version: `wild node list`
+- You know the target version: `wild talos versions`
+
+## Check Available Versions
+
+```bash
+# List stable Talos versions
+wild talos versions
+
+# Include pre-release versions
+wild talos versions --all
+```
+
+## Validate Schematic Compatibility
+
+Before upgrading, verify your node schematic (system extensions like NVIDIA drivers, NFS, etc.) is compatible with the target version:
+
+```bash
+wild talos validate <schematic-id> <target-version>
+```
+
+If extensions are missing for the target version, you may need to create a new schematic at [Image Factory](https://factory.talos.dev).
+
+## Upgrade a Node
+
+```bash
+wild node upgrade <hostname> <target-version>
+```
+
+This will:
+
+1. Validate the schematic-version compatibility
+2. Check that the local `talosctl` is compatible (max 1 minor version gap)
+3. Cordon the node (prevent new workloads from scheduling)
+4. Drain the node (evict running workloads)
+5. Upgrade Talos using the Image Factory installer
+6. Reboot the node
+7. Update the node's version in config.yaml
+
+The operation runs asynchronously. Monitor progress with:
+
+```bash
+wild operation list
+```
+
+## Upgrade Order
+
+For multi-node clusters, upgrade nodes one at a time:
+
+1. **Worker nodes first** — least disruptive
+2. **Control plane nodes last** — one at a time, verify etcd health between each
+
+```bash
+# Upgrade worker nodes
+wild node upgrade worker-1 v1.11.5
+# Wait for worker-1 to be Ready
+wild node upgrade worker-2 v1.11.5
+
+# Then control plane nodes (one at a time)
+wild node upgrade control-1 v1.11.5
+# Verify etcd and cluster health
+wild cluster health
+wild node upgrade control-2 v1.11.5
+wild cluster health
+wild node upgrade control-3 v1.11.5
+```
+
+## Rollback
+
+Talos uses an A/B image scheme — the previous version is always available. If an upgrade causes problems:
+
+```bash
+wild node rollback <hostname>
+```
+
+This reverts the node to its previous Talos version and reboots.
+
+## Upgrade talosctl
+
+If the target Talos version requires a newer talosctl (the client must be within 1 minor version of the node), upgrade talosctl on Wild Central first:
+
+```bash
+# Check current talosctl version
+wild talos client
+
+# Upgrade talosctl
+wild talos client upgrade <version>
+```
+
+## Update the Instance Schematic
+
+To change the default Talos schematic and version for your instance (used when adding new nodes):
+
+```bash
+wild config set cluster.nodes.talos.version v1.11.5
+wild config set cluster.nodes.talos.schematicId <new-schematic-id>
+```
+
+Or via the API:
+```bash
+curl -X PUT http://localhost:5055/api/v1/instances/{instance}/schematic \
+  -H "Content-Type: application/json" \
+  -d '{"schematicId": "<id>", "version": "v1.11.5"}'
+```
+
+## Troubleshooting
+
+### Node stuck after upgrade
+
+```bash
+# Check Talos services
+talosctl --talosconfig <talosconfig-path> services --nodes <node-ip>
+
+# Check Talos logs
+talosctl --talosconfig <talosconfig-path> logs kubelet --nodes <node-ip>
+
+# If the node won't come back, rollback
+wild node rollback <hostname>
+```
+
+### talosctl version mismatch
+
+```
+Error: talosctl version too old for target version
+```
+
+Upgrade talosctl first: `wild talos client upgrade <version>`
+
+### Schematic not available for target version
+
+Create a new schematic at [factory.talos.dev](https://factory.talos.dev) with the extensions you need for the new version, then use the new schematic ID:
+
+```bash
+wild node upgrade <hostname> <version> --schematic-id <new-id>
+```
+
+## Related Guides
+
+- [Upgrade Kubernetes](upgrade-kubernetes.md) — Kubernetes version upgrades
+- [Troubleshoot Cluster](troubleshoot-cluster.md) — When upgrades cause issues
--- a/docs/guides/upgrade-wild-cloud.md
+++ b/docs/guides/upgrade-wild-cloud.md
@@ -1,3 +1,70 @@
 # Upgrade Wild Cloud

-TBD
+This guide covers upgrading Wild Cloud Central itself — the API, CLI, and web app that run on your Wild Central device.
+
+## Check Current Version
+
+```bash
+wild version
+```
+
+This shows the CLI version and, if connected, the API version.
+
+## Upgrade via the .deb Package
+
+If Wild Cloud Central was installed via the `.deb` package:
+
+```bash
+# Download the latest .deb package from the releases page
+# https://git.civilsociety.dev/wild-cloud/wild-cloud/releases
+
+# Install the update
+sudo dpkg -i wild-cloud-central_<version>_<arch>.deb
+sudo apt-get install -f  # Fix any dependency issues
+
+# Restart the service
+sudo systemctl restart wild-cloud-central
+```
+
+## Verify the Upgrade
+
+```bash
+# Check the service is running
+sudo systemctl status wild-cloud-central
+
+# Check the version
+wild version
+
+# Verify API is accessible
+wild daemon status
+```
+
+## What Gets Upgraded
+
+The Wild Cloud Central package includes:
+
+- **Wild API** — the daemon that manages your instances
+- **Wild CLI** — the `wild` command-line tool
+- **Wild Web App** — the browser-based management interface
+
+All three components share the same version number.
+
+## Data Compatibility
+
+Wild Cloud upgrades are backward-compatible with your instance data. Your `config.yaml`, `secrets.yaml`, compiled manifests, and Kubernetes state are not modified by the upgrade.
+
+If a new version introduces new configuration fields, they will use defaults until you configure them.
+
+## Downgrading
+
+To downgrade, install the older `.deb` package:
+
+```bash
+sudo dpkg -i wild-cloud-central_<older-version>_<arch>.deb
+sudo systemctl restart wild-cloud-central
+```
+
+## Related Guides
+
+- [Upgrade Talos](upgrade-talos.md) — Upgrading the OS on cluster nodes
+- [Upgrade Applications](upgrade-applications.md) — Upgrading deployed apps
--- a/web/src/components/BackupRestoreModal.tsx
+++ b/web/src/components/BackupRestoreModal.tsx
@@ -1,4 +1,4 @@
-import { useState, useEffect } from 'react';
+import { useState } from 'react';
 import {
  Dialog,
  DialogContent,
@@ -9,16 +9,7 @@ import {
 } from './ui/dialog';
 import { Button } from './ui/button';
 import { Label } from './ui/label';
-import {
-  Select,
-  SelectContent,
-  SelectItem,
-  SelectTrigger,
-  SelectValue,
-} from './ui/select';
-import { Loader2, AlertCircle, Clock, HardDrive, CheckCircle, Package } from 'lucide-react';
-import { useDeployedApps } from '../hooks/useApps';
-import { useAppBackups } from '../hooks/useBackups';
+import { Loader2, AlertCircle, Clock, HardDrive, CheckCircle } from 'lucide-react';

 interface Backup {
  timestamp: string;
@@ -29,11 +20,11 @@ interface BackupRestoreModalProps {
  isOpen: boolean;
  onClose: () => void;
  mode: 'backup' | 'restore';
-  appName?: string;
+  appName: string;
  instanceName?: string;
  backups?: Backup[];
  isLoading?: boolean;
-  onConfirm: (backupId?: string, appName?: string) => void;
+  onConfirm: (backupId?: string) => void;
  isPending?: boolean;
 }

@@ -41,56 +32,25 @@ export function BackupRestoreModal({
  isOpen,
  onClose,
  mode,
-  appName: initialAppName,
-  instanceName,
+  appName,
  backups = [],
  isLoading = false,
  onConfirm,
  isPending = false,
 }: BackupRestoreModalProps) {
  const [selectedBackupTimestamp, setSelectedBackupTimestamp] = useState<string | null>(null);
-  const [selectedApp, setSelectedApp] = useState<string>(initialAppName || '');
-
-  // For restore mode when no app is pre-selected
-  const { apps: deployedApps, isLoading: isLoadingApps } = useDeployedApps(
-    mode === 'restore' && !initialAppName ? instanceName : null
-  );
-
-  // Get backups for selected app
-  const { backups: appBackups, isLoading: isLoadingBackups } = useAppBackups(
-    mode === 'restore' && selectedApp ? instanceName : null,
-    selectedApp || null
-  );
-
-  // Update selected app when prop changes
-  useEffect(() => {
-    if (initialAppName) {
-      setSelectedApp(initialAppName);
-    }
-  }, [initialAppName]);
-
-  // Use provided backups or fetch them
-  const backupsToShow = initialAppName ? backups : (
-    appBackups?.filter(b => b.status === 'backed_up').map(b => ({
-      timestamp: b.timestamp,
-      size: undefined, // Size computed at call site
-    })) || []
-  );
-
-  const isLoadingData = isLoading || isLoadingApps || isLoadingBackups;

  const handleConfirm = () => {
    if (mode === 'backup') {
      onConfirm();
-    } else if (mode === 'restore' && selectedBackupTimestamp && selectedApp) {
-      onConfirm(selectedBackupTimestamp, selectedApp);
+    } else if (mode === 'restore' && selectedBackupTimestamp) {
+      onConfirm(selectedBackupTimestamp);
    }
    onClose();
  };

  const formatTimestamp = (timestamp: string) => {
    try {
-      // Handle format: 20260301T090145Z -> 2026-03-01T09:01:45Z
      if (timestamp.match(/^\d{8}T\d{6}Z$/)) {
        const formatted = timestamp.replace(
          /^(\d{4})(\d{2})(\d{2})T(\d{2})(\d{2})(\d{2})Z$/,
@@ -98,14 +58,12 @@ export function BackupRestoreModal({
        );
        return new Date(formatted).toLocaleString();
      }
-      // Try standard parsing for other formats
      return new Date(timestamp).toLocaleString();
    } catch {
      return timestamp;
    }
  };

-  // Get relative time
  const getRelativeTime = (timestamp: string) => {
    try {
      let date;
@@ -144,10 +102,8 @@ export function BackupRestoreModal({
          </DialogTitle>
          <DialogDescription>
            {mode === 'backup'
-              ? `Create a backup of the ${initialAppName} application data.`
-              : initialAppName
-                ? `Select a backup to restore for ${initialAppName}.`
-                : 'Select an application and backup to restore.'}
+              ? `Create a backup of the ${appName} application data.`
+              : `Select a backup to restore for ${appName}.`}
          </DialogDescription>
        </DialogHeader>

@@ -160,92 +116,54 @@ export function BackupRestoreModal({
              </p>
            </div>
          ) : (
-            <div className="space-y-4">
-              {/* App Selector (only when no app pre-selected) */}
-              {!initialAppName && (
-                <div className="space-y-2">
-                  <Label htmlFor="app-select">Application</Label>
-                  <Select value={selectedApp} onValueChange={setSelectedApp}>
-                    <SelectTrigger id="app-select">
-                      <SelectValue placeholder="Select an application" />
-                    </SelectTrigger>
-                    <SelectContent>
-                      {isLoadingApps ? (
-                        <div className="flex items-center justify-center p-2">
-                          <Loader2 className="h-4 w-4 animate-spin" />
-                          <span className="ml-2 text-sm">Loading apps...</span>
-                        </div>
-                      ) : deployedApps?.length === 0 ? (
-                        <div className="p-2 text-sm text-muted-foreground text-center">
-                          No apps with backups
-                        </div>
-                      ) : (
-                        deployedApps?.filter((app: any) => app.status === 'deployed').map((app: any) => (
-                          <SelectItem key={app.name} value={app.name}>
-                            <div className="flex items-center gap-2">
-                              <Package className="h-4 w-4" />
-                              {app.name}
-                            </div>
-                          </SelectItem>
-                        ))
-                      )}
-                    </SelectContent>
-                  </Select>
+            <div className="space-y-2">
+              <Label>Select Backup</Label>
+              {isLoading ? (
+                <div className="flex items-center justify-center py-8">
+                  <Loader2 className="h-8 w-8 animate-spin text-muted-foreground" />
                </div>
-              )}
-
-              {/* Backup List */}
-              {(selectedApp || initialAppName) && (
-                <div className="space-y-2">
-                  <Label>Select Backup</Label>
-                  {isLoadingData ? (
-                    <div className="flex items-center justify-center py-8">
-                      <Loader2 className="h-8 w-8 animate-spin text-muted-foreground" />
-                    </div>
-                  ) : backupsToShow.length === 0 ? (
-                    <div className="text-center py-8 bg-muted rounded-lg">
-                      <AlertCircle className="h-12 w-12 text-muted-foreground mx-auto mb-4" />
-                      <p className="text-sm font-medium">No backups available</p>
-                      <p className="text-xs text-muted-foreground mt-1">
-                        Create a backup first before you can restore
-                      </p>
-                    </div>
-                  ) : (
-                    <div className="space-y-2 max-h-72 overflow-y-auto pr-2">
-                      {backupsToShow.map((backup) => (
-                        <button
-                          key={backup.timestamp}
-                          onClick={() => setSelectedBackupTimestamp(backup.timestamp)}
-                          className={`w-full p-3 rounded-lg border text-left transition-all hover:shadow-md ${
-                            selectedBackupTimestamp === backup.timestamp
-                              ? 'border-primary bg-primary/10 ring-2 ring-primary/20'
-                              : 'border-border hover:bg-accent/50'
-                          }`}
-                        >
-                          <div className="flex items-center justify-between mb-1">
-                            <div className="flex items-center gap-2">
-                              <Clock className="h-4 w-4 text-muted-foreground" />
-                              <span className="text-sm font-medium">
-                                {getRelativeTime(backup.timestamp)}
-                              </span>
-                            </div>
-                            {selectedBackupTimestamp === backup.timestamp && (
-                              <CheckCircle className="h-4 w-4 text-primary" />
-                            )}
-                          </div>
-                          <div className="flex items-center gap-3 text-xs text-muted-foreground">
-                            {backup.size && (
-                              <span className="flex items-center gap-1">
-                                <HardDrive className="h-3 w-3" />
-                                {backup.size}
-                              </span>
-                            )}
-                            <span>{formatTimestamp(backup.timestamp)}</span>
-                          </div>
-                        </button>
-                      ))}
-                    </div>
-                  )}
+              ) : backups.length === 0 ? (
+                <div className="text-center py-8 bg-muted rounded-lg">
+                  <AlertCircle className="h-12 w-12 text-muted-foreground mx-auto mb-4" />
+                  <p className="text-sm font-medium">No backups available</p>
+                  <p className="text-xs text-muted-foreground mt-1">
+                    Create a backup first before you can restore
+                  </p>
+                </div>
+              ) : (
+                <div className="space-y-2 max-h-72 overflow-y-auto pr-2">
+                  {backups.map((backup) => (
+                    <button
+                      key={backup.timestamp}
+                      onClick={() => setSelectedBackupTimestamp(backup.timestamp)}
+                      className={`w-full p-3 rounded-lg border text-left transition-all hover:shadow-md ${
+                        selectedBackupTimestamp === backup.timestamp
+                          ? 'border-primary bg-primary/10 ring-2 ring-primary/20'
+                          : 'border-border hover:bg-accent/50'
+                      }`}
+                    >
+                      <div className="flex items-center justify-between mb-1">
+                        <div className="flex items-center gap-2">
+                          <Clock className="h-4 w-4 text-muted-foreground" />
+                          <span className="text-sm font-medium">
+                            {getRelativeTime(backup.timestamp)}
+                          </span>
+                        </div>
+                        {selectedBackupTimestamp === backup.timestamp && (
+                          <CheckCircle className="h-4 w-4 text-primary" />
+                        )}
+                      </div>
+                      <div className="flex items-center gap-3 text-xs text-muted-foreground">
+                        {backup.size && (
+                          <span className="flex items-center gap-1">
+                            <HardDrive className="h-3 w-3" />
+                            {backup.size}
+                          </span>
+                        )}
+                        <span>{formatTimestamp(backup.timestamp)}</span>
+                      </div>
+                    </button>
+                  ))}
                </div>
              )}
            </div>
@@ -260,7 +178,7 @@ export function BackupRestoreModal({
            onClick={handleConfirm}
            disabled={
              isPending ||
-              (mode === 'restore' && (!selectedBackupTimestamp || !selectedApp || backupsToShow.length === 0))
+              (mode === 'restore' && (!selectedBackupTimestamp || backups.length === 0))
            }
          >
            {isPending ? (
@@ -278,4 +196,4 @@ export function BackupRestoreModal({
      </DialogContent>
    </Dialog>
  );
-}
+}
--- a/web/src/components/CentralComponent.tsx
+++ b/web/src/components/CentralComponent.tsx
@@ -12,7 +12,7 @@ import {
  DialogHeader,
  DialogTitle,
 } from './ui/dialog';
-import { HardDrive, Settings, Clock, CheckCircle, BookOpen, ExternalLink, Loader2, AlertCircle, Database, FolderTree, Mail, Router, Edit2, Check, X, XCircle, Play, RotateCw, Copy, ChevronDown, ChevronUp, Edit, ArrowUpCircle, Terminal } from 'lucide-react';
+import { HardDrive, Settings, CheckCircle, BookOpen, ExternalLink, Loader2, AlertCircle, FolderTree, Mail, Router, Edit2, Check, X, XCircle, Play, RotateCw, Copy, ChevronDown, ChevronUp, Edit, ArrowUpCircle, Terminal } from 'lucide-react';
 import { Badge } from './ui/badge';
 import { useCentralStatus } from '../hooks/useCentralStatus';
 import { useInstanceConfig, useInstanceContext, useConfig } from '../hooks';
@@ -280,23 +280,6 @@ export function CentralComponent() {
    ),
  });

-  const formatUptime = (seconds?: number) => {
-    if (!seconds) return 'Unknown';
-
-    const days = Math.floor(seconds / 86400);
-    const hours = Math.floor((seconds % 86400) / 3600);
-    const minutes = Math.floor((seconds % 3600) / 60);
-    const secs = Math.floor(seconds % 60);
-
-    const parts = [];
-    if (days > 0) parts.push(`${days}d`);
-    if (hours > 0) parts.push(`${hours}h`);
-    if (minutes > 0) parts.push(`${minutes}m`);
-    if (secs > 0 || parts.length === 0) parts.push(`${secs}s`);
-
-    return parts.join(' ');
-  };
-
  // Show error state
  if (statusError) {
    return (
--- a/web/src/components/ClusterNodesComponent.tsx
+++ b/web/src/components/ClusterNodesComponent.tsx
@@ -43,6 +43,7 @@ export function ClusterNodesComponent() {
    updateNode,
    applyNode,
    isApplying,
+    rebootNode,
    refetch
  } = useNodes(currentInstance);

@@ -90,6 +91,7 @@ export function ClusterNodesComponent() {
  const [detectError, setDetectError] = useState<string | null>(null);
  const [discoverSuccess, setDiscoverSuccess] = useState<string | null>(null);
  const [deleteNodeTarget, setDeleteNodeTarget] = useState<string | null>(null);
+  const [rebootNodeTarget, setRebootNodeTarget] = useState<string | null>(null);
  const [showBootstrapModal, setShowBootstrapModal] = useState(false);
  const [bootstrapNode, setBootstrapNode] = useState<{ name: string; ip: string } | null>(null);
  const [drawerState, setDrawerState] = useState<{
@@ -285,6 +287,18 @@ export function ClusterNodesComponent() {
    await deleteNode(hostname);
  };

+  const handleRebootNode = (hostname: string) => {
+    setRebootNodeTarget(hostname);
+  };
+
+  const confirmRebootNode = async () => {
+    if (!rebootNodeTarget) return;
+    const hostname = rebootNodeTarget;
+    setRebootNodeTarget(null);
+    closeDrawer();
+    rebootNode(hostname);
+  };
+
  const handleDiscover = () => {
    setDiscoverError(null);
    setDiscoverSuccess(null);
@@ -595,10 +609,30 @@ export function ClusterNodesComponent() {
          onDelete={drawerState.mode === 'configure' && drawerState.node ? async () => {
            handleDeleteNode(drawerState.node!.hostname);
          } : undefined}
+          onReboot={drawerState.mode === 'configure' && drawerState.node ? () => {
+            handleRebootNode(drawerState.node!.hostname);
+          } : undefined}
          instanceName={currentInstance || ''}
        />
      )}

+      <AlertDialog open={!!rebootNodeTarget} onOpenChange={(open) => { if (!open) setRebootNodeTarget(null); }}>
+        <AlertDialogContent>
+          <AlertDialogHeader>
+            <AlertDialogTitle>Reboot node</AlertDialogTitle>
+            <AlertDialogDescription>
+              This will reboot node {rebootNodeTarget}. The node will restart and rejoin the cluster automatically. Running workloads on this node will be interrupted.
+            </AlertDialogDescription>
+          </AlertDialogHeader>
+          <AlertDialogFooter>
+            <AlertDialogCancel>Cancel</AlertDialogCancel>
+            <AlertDialogAction onClick={confirmRebootNode}>
+              Reboot
+            </AlertDialogAction>
+          </AlertDialogFooter>
+        </AlertDialogContent>
+      </AlertDialog>
+
      <AlertDialog open={!!deleteNodeTarget} onOpenChange={(open) => { if (!open) setDeleteNodeTarget(null); }}>
        <AlertDialogContent>
          <AlertDialogHeader>
--- a/web/src/components/apps/AppDetailPanel.tsx
+++ b/web/src/components/apps/AppDetailPanel.tsx
@@ -36,6 +36,7 @@ import { useAppEnhanced, useAppReadme, useAppEvents, useAppLogs, useAppManifests
 import { apiClient } from '@/services/api/client';
 import { appsApi } from '@/services/api/apps';
 import { operationsApi } from '@/services/api';
+import type { BackupResourceInfo } from '@/services/api/backups';

 interface AppDetailPanelProps {
  instanceName: string;
@@ -79,7 +80,7 @@ export function AppDetailPanel({
    pod?: string;
    container?: string;
  }>({ tail: 100 });
-  const [backupResources, setBackupResources] = useState<any[]>([]);
+  const [backupResources, setBackupResources] = useState<BackupResourceInfo[]>([]);
  const [loadingBackupResources, setLoadingBackupResources] = useState(false);
  const [hasLoadedBackupResources, setHasLoadedBackupResources] = useState(false);
  const [activeTab, setActiveTab] = useState('overview');
@@ -213,7 +214,7 @@ export function AppDetailPanel({
      setLoadingBackupResources(true);
      apiClient.get(`/api/v1/instances/${instanceName}/apps/${appName}/backup/discover`)
        .then((response) => {
-          const data = response as { data?: { resources?: any[] } };
+          const data = response as { data?: { resources?: BackupResourceInfo[] } };
          // Handle both empty array and actual resources
          const resources = data.data?.resources || [];
          setBackupResources(resources);
--- a/web/src/components/backup/BackupDetailsModal.tsx
+++ b/web/src/components/backup/BackupDetailsModal.tsx
@@ -26,7 +26,7 @@ interface BackupDetailsModalProps {
  backup: RecoveryPlan | null;
  isOpen: boolean;
  onClose: () => void;
-  onRestore: (backup: RecoveryPlan) => void;
+  onRestore?: (backup: RecoveryPlan) => void;
 }

 export function BackupDetailsModal({
@@ -222,7 +222,7 @@ export function BackupDetailsModal({
        </div>

        <DialogFooter className="gap-2">
-          {backup.status === 'backed_up' && (
+          {onRestore && backup.status === 'backed_up' && (
            <Button
              onClick={() => {
                onRestore(backup);
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Paul Payne	e82c92b72e	Node health monitoring.	2026-05-25 07:35:53 +00:00
Paul Payne	270fbeabef	Adds node reboot.	2026-05-25 07:26:29 +00:00
Paul Payne	fdab9484a6	feat: Add cluster config backup and move schedules to per-app backup pages Cluster config backup archives kubeconfig, talosconfig, config.yaml, secrets.yaml, and Talos node configs for disaster recovery. Appears as "Cluster Config" row on the backups page with its own detail page. Backup schedules are now shown on each app's individual backup page instead of the main backups overview, with active operations visible per-app for real-time feedback during backup/restore. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-05-24 21:54:46 +00:00
Paul Payne	322492a85f	fix: Resolve SSE test race condition by making client registration synchronous RegisterClient was async (channel-based), so Broadcast could be processed before the client was registered in the map, causing flaky test failures. Register directly under the mutex instead. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-05-24 21:54:13 +00:00
Paul Payne	3f97dce86a	docs: Update all guides to reflect current CLI, API, and web app Rewrote backup/restore guides to document current system (native pg_dump/Longhorn/tar.gz tools, blue-green restore, scheduling) and remove outdated restic references. Rewrote monitoring guide to replace K3s/Helm/Velero placeholders with actual capabilities. Filled in all four upgrade guides (Talos, Kubernetes, applications, Wild Cloud) that were previously TBD stubs. Expanded troubleshooting guides with correct namespaces, Wild Cloud CLI commands, and Talos-specific diagnostics. Added verification commands to cluster networking health checklist. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-05-24 21:54:11 +00:00
Paul Payne	11c875a513	fix: Resolve all golangci-lint errors across API codebase Handle unchecked errors (errcheck), fix nil-deref false positives (SA5011), suppress deprecated-but-functional API warnings (SA1019), remove unused code, and use fmt.Fprintf over WriteString(fmt.Sprintf(...)). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-05-24 21:52:59 +00:00
Paul Payne	e051e80601	fix: Resolve eslint errors across web UI Remove unused imports (Clock, Database) and dead code (formatUptime), replace `any` types with proper types (BackupResourceInfo, QueryClient, Record<string, unknown>), fix DeployedApp/App type incompatibility, and use const for module-level collections in SSE hook. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-05-24 21:32:06 +00:00
Paul Payne	fd58c7b694	Linting.	2026-05-24 21:24:40 +00:00
Paul Payne	3e9aa153e2	Go format.	2026-05-24 20:54:13 +00:00
Paul Payne	7cad37db07	More logging.	2026-05-24 20:40:02 +00:00
Paul Payne	eff5246144	Add more resiliency to backups and operations. Use Longhorn CRDs instead of a janky tunnel.	2026-05-24 20:35:51 +00:00
Paul Payne	81604879dc	slog integration	2026-05-24 20:29:22 +00:00
Paul Payne	44c7cb6f72	Bakup UX.	2026-05-24 20:03:27 +00:00