Nav state.

Restore strategies.
Improve Directory pages.
2026-05-27 07:03:04 +00:00 · 2026-05-25 23:09:39 +00:00 · 2026-05-25 22:29:47 +00:00 · 2026-05-25 22:01:55 +00:00 · 2026-05-25 22:01:43 +00:00 · 2026-05-25 22:01:20 +00:00
144 changed files with 9151 additions and 3479 deletions
--- a/api/.air.toml
+++ b/api/.air.toml
@@ -0,0 +1,19 @@
+root = "."
+tmp_dir = "tmp"
+
+[build]
+  bin = "./tmp/wildd"
+  cmd = "go build -o ./tmp/wildd ."
+  delay = 1000
+  exclude_dir = ["tmp", "build", "dist", "vendor"]
+  exclude_regex = ["_test.go$"]
+  include_ext = ["go", "yaml"]
+  kill_delay = "0s"
+  send_interrupt = true
+  stop_on_error = true
+
+[log]
+  time = false
+
+[misc]
+  clean_on_exit = true
--- a/api/.gitignore
+++ b/api/.gitignore
@@ -22,3 +22,6 @@ __debug*
 # Go workspace file
 go.work
 go.work.sum
+
+# Air live-reload
+tmp/
--- a/api/Makefile
+++ b/api/Makefile
@@ -29,8 +29,14 @@ build: ## Build the daemon binary
 	$(GOBUILD) $(LDFLAGS) -o $(BUILD_DIR)/$(BINARY_NAME) .

 dev: ## Run the daemon in development mode with live reloading
-	@echo "Starting $(BINARY_NAME) in development mode..."
-	$(GOCMD) run .
+	@if command -v air >/dev/null 2>&1; then \
+		echo "Starting $(BINARY_NAME) in development mode with live reloading (air)..."; \
+		air; \
+	else \
+		echo "air not found. Install it for live reloading: go install github.com/air-verse/air@latest"; \
+		echo "Starting $(BINARY_NAME) in development mode without live reloading..."; \
+		$(GOCMD) run .; \
+	fi

 test: ## Run tests
 	@echo "Running tests..."
--- a/api/README.md
+++ b/api/README.md
@@ -4,7 +4,7 @@ The Wild Central API is a lightweight service that runs on a local machine (e.g.

 ## Development

-Start the development server:
+Start the development server with live reloading:

 ```bash
 make dev
@@ -12,6 +12,14 @@ make dev

 The API will be available at `http://localhost:5055`.

+`make dev` uses [air](https://github.com/air-verse/air) to automatically rebuild and restart the server when `.go` or `.yaml` files change. Install it with:
+
+```bash
+go install github.com/air-verse/air@latest
+```
+
+If `air` is not installed, `make dev` falls back to `go run .` (no live reloading).
+
 ### Environment Variables

 - `WILD_API_DATA_DIR` - Directory for instance data (default: `/var/lib/wild-central`)
--- a/api/internal/api/v1/async.go
+++ b/api/internal/api/v1/async.go
@@ -2,7 +2,7 @@ package v1

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"net/http"

 	"github.com/wild-cloud/wild-central/daemon/internal/operations"
@@ -38,7 +38,7 @@ func (api *API) StartAsyncOperation(
 		// Always recover from panics to prevent goroutine crashes from taking down the server
 		defer func() {
 			if r := recover(); r != nil {
-				log.Printf("[ERROR] Panic in async operation %s/%s: %v", operationType, target, r)
+				slog.Error("panic in async operation", "type", operationType, "target", target, "error", r)
 				_ = opsMgr.Update(instanceName, opID, "failed", fmt.Sprintf("Internal error: %v", r), 0)
 			}
 		}()
@@ -71,7 +71,7 @@ func (api *API) StartAsyncOperationWithMessage(
 	go func() {
 		defer func() {
 			if r := recover(); r != nil {
-				log.Printf("[ERROR] Panic in async operation %s/%s: %v", operationType, target, r)
+				slog.Error("panic in async operation", "type", operationType, "target", target, "error", r)
 				_ = opsMgr.Update(instanceName, opID, "failed", fmt.Sprintf("Internal error: %v", r), 0)
 			}
 		}()
@@ -105,7 +105,7 @@ func (api *API) StartAsyncOperationWithBroadcaster(
 	go func() {
 		defer func() {
 			if r := recover(); r != nil {
-				log.Printf("[ERROR] Panic in async operation %s/%s: %v", operationType, target, r)
+				slog.Error("panic in async operation", "type", operationType, "target", target, "error", r)
 				_ = opsMgr.Update(instanceName, opID, "failed", fmt.Sprintf("Internal error: %v", r), 0)
 			}
 		}()
--- a/api/internal/api/v1/handlers.go
+++ b/api/internal/api/v1/handlers.go
@@ -3,7 +3,7 @@ package v1
 import (
 	"encoding/json"
 	"fmt"
-	"log"
+	"log/slog"
 	"net/http"
 	"os"
 	"time"
@@ -25,18 +25,18 @@ import (

 // API holds all dependencies for API handlers
 type API struct {
-	dataDir         string
-	appsDir         string // Path to external apps directory
-	config          *config.Manager
-	secrets         *secrets.Manager
-	context         *context.Manager
-	instance        *instance.Manager
-	dnsmasq         *dnsmasq.ConfigGenerator
-	opsMgr          *operations.Manager     // Operations manager
-	broadcaster     *operations.Broadcaster // SSE broadcaster for operation output
-	sseManager      *sse.Manager            // SSE manager for real-time events
-	watcherManager  *sse.WatcherManager     // Manager for kubectl/talos watchers
-	factory         *factory.Client         // Talos Image Factory client
+	dataDir        string
+	appsDir        string // Path to external apps directory
+	config         *config.Manager
+	secrets        *secrets.Manager
+	context        *context.Manager
+	instance       *instance.Manager
+	dnsmasq        *dnsmasq.ConfigGenerator
+	opsMgr         *operations.Manager     // Operations manager
+	broadcaster    *operations.Broadcaster // SSE broadcaster for operation output
+	sseManager     *sse.Manager            // SSE manager for real-time events
+	watcherManager *sse.WatcherManager     // Manager for kubectl/talos watchers
+	factory        *factory.Client         // Talos Image Factory client
 }

 // NewAPI creates a new API handler with all dependencies
@@ -59,7 +59,7 @@ func NewAPI(dataDir, appsDir string) (*API, error) {
 	dnsmasqConfigPath := "/etc/dnsmasq.d/wild-cloud.conf"
 	if os.Getenv("WILD_API_DNSMASQ_CONFIG_PATH") != "" {
 		dnsmasqConfigPath = os.Getenv("WILD_API_DNSMASQ_CONFIG_PATH")
-		log.Printf("Using custom dnsmasq config path: %s", dnsmasqConfigPath)
+		slog.Info("using custom dnsmasq config path", "path", dnsmasqConfigPath)
 	}

 	// Create SSE manager for real-time events
@@ -73,24 +73,23 @@ func NewAPI(dataDir, appsDir string) (*API, error) {
 	opsMgr.SetSSEManager(adapter)

 	api := &API{
-		dataDir:         dataDir,
-		appsDir:         appsDir,
-		config:          configMgr,
-		secrets:         secrets.NewManager(),
-		context:         context.NewManager(dataDir),
-		instance:        instance.NewManager(dataDir),
-		dnsmasq:         dnsmasq.NewConfigGenerator(dnsmasqConfigPath),
-		opsMgr:          opsMgr,
-		broadcaster:     operations.NewBroadcaster(),
-		sseManager:      sseManager,
-		watcherManager:  watcherManager,
-		factory:         factory.NewClient(),
+		dataDir:        dataDir,
+		appsDir:        appsDir,
+		config:         configMgr,
+		secrets:        secrets.NewManager(),
+		context:        context.NewManager(dataDir),
+		instance:       instance.NewManager(dataDir),
+		dnsmasq:        dnsmasq.NewConfigGenerator(dnsmasqConfigPath),
+		opsMgr:         opsMgr,
+		broadcaster:    operations.NewBroadcaster(),
+		sseManager:     sseManager,
+		watcherManager: watcherManager,
+		factory:        factory.NewClient(),
 	}

 	return api, nil
 }

-
 // StartCentralStatusBroadcaster starts periodic broadcasting of central status
 func (api *API) StartCentralStatusBroadcaster(startTime time.Time) {
 	go func() {
@@ -107,6 +106,8 @@ func (api *API) StartCentralStatusBroadcaster(startTime time.Time) {
 }

 func (api *API) RegisterRoutes(r *mux.Router) {
+	// Request logging middleware (runs first, wraps everything)
+	r.Use(RequestLoggingMiddleware)
 	// Apply instance validation middleware to all routes with {name} parameter
 	r.Use(api.ValidateInstanceMiddleware)

@@ -145,6 +146,8 @@ func (api *API) RegisterRoutes(r *mux.Router) {
 	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}", api.NodeGet).Methods("GET")
 	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}", api.NodeUpdate).Methods("PUT")
 	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/apply", api.NodeApply).Methods("POST")
+	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/health", api.NodeHealth).Methods("GET")
+	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/reboot", api.NodeReboot).Methods("POST")
 	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/reset", api.NodeReset).Methods("POST")
 	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/upgrade", api.NodeUpgrade).Methods("POST")
 	r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/rollback", api.NodeRollback).Methods("POST")
@@ -229,6 +232,11 @@ func (api *API) RegisterRoutes(r *mux.Router) {
 	r.HandleFunc("/api/v1/instances/{name}/apps/{app}/restore/cleanup", api.BackupAppCleanup).Methods("POST")
 	r.HandleFunc("/api/v1/instances/{name}/apps/{app}/restore/plan", api.BackupAppRecoveryPlan).Methods("GET")

+	// Backup & Restore - Cluster Config
+	r.HandleFunc("/api/v1/instances/{name}/backup/cluster", api.BackupClusterStart).Methods("POST")
+	r.HandleFunc("/api/v1/instances/{name}/backup/cluster", api.BackupClusterList).Methods("GET")
+	r.HandleFunc("/api/v1/instances/{name}/backup/cluster/{timestamp}", api.BackupClusterDelete).Methods("DELETE")
+
 	// Backup Schedules
 	r.HandleFunc("/api/v1/instances/{name}/backup/schedules", api.BackupScheduleList).Methods("GET")
 	r.HandleFunc("/api/v1/instances/{name}/backup/schedules", api.BackupScheduleCreate).Methods("POST")
@@ -236,8 +244,10 @@ func (api *API) RegisterRoutes(r *mux.Router) {
 	r.HandleFunc("/api/v1/instances/{name}/backup/schedules/{scheduleId}", api.BackupScheduleDelete).Methods("DELETE")
 	r.HandleFunc("/api/v1/instances/{name}/backup/schedules/{scheduleId}/run", api.BackupScheduleRun).Methods("POST")

-	// Backup Health
+	// Backup Health & Configuration
 	r.HandleFunc("/api/v1/instances/{name}/backup/health", api.BackupHealth).Methods("GET")
+	r.HandleFunc("/api/v1/instances/{name}/backup/config", api.BackupConfigGet).Methods("GET")
+	r.HandleFunc("/api/v1/instances/{name}/backup/config", api.BackupConfigUpdate).Methods("PUT")

 	// Global Configuration
 	r.HandleFunc("/api/v1/config", api.GetGlobalConfig).Methods("GET")
@@ -299,7 +309,7 @@ func (api *API) CreateInstance(w http.ResponseWriter, r *http.Request) {
 	}

 	if err := api.updateDnsmasqForAllInstances(); err != nil {
-		log.Printf("Warning: Could not update dnsmasq configuration: %v", err)
+		slog.Error("dnsmasq config update failed", "instance", req.Name, "error", err)
 		response["warning"] = fmt.Sprintf("dnsmasq update failed: %v. Use POST /api/v1/dnsmasq/update to retry.", err)
 	}

@@ -387,7 +397,7 @@ func (api *API) GetConfig(w http.ResponseWriter, r *http.Request) {
 		// Return raw YAML
 		w.Header().Set("Content-Type", "application/yaml")
 		w.WriteHeader(http.StatusOK)
-		w.Write(configData)
+		_, _ = w.Write(configData)
 		return
 	}

--- a/api/internal/api/v1/handlers_apps.go
+++ b/api/internal/api/v1/handlers_apps.go
@@ -385,7 +385,7 @@ func (api *API) AppsGetReadme(w http.ResponseWriter, r *http.Request) {
 	content, err := os.ReadFile(instancePath)
 	if err == nil {
 		w.Header().Set("Content-Type", "text/markdown; charset=utf-8")
-		w.Write(content)
+		_, _ = w.Write(content)
 		return
 	}

@@ -402,7 +402,7 @@ func (api *API) AppsGetReadme(w http.ResponseWriter, r *http.Request) {
 	}

 	w.Header().Set("Content-Type", "text/markdown; charset=utf-8")
-	w.Write(content)
+	_, _ = w.Write(content)
 }

 // AppsGetManifest returns the manifest for an available app
@@ -428,8 +428,8 @@ func (api *API) AppsGetAvailableReadme(w http.ResponseWriter, r *http.Request) {
 		return
 	}

-	readmePath := filepath.Join(api.appsDir, appName, "README.md")
-	content, err := os.ReadFile(readmePath)
+	appsMgr := apps.NewManager(api.dataDir, api.appsDir)
+	content, err := appsMgr.GetCatalogReadme(appName)
 	if err != nil {
 		if os.IsNotExist(err) {
 			respondError(w, http.StatusNotFound, fmt.Sprintf("README not found for app '%s'", appName))
@@ -440,7 +440,7 @@ func (api *API) AppsGetAvailableReadme(w http.ResponseWriter, r *http.Request) {
 	}

 	w.Header().Set("Content-Type", "text/markdown; charset=utf-8")
-	w.Write(content)
+	_, _ = w.Write(content)
 }

 // AppsCompile recompiles an app's templates
@@ -487,7 +487,7 @@ func (api *API) AppsGetManifests(w http.ResponseWriter, r *http.Request) {
 	var manifest apps.AppManifest
 	manifestPath := filepath.Join(appDir, "manifest.yaml")
 	if data, err := os.ReadFile(manifestPath); err == nil {
-		yaml.Unmarshal(data, &manifest)
+		_ = yaml.Unmarshal(data, &manifest)
 	}

 	// Build list of kustomize directories to render
@@ -523,7 +523,7 @@ func (api *API) AppsGetManifests(w http.ResponseWriter, r *http.Request) {
 	}

 	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
-	w.Write(allOutput)
+	_, _ = w.Write(allOutput)
 }

 // AppsRunScript runs a named script defined in the app's manifest
--- a/api/internal/api/v1/handlers_backup.go
+++ b/api/internal/api/v1/handlers_backup.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
+	"log/slog"
 	"net/http"
 	"os"
 	"os/exec"
@@ -370,7 +371,7 @@ func (api *API) BackupAppDelete(w http.ResponseWriter, r *http.Request) {
 				"app": appName,
 			},
 		})
-		respondError(w, http.StatusInternalServerError, "Failed to delete backup")
+		respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to delete backup: %v", err))
 		return
 	}

@@ -454,12 +455,12 @@ func (api *API) BackupAppVerify(w http.ResponseWriter, r *http.Request) {

 // BackupResourceInfo contains information about a discovered backup resource
 type BackupResourceInfo struct {
-	Name         string                 `json:"name"`
-	Type         string                 `json:"type"`  // "database", "pvc", "secret"
-	Plugin       string                 `json:"plugin"` // "postgres", "mysql", "longhorn-pvc", etc.
+	Name         string         `json:"name"`
+	Type         string         `json:"type"`   // "database", "pvc", "secret"
+	Plugin       string         `json:"plugin"` // "postgres", "mysql", "longhorn-pvc", etc.
 	Source       map[string]any `json:"source"` // Resource-specific info
-	ShouldBackup bool                   `json:"shouldBackup"`
-	Reason       string                 `json:"reason,omitempty"` // Why it's included/excluded
+	ShouldBackup bool           `json:"shouldBackup"`
+	Reason       string         `json:"reason,omitempty"` // Why it's included/excluded
 }

 // BackupAppDiscoverResources auto-discovers backup resources for an app
@@ -590,9 +591,9 @@ func parsePVC(pvc map[string]any) BackupResourceInfo {
 	}

 	return BackupResourceInfo{
-		Name:         name,
-		Type:         "pvc",
-		Plugin:       plugin,
+		Name:   name,
+		Type:   "pvc",
+		Plugin: plugin,
 		Source: map[string]any{
 			"pvcName":      name,
 			"storageClass": storageClass,
@@ -635,9 +636,9 @@ func parseVolumeClaimTemplate(vct map[string]any, statefulSetName string) Backup
 	}

 	return BackupResourceInfo{
-		Name:         pvcName,
-		Type:         "pvc",
-		Plugin:       detectStoragePlugin(storageClass),
+		Name:   pvcName,
+		Type:   "pvc",
+		Plugin: detectStoragePlugin(storageClass),
 		Source: map[string]any{
 			"pvcName":      pvcName,
 			"storageClass": storageClass,
@@ -684,7 +685,7 @@ func discoverDatabases(dataDir, instanceName, appName, manifestPath string) []Ba
 		configPath := tools.GetInstanceConfigPath(dataDir, instanceName)
 		configData, _ := os.ReadFile(configPath)
 		var config map[string]any
-		yaml.Unmarshal(configData, &config)
+		_ = yaml.Unmarshal(configData, &config)

 		appConfig := map[string]any{}
 		if apps, ok := config["apps"].(map[string]any); ok {
@@ -998,13 +999,18 @@ func (api *API) BackupScheduleRun(w http.ResponseWriter, r *http.Request) {
 		return
 	}

-	if sched.TargetType != "app" {
-		respondError(w, http.StatusBadRequest, "Only app schedules can be triggered manually")
+	if sched.TargetType != "app" && sched.TargetType != "cluster" {
+		respondError(w, http.StatusBadRequest, "Unsupported schedule target type")
 		return
 	}

+	opTarget := sched.TargetName
+	if sched.TargetType == "cluster" {
+		opTarget = "_cluster"
+	}
+
 	// Run as async operation
-	api.StartAsyncOperation(w, instanceName, "backup", sched.TargetName,
+	api.StartAsyncOperation(w, instanceName, "backup", opTarget,
 		func(opsMgr *operations.Manager, opID string) error {
 			_ = opsMgr.UpdateProgress(instanceName, opID, 10, "Starting scheduled backup")

@@ -1013,7 +1019,13 @@ func (api *API) BackupScheduleRun(w http.ResponseWriter, r *http.Request) {
 			}

 			mgr := backup.NewManagerWithProgress(api.dataDir, progressCallback)
-			_, err := mgr.BackupApp(instanceName, sched.TargetName)
+
+			var err error
+			if sched.TargetType == "cluster" {
+				_, err = mgr.BackupClusterConfig(instanceName)
+			} else {
+				_, err = mgr.BackupApp(instanceName, sched.TargetName)
+			}

 			if err == nil {
 				// Update lastRun and nextRun
@@ -1021,26 +1033,28 @@ func (api *API) BackupScheduleRun(w http.ResponseWriter, r *http.Request) {
 				sched.LastRun = &now
 				next := backup.ComputeNextRun(sched, now)
 				sched.NextRun = &next
-				backup.SaveInstanceBackupSchedules(api.dataDir, instanceName, config.Schedules)
+				if err := backup.SaveInstanceBackupSchedules(api.dataDir, instanceName, config.Schedules); err != nil {
+					slog.Error("failed to save backup schedules", "instance", instanceName, "error", err)
+				}

 				api.sseManager.Broadcast(&sse.Event{
 					Type:         "backup:schedule:completed",
 					InstanceName: instanceName,
 					Data: map[string]any{
 						"scheduleId": scheduleID,
-						"app":        sched.TargetName,
+						"target":     opTarget,
 					},
 				})

-				// Enforce retention using schedule's policy
+				// Enforce retention
 				keepLast, keepDays := backup.RetentionFromSchedule(sched, config.Retention)
-				deleted, retErr := backup.EnforceRetention(mgr, instanceName, sched.TargetName, keepLast, keepDays)
+				deleted, retErr := backup.EnforceRetention(mgr, instanceName, opTarget, keepLast, keepDays)
 				if retErr == nil && deleted > 0 {
 					api.sseManager.Broadcast(&sse.Event{
 						Type:         "backup:retention:completed",
 						InstanceName: instanceName,
 						Data: map[string]any{
-							"app":     sched.TargetName,
+							"target":  opTarget,
 							"deleted": deleted,
 						},
 					})
@@ -1063,6 +1077,9 @@ func (api *API) BackupHealth(w http.ResponseWriter, r *http.Request) {

 	mgr := backup.NewManager(api.dataDir)

+	// Compute default retention limit
+	defaultKeepLast, _ := backup.DefaultRetention(config.Retention)
+
 	// Get all apps with backups by scanning the backup directory
 	backupDir := mgr.GetBackupDir(instanceName)
 	appHealth := make(map[string]any)
@@ -1079,8 +1096,38 @@ func (api *API) BackupHealth(w http.ResponseWriter, r *http.Request) {
 			continue
 		}

+		// Compute total size across all backups for this app
+		var totalSize int64
+		for _, p := range plans {
+			for _, s := range p.Strategies {
+				if s.Backup != nil {
+					if size, ok := s.Backup["size"]; ok {
+						switch v := size.(type) {
+						case int64:
+							totalSize += v
+						case int:
+							totalSize += int64(v)
+						case float64:
+							totalSize += int64(v)
+						}
+					}
+				}
+			}
+		}
+
+		// Determine retention limit for this app (schedule override or default)
+		keepLast := defaultKeepLast
+		for _, sched := range config.Schedules {
+			if sched.TargetName == appName && sched.Enabled && sched.Retention != nil && sched.Retention.KeepLast > 0 {
+				keepLast = sched.Retention.KeepLast
+				break
+			}
+		}
+
 		info := map[string]any{
 			"backupCount": len(plans),
+			"retainCount": keepLast,
+			"totalSize":   totalSize,
 			"scheduled":   false,
 		}

@@ -1088,6 +1135,9 @@ func (api *API) BackupHealth(w http.ResponseWriter, r *http.Request) {
 			newest := plans[0]
 			info["lastBackup"] = newest.Timestamp
 			info["lastStatus"] = newest.Status
+			if newest.Version != "" {
+				info["lastVersion"] = newest.Version
+			}
 		}

 		// Check if this app has an active schedule
@@ -1135,3 +1185,180 @@ func (api *API) BackupHealth(w http.ResponseWriter, r *http.Request) {
 	})
 }

+// BackupConfigGet returns the current backup configuration (destination + retention)
+func (api *API) BackupConfigGet(w http.ResponseWriter, r *http.Request) {
+	instanceName := GetInstanceName(r)
+
+	config, err := backup.LoadInstanceBackupConfig(api.dataDir, instanceName)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "Failed to load backup config")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"success": true,
+		"data": map[string]any{
+			"destination":  config.Destination,
+			"retention":    config.Retention,
+			"verification": config.Verification,
+		},
+	})
+}
+
+// BackupConfigUpdate updates the backup destination and/or retention settings
+func (api *API) BackupConfigUpdate(w http.ResponseWriter, r *http.Request) {
+	instanceName := GetInstanceName(r)
+
+	var req struct {
+		Destination *backup.DestinationConfig `json:"destination"`
+		Retention   *backup.RetentionPolicy   `json:"retention"`
+	}
+
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		respondError(w, http.StatusBadRequest, "Invalid request body")
+		return
+	}
+
+	if req.Destination == nil && req.Retention == nil {
+		respondError(w, http.StatusBadRequest, "Must provide destination or retention to update")
+		return
+	}
+
+	// Validate destination type if provided
+	if req.Destination != nil {
+		switch req.Destination.Type {
+		case "local", "nfs", "s3", "azure":
+			// valid
+		default:
+			respondError(w, http.StatusBadRequest, fmt.Sprintf("Invalid destination type: %s", req.Destination.Type))
+			return
+		}
+	}
+
+	if err := backup.SaveInstanceBackupConfig(api.dataDir, instanceName, req.Destination, req.Retention); err != nil {
+		respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to save backup config: %v", err))
+		return
+	}
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"success": true,
+		"message": "Backup configuration updated",
+	})
+}
+
+// BackupClusterStart starts a cluster config backup operation
+func (api *API) BackupClusterStart(w http.ResponseWriter, r *http.Request) {
+	instanceName := GetInstanceName(r)
+
+	api.sseManager.Broadcast(&sse.Event{
+		Type:         "backup:started",
+		InstanceName: instanceName,
+		Data: map[string]any{
+			"app": "_cluster",
+		},
+	})
+
+	api.StartAsyncOperation(w, instanceName, "backup", "_cluster",
+		func(opsMgr *operations.Manager, opID string) error {
+			_ = opsMgr.UpdateProgress(instanceName, opID, 10, "Starting cluster config backup")
+
+			progressCallback := func(progress int, message string) {
+				_ = opsMgr.UpdateProgress(instanceName, opID, progress, message)
+			}
+
+			mgr := backup.NewManagerWithProgress(api.dataDir, progressCallback)
+			_, err := mgr.BackupClusterConfig(instanceName)
+
+			if err != nil {
+				api.sseManager.Broadcast(&sse.Event{
+					Type:         "backup:failed",
+					InstanceName: instanceName,
+					Data: map[string]any{
+						"app":   "_cluster",
+						"error": err.Error(),
+					},
+				})
+			} else {
+				api.sseManager.Broadcast(&sse.Event{
+					Type:         "backup:completed",
+					InstanceName: instanceName,
+					Data: map[string]any{
+						"app": "_cluster",
+					},
+				})
+
+				// Enforce retention after successful backup
+				config, configErr := backup.LoadInstanceBackupConfig(api.dataDir, instanceName)
+				if configErr == nil {
+					keepLast, keepDays := backup.DefaultRetention(config.Retention)
+					deleted, retErr := backup.EnforceRetention(mgr, instanceName, "_cluster", keepLast, keepDays)
+					if retErr == nil && deleted > 0 {
+						api.sseManager.Broadcast(&sse.Event{
+							Type:         "backup:retention:completed",
+							InstanceName: instanceName,
+							Data: map[string]any{
+								"target":  "_cluster",
+								"deleted": deleted,
+							},
+						})
+					}
+				}
+			}
+
+			return err
+		})
+}
+
+// BackupClusterList lists all cluster config backups
+func (api *API) BackupClusterList(w http.ResponseWriter, r *http.Request) {
+	instanceName := GetInstanceName(r)
+
+	mgr := backup.NewManager(api.dataDir)
+	backups, err := mgr.ListBackups(instanceName, "_cluster")
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "Failed to list cluster backups")
+		return
+	}
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"success": true,
+		"data": map[string]any{
+			"backups": backups,
+		},
+	})
+}
+
+// BackupClusterDelete deletes a specific cluster config backup
+func (api *API) BackupClusterDelete(w http.ResponseWriter, r *http.Request) {
+	instanceName := GetInstanceName(r)
+	timestamp := mux.Vars(r)["timestamp"]
+
+	mgr := backup.NewManager(api.dataDir)
+	if err := mgr.DeleteAppBackup(instanceName, "_cluster", timestamp); err != nil {
+		api.sseManager.Broadcast(&sse.Event{
+			Type:         "backup:delete:failed",
+			InstanceName: instanceName,
+			Data: map[string]any{
+				"app":       "_cluster",
+				"timestamp": timestamp,
+				"error":     err.Error(),
+			},
+		})
+		respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to delete backup: %v", err))
+		return
+	}
+
+	api.sseManager.Broadcast(&sse.Event{
+		Type:         "backup:deleted",
+		InstanceName: instanceName,
+		Data: map[string]any{
+			"app":       "_cluster",
+			"timestamp": timestamp,
+		},
+	})
+
+	respondJSON(w, http.StatusOK, map[string]any{
+		"success": true,
+		"message": "Cluster config backup deleted successfully",
+	})
+}
--- a/api/internal/api/v1/handlers_backup_test.go
+++ b/api/internal/api/v1/handlers_backup_test.go
@@ -9,6 +9,7 @@ import (
 	"os"
 	"path/filepath"
 	"testing"
+	"time"

 	"github.com/gorilla/mux"
 	"github.com/stretchr/testify/assert"
@@ -740,10 +741,10 @@ func TestIsDatabase(t *testing.T) {
 // It verifies that we can find PVCs and StatefulSet volume claims in Kubernetes manifests
 func TestDiscoverFromKustomize(t *testing.T) {
 	tests := []struct {
-		name           string
-		kustomizeYAML  string
-		expectedCount  int
-		expectedFirst  BackupResourceInfo
+		name          string
+		kustomizeYAML string
+		expectedCount int
+		expectedFirst BackupResourceInfo
 	}{
 		{
 			name: "Discovers PVC as persistent state",
@@ -954,6 +955,23 @@ func splitYAMLDocuments(content string) []string {
 	return docs
 }

+// waitForAsyncOp polls an operation until it reaches a terminal state.
+// Prevents t.TempDir() cleanup from racing with async goroutines that write operation files.
+func waitForAsyncOp(t *testing.T, opsMgr *operations.Manager, instanceName string, response map[string]interface{}) {
+	t.Helper()
+	opID, ok := response["operation_id"].(string)
+	if !ok || opID == "" {
+		return
+	}
+	for range 100 {
+		op, err := opsMgr.GetByInstance(instanceName, opID)
+		if err == nil && (op.Status == "completed" || op.Status == "failed" || op.Status == "cancelled") {
+			return
+		}
+		time.Sleep(10 * time.Millisecond)
+	}
+}
+
 // TestBackupAppOperations tests actual backup operations (start, list, delete)
 // These are different from discovery - they perform backup actions
 func TestBackupAppOperations(t *testing.T) {
@@ -1011,6 +1029,8 @@ func TestBackupAppOperations(t *testing.T) {
 		assert.Contains(t, response, "operation_id")
 		assert.Contains(t, response, "message")
 		assert.Equal(t, "backup initiated", response["message"])
+
+		waitForAsyncOp(t, api.opsMgr, "test-instance", response)
 	})

 	t.Run("BackupAppList", func(t *testing.T) {
@@ -1058,6 +1078,8 @@ func TestBackupAppOperations(t *testing.T) {
 		assert.Contains(t, response, "operation_id")
 		assert.Contains(t, response, "message")
 		assert.Equal(t, "restore initiated", response["message"])
+
+		waitForAsyncOp(t, api.opsMgr, "test-instance", response)
 	})

 	t.Run("BackupAppRestore with options", func(t *testing.T) {
@@ -1087,6 +1109,8 @@ func TestBackupAppOperations(t *testing.T) {
 		assert.Contains(t, response, "operation_id")
 		assert.Contains(t, response, "message")
 		assert.Equal(t, "restore initiated", response["message"])
+
+		waitForAsyncOp(t, api.opsMgr, "test-instance", response)
 	})

 	t.Run("BackupAppDelete", func(t *testing.T) {
@@ -1115,4 +1139,4 @@ func TestBackupAppOperations(t *testing.T) {
 			assert.Equal(t, "Backup deleted successfully", response["message"])
 		}
 	})
-}
+}
--- a/api/internal/api/v1/handlers_config.go
+++ b/api/internal/api/v1/handlers_config.go
@@ -3,6 +3,7 @@ package v1
 import (
 	"encoding/json"
 	"fmt"
+	"log/slog"
 	"net/http"

 	"github.com/wild-cloud/wild-central/daemon/internal/config"
@@ -43,6 +44,8 @@ func (api *API) ConfigUpdateBatch(w http.ResponseWriter, r *http.Request) {
 		updateCount++
 	}

+	slog.Info("config batch updated", "instance", instanceName, "keys", updateCount)
+
 	respondJSON(w, http.StatusOK, map[string]interface{}{
 		"message": "Configuration updated successfully",
 		"updated": updateCount,
@@ -87,6 +90,8 @@ func (api *API) UpdateGlobalConfig(w http.ResponseWriter, r *http.Request) {
 		return
 	}

+	slog.Info("global config updated")
+
 	respondJSON(w, http.StatusOK, map[string]interface{}{
 		"message": "Global configuration updated successfully",
 		"config":  globalCfg,
--- a/api/internal/api/v1/handlers_dnsmasq.go
+++ b/api/internal/api/v1/handlers_dnsmasq.go
@@ -3,7 +3,7 @@ package v1
 import (
 	"encoding/json"
 	"fmt"
-	"log"
+	"log/slog"
 	"net/http"
 	"os"

@@ -79,7 +79,7 @@ func (api *API) DnsmasqGenerate(w http.ResponseWriter, r *http.Request) {
 		instanceConfigPath := api.instance.GetInstanceConfigPath(name)
 		instanceCfg, err := config.LoadCloudConfig(instanceConfigPath)
 		if err != nil {
-			log.Printf("Warning: Could not load instance config for %s: %v", name, err)
+			slog.Error("failed to load instance config", "instance", name, "error", err)
 			continue
 		}
 		instanceConfigs = append(instanceConfigs, *instanceCfg)
@@ -95,7 +95,7 @@ func (api *API) DnsmasqGenerate(w http.ResponseWriter, r *http.Request) {
 		isFirstStart := err != nil || status.Status != "active"

 		// Update main dnsmasq configuration
-		log.Printf("Updating dnsmasq main configuration...")
+		slog.Info("updating dnsmasq main configuration")

 		// Write the main config
 		tempFile := api.dnsmasq.GetConfigPath() + ".tmp"
@@ -121,7 +121,7 @@ func (api *API) DnsmasqGenerate(w http.ResponseWriter, r *http.Request) {
 		// Write all instance configs
 		for i, name := range validInstanceNames {
 			if err := api.dnsmasq.WriteInstanceConfig(name, instanceConfigs[i]); err != nil {
-				log.Printf("Warning: Failed to write instance config for %s: %v", name, err)
+				slog.Error("failed to write instance DNS config", "instance", name, "error", err)
 			}
 		}

@@ -134,7 +134,7 @@ func (api *API) DnsmasqGenerate(w http.ResponseWriter, r *http.Request) {
 		// Configure system DNS to use local dnsmasq on first start
 		if isFirstStart {
 			if err := api.dnsmasq.ConfigureSystemDNS(); err != nil {
-				log.Printf("Warning: Failed to configure system DNS: %v", err)
+				slog.Error("failed to configure system DNS", "error", err)
 				// Don't fail the request - dnsmasq is still running
 			}
 		}
@@ -211,16 +211,14 @@ func (api *API) updateDnsmasqForAllInstances() error {

 	// Load all instance configs
 	var instanceConfigs []config.InstanceConfig
-	var validInstanceNames []string
 	for _, name := range instanceNames {
 		instanceConfigPath := api.instance.GetInstanceConfigPath(name)
 		instanceCfg, err := config.LoadCloudConfig(instanceConfigPath)
 		if err != nil {
-			log.Printf("Warning: Could not load instance config for %s: %v", name, err)
+			slog.Error("failed to load instance config", "instance", name, "error", err)
 			continue
 		}
 		instanceConfigs = append(instanceConfigs, *instanceCfg)
-		validInstanceNames = append(validInstanceNames, name)
 	}

 	// Regenerate and write dnsmasq config with restart
--- a/api/internal/api/v1/handlers_dnsmasq_test.go
+++ b/api/internal/api/v1/handlers_dnsmasq_test.go
@@ -42,7 +42,9 @@ func TestDnsmasqGenerate_WithoutOverwrite(t *testing.T) {
 	globalConfig.Cloud.Router.IP = "192.168.1.1"
 	configPath := filepath.Join(tmpDir, "config.yaml")
 	configData, _ := yaml.Marshal(globalConfig)
-	storage.WriteFile(configPath, configData, 0644)
+	if err := storage.WriteFile(configPath, configData, 0644); err != nil {
+		t.Fatal(err)
+	}

 	// Create test instance
 	instanceName := "test-instance"
@@ -54,7 +56,9 @@ func TestDnsmasqGenerate_WithoutOverwrite(t *testing.T) {
 	instanceConfig.Cloud.InternalDomain = "internal.test.local"
 	instanceConfigPath := api.instance.GetInstanceConfigPath(instanceName)
 	instanceConfigData, _ := yaml.Marshal(instanceConfig)
-	storage.WriteFile(instanceConfigPath, instanceConfigData, 0644)
+	if err := storage.WriteFile(instanceConfigPath, instanceConfigData, 0644); err != nil {
+		t.Fatal(err)
+	}

 	// Test generate without overwrite
 	req := httptest.NewRequest("POST", "/api/v1/dnsmasq/generate", nil)
@@ -67,7 +71,9 @@ func TestDnsmasqGenerate_WithoutOverwrite(t *testing.T) {
 	}

 	var resp map[string]interface{}
-	json.Unmarshal(w.Body.Bytes(), &resp)
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("failed to unmarshal response: %v", err)
+	}

 	// Verify response contains config
 	if config, ok := resp["config"].(string); !ok || config == "" {
@@ -90,7 +96,9 @@ func TestDnsmasqGenerate_WithOverwrite(t *testing.T) {
 	globalConfig.Cloud.Router.IP = "192.168.1.1"
 	configPath := filepath.Join(tmpDir, "config.yaml")
 	configData, _ := yaml.Marshal(globalConfig)
-	storage.WriteFile(configPath, configData, 0644)
+	if err := storage.WriteFile(configPath, configData, 0644); err != nil {
+		t.Fatal(err)
+	}

 	// Create test instance
 	instanceName := "test-instance"
@@ -103,7 +111,9 @@ func TestDnsmasqGenerate_WithOverwrite(t *testing.T) {
 	instanceConfig.Cluster.LoadBalancerIp = "192.168.1.80"
 	instanceConfigPath := api.instance.GetInstanceConfigPath(instanceName)
 	instanceConfigData, _ := yaml.Marshal(instanceConfig)
-	storage.WriteFile(instanceConfigPath, instanceConfigData, 0644)
+	if err := storage.WriteFile(instanceConfigPath, instanceConfigData, 0644); err != nil {
+		t.Fatal(err)
+	}

 	// Instead of calling the handler which would try to restart the service,
 	// directly test the UpdateConfig method with restart=false
@@ -201,8 +211,12 @@ func TestDnsmasqGetConfig(t *testing.T) {
 	// Write a config first
 	configPath := api.dnsmasq.GetConfigPath()
 	testConfig := "# Test config\ninterface=eth0\n"
-	os.MkdirAll(filepath.Dir(configPath), 0755)
-	os.WriteFile(configPath, []byte(testConfig), 0644)
+	if err := os.MkdirAll(filepath.Dir(configPath), 0755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(configPath, []byte(testConfig), 0644); err != nil {
+		t.Fatal(err)
+	}

 	req := httptest.NewRequest("GET", "/api/v1/dnsmasq/config", nil)
 	w := httptest.NewRecorder()
@@ -214,7 +228,9 @@ func TestDnsmasqGetConfig(t *testing.T) {
 	}

 	var resp map[string]interface{}
-	json.Unmarshal(w.Body.Bytes(), &resp)
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("failed to unmarshal response: %v", err)
+	}

 	content, ok := resp["content"].(string)
 	if !ok || content != testConfig {
--- a/api/internal/api/v1/handlers_node.go
+++ b/api/internal/api/v1/handlers_node.go
@@ -55,7 +55,7 @@ func (api *API) NodeDiscover(w http.ResponseWriter, r *http.Request) {
 		}
 	}

-	discoveryMgr := discovery.NewManager(api.dataDir, instanceName)
+	discoveryMgr := discovery.NewManager(api.dataDir, instanceName, api.sseManager)
 	if err := discoveryMgr.StartDiscovery(instanceName, ipList); err != nil {
 		respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to start discovery: %v", err))
 		return
@@ -72,7 +72,7 @@ func (api *API) NodeDiscover(w http.ResponseWriter, r *http.Request) {
 func (api *API) NodeDiscoveryStatus(w http.ResponseWriter, r *http.Request) {
 	instanceName := GetInstanceName(r)

-	discoveryMgr := discovery.NewManager(api.dataDir, instanceName)
+	discoveryMgr := discovery.NewManager(api.dataDir, instanceName, api.sseManager)
 	status, err := discoveryMgr.GetDiscoveryStatus(instanceName)
 	if err != nil {
 		respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to get status: %v", err))
@@ -133,12 +133,25 @@ func (api *API) NodeAdd(w http.ResponseWriter, r *http.Request) {
 		return
 	}

+	if nodeData.Version != "" {
+		if err := validateTalosVersion(nodeData.Version); err != nil {
+			respondError(w, http.StatusBadRequest, err.Error())
+			return
+		}
+	}
+
 	nodeMgr := node.NewManager(api.dataDir, instanceName)
 	if err := nodeMgr.Add(instanceName, &nodeData); err != nil {
 		respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to add node: %v", err))
 		return
 	}

+	// Remove from discovery results so UI no longer shows it as pending
+	if nodeData.CurrentIP != "" {
+		discoveryMgr := discovery.NewManager(api.dataDir, instanceName, api.sseManager)
+		discoveryMgr.RemoveDiscoveredNode(instanceName, nodeData.CurrentIP)
+	}
+
 	respondJSON(w, http.StatusCreated, map[string]interface{}{
 		"message": "Node added successfully",
 		"node":    nodeData,
@@ -298,7 +311,7 @@ func (api *API) NodeDelete(w http.ResponseWriter, r *http.Request) {
 func (api *API) NodeDiscoveryCancel(w http.ResponseWriter, r *http.Request) {
 	instanceName := GetInstanceName(r)

-	discoveryMgr := discovery.NewManager(api.dataDir, instanceName)
+	discoveryMgr := discovery.NewManager(api.dataDir, instanceName, api.sseManager)
 	if err := discoveryMgr.CancelDiscovery(instanceName); err != nil {
 		respondError(w, http.StatusBadRequest, fmt.Sprintf("Failed to cancel discovery: %v", err))
 		return
@@ -309,6 +322,38 @@ func (api *API) NodeDiscoveryCancel(w http.ResponseWriter, r *http.Request) {
 	})
 }

+// NodeReboot reboots a node without wiping state
+func (api *API) NodeReboot(w http.ResponseWriter, r *http.Request) {
+	instanceName := GetInstanceName(r)
+	nodeIdentifier := GetNodeName(r)
+
+	nodeMgr := node.NewManager(api.dataDir, instanceName)
+	if err := nodeMgr.Reboot(instanceName, nodeIdentifier); err != nil {
+		respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to reboot node: %v", err))
+		return
+	}
+
+	respondJSON(w, http.StatusOK, map[string]string{
+		"message": "Node reboot initiated",
+		"node":    nodeIdentifier,
+	})
+}
+
+// NodeHealth checks node health via Talos service statuses and dmesg
+func (api *API) NodeHealth(w http.ResponseWriter, r *http.Request) {
+	instanceName := GetInstanceName(r)
+	nodeIdentifier := GetNodeName(r)
+
+	nodeMgr := node.NewManager(api.dataDir, instanceName)
+	health, err := nodeMgr.Health(instanceName, nodeIdentifier)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to check node health: %v", err))
+		return
+	}
+
+	respondJSON(w, http.StatusOK, health)
+}
+
 // NodeReset resets a node to maintenance mode
 func (api *API) NodeReset(w http.ResponseWriter, r *http.Request) {
 	instanceName := GetInstanceName(r)
--- a/api/internal/api/v1/handlers_node_upgrade.go
+++ b/api/internal/api/v1/handlers_node_upgrade.go
@@ -28,6 +28,11 @@ func (api *API) NodeUpgrade(w http.ResponseWriter, r *http.Request) {
 		return
 	}

+	if err := validateTalosVersion(req.Version); err != nil {
+		respondError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+
 	// Get node to verify state and retrieve IP/schematic
 	nodeMgr := node.NewManager(api.dataDir, instanceName)
 	nodeData, err := nodeMgr.Get(instanceName, nodeIdentifier)
--- a/api/internal/api/v1/handlers_pxe.go
+++ b/api/internal/api/v1/handlers_pxe.go
@@ -3,7 +3,7 @@ package v1
 import (
 	"encoding/json"
 	"fmt"
-	"log"
+	"log/slog"
 	"net/http"

 	"github.com/gorilla/mux"
@@ -18,7 +18,7 @@ func (api *API) PXEListAssets(w http.ResponseWriter, r *http.Request) {
 	instanceName := GetInstanceName(r)

 	w.Header().Set("X-Deprecated", "This endpoint is deprecated. Use GET /api/v1/assets/{schematicId} instead.")
-	log.Printf("Warning: Deprecated endpoint /api/v1/instances/%s/pxe/assets called", instanceName)
+	slog.Info("deprecated endpoint called", "endpoint", "pxe/assets", "instance", instanceName)

 	// Get schematic ID from instance config
 	configPath := api.instance.GetInstanceConfigPath(instanceName)
@@ -49,7 +49,7 @@ func (api *API) PXEDownloadAsset(w http.ResponseWriter, r *http.Request) {
 	instanceName := GetInstanceName(r)

 	w.Header().Set("X-Deprecated", "This endpoint is deprecated. Use POST /api/v1/assets/{schematicId}/download instead.")
-	log.Printf("Warning: Deprecated endpoint /api/v1/instances/%s/pxe/assets/download called", instanceName)
+	slog.Info("deprecated endpoint called", "endpoint", "pxe/assets/download", "instance", instanceName)

 	// Parse request
 	var req struct {
@@ -123,7 +123,7 @@ func (api *API) PXEGetAsset(w http.ResponseWriter, r *http.Request) {
 	assetType := mux.Vars(r)["type"]

 	w.Header().Set("X-Deprecated", "This endpoint is deprecated. Use GET /api/v1/assets/{schematicId}/pxe/{assetType} instead.")
-	log.Printf("Warning: Deprecated endpoint /api/v1/instances/%s/pxe/assets/%s called", instanceName, assetType)
+	slog.Info("deprecated endpoint called", "endpoint", "pxe/assets/get", "instance", instanceName, "assetType", assetType)

 	// Get schematic ID from instance config
 	configPath := api.instance.GetInstanceConfigPath(instanceName)
@@ -162,7 +162,7 @@ func (api *API) PXEDeleteAsset(w http.ResponseWriter, r *http.Request) {
 	assetType := mux.Vars(r)["type"]

 	w.Header().Set("X-Deprecated", "This endpoint is deprecated. Use DELETE /api/v1/assets/{schematicId} instead.")
-	log.Printf("Warning: Deprecated endpoint DELETE /api/v1/instances/%s/pxe/assets/%s called", instanceName, assetType)
+	slog.Info("deprecated endpoint called", "endpoint", "pxe/assets/delete", "instance", instanceName, "assetType", assetType)

 	// Get schematic ID from instance config
 	configPath := api.instance.GetInstanceConfigPath(instanceName)
--- a/api/internal/api/v1/handlers_schematic.go
+++ b/api/internal/api/v1/handlers_schematic.go
@@ -65,6 +65,11 @@ func (api *API) SchematicUpdateInstanceSchematic(w http.ResponseWriter, r *http.
 		return
 	}

+	if err := validateTalosVersion(req.Version); err != nil {
+		respondError(w, http.StatusBadRequest, err.Error())
+		return
+	}
+
 	configPath := api.instance.GetInstanceConfigPath(instanceName)

 	// Update schematic ID in config
--- a/api/internal/api/v1/handlers_sse.go
+++ b/api/internal/api/v1/handlers_sse.go
@@ -3,7 +3,7 @@ package v1
 import (
 	"encoding/json"
 	"fmt"
-	"log"
+	"log/slog"
 	"net/http"
 	"strings"
 	"time"
@@ -52,13 +52,12 @@ func (api *API) InstanceEventStream(w http.ResponseWriter, r *http.Request) {
 	if err != nil {
 		// Default to empty string if not found - talos events will be skipped
 		nodeIP = ""
-		log.Printf("Control plane VIP not found for instance %s, Talos events will be disabled", instanceName)
+		slog.Info("control plane VIP not found, Talos events disabled", "instance", instanceName)
 	}

 	// Start watchers for this instance if not already running
 	if err := api.watcherManager.StartWatchers(instanceName, kubeconfigPath, talosconfigPath, nodeIP); err != nil {
-		log.Printf("Failed to start watchers for instance %s: %v", instanceName, err)
-		// Continue anyway - client might still receive events from other sources
+		slog.Error("failed to start watchers", "instance", instanceName, "error", err)
 	}

 	// 7. Send initial connected event
@@ -71,7 +70,7 @@ func (api *API) InstanceEventStream(w http.ResponseWriter, r *http.Request) {
 		},
 	}
 	if err := sendSSEEvent(w, connectedEvent); err != nil {
-		log.Printf("Failed to send connected event: %v", err)
+		slog.Error("failed to send SSE connected event", "instance", instanceName, "error", err)
 		return
 	}

@@ -98,7 +97,7 @@ func (api *API) InstanceEventStream(w http.ResponseWriter, r *http.Request) {
 		case event := <-client.Channel:
 			// Send event to client
 			if err := sendSSEEvent(w, event); err != nil {
-				log.Printf("Failed to send event: %v", err)
+				slog.Error("failed to send SSE event", "instance", instanceName, "error", err)
 				return
 			}

@@ -117,7 +116,7 @@ func (api *API) InstanceEventStream(w http.ResponseWriter, r *http.Request) {
 				},
 			}
 			if err := sendSSEEvent(w, heartbeatEvent); err != nil {
-				log.Printf("Failed to send heartbeat: %v", err)
+				slog.Error("failed to send SSE heartbeat", "instance", instanceName, "error", err)
 				return
 			}

@@ -190,7 +189,7 @@ func (api *API) GlobalEventStream(w http.ResponseWriter, r *http.Request) {
 		},
 	}
 	if err := sendSSEEvent(w, connectedEvent); err != nil {
-		log.Printf("Failed to send connected event: %v", err)
+		slog.Error("failed to send SSE connected event", "stream", "global", "error", err)
 		return
 	}

@@ -217,7 +216,7 @@ func (api *API) GlobalEventStream(w http.ResponseWriter, r *http.Request) {
 		case event := <-client.Channel:
 			// Send event to client
 			if err := sendSSEEvent(w, event); err != nil {
-				log.Printf("Failed to send event: %v", err)
+				slog.Error("failed to send SSE event", "stream", "global", "error", err)
 				return
 			}

@@ -236,7 +235,7 @@ func (api *API) GlobalEventStream(w http.ResponseWriter, r *http.Request) {
 				},
 			}
 			if err := sendSSEEvent(w, heartbeatEvent); err != nil {
-				log.Printf("Failed to send heartbeat: %v", err)
+				slog.Error("failed to send SSE heartbeat", "stream", "global", "error", err)
 				return
 			}

@@ -263,4 +262,4 @@ func parseQueryList(param string) []string {
 		}
 	}
 	return result
-}
+}
--- a/api/internal/api/v1/handlers_terminal_ws.go
+++ b/api/internal/api/v1/handlers_terminal_ws.go
@@ -63,11 +63,11 @@ func (api *API) TerminalWebSocket(w http.ResponseWriter, r *http.Request) {

 	ptmx, err := pty.Start(cmd)
 	if err != nil {
-		conn.WriteMessage(websocket.TextMessage, []byte("Failed to start shell: "+err.Error()))
+		_ = conn.WriteMessage(websocket.TextMessage, []byte("Failed to start shell: "+err.Error()))
 		return
 	}
 	defer ptmx.Close()
-	defer cmd.Process.Kill()
+	defer func() { _ = cmd.Process.Kill() }()

 	// Channel to signal when to stop
 	done := make(chan struct{})
@@ -103,7 +103,7 @@ func (api *API) TerminalWebSocket(w http.ResponseWriter, r *http.Request) {
 		var resize terminalResize
 		if err := json.Unmarshal(msg, &resize); err == nil && resize.Type == "resize" {
 			if resize.Cols > 0 && resize.Rows > 0 {
-				pty.Setsize(ptmx, &pty.Winsize{
+				_ = pty.Setsize(ptmx, &pty.Winsize{
 					Cols: uint16(resize.Cols),
 					Rows: uint16(resize.Rows),
 				})
--- a/api/internal/api/v1/helpers.go
+++ b/api/internal/api/v1/helpers.go
@@ -3,7 +3,7 @@ package v1
 import (
 	"fmt"
 	"io"
-	"log"
+	"log/slog"
 	"net/http"
 	"os"
 	"strings"
@@ -15,28 +15,6 @@ import (
 	"gopkg.in/yaml.v3"
 )

-// getNestedValue retrieves a value from a nested map using dot notation path.
-// For example, getNestedValue(data, "cluster.nodes.active") returns data["cluster"]["nodes"]["active"].
-func getNestedValue(data map[string]interface{}, path string) interface{} {
-	keys := strings.Split(path, ".")
-	current := data
-
-	for i, key := range keys {
-		if i == len(keys)-1 {
-			return current[key]
-		}
-
-		if next, ok := current[key].(map[string]interface{}); ok {
-			current = next
-		} else {
-			return nil
-		}
-	}
-
-	return nil
-}
-
-
 // updateYAMLFile updates a YAML file with the provided key-value pairs.
 // It performs a shallow merge at the top level, preserving unmodified keys.
 func (api *API) updateYAMLFile(w http.ResponseWriter, r *http.Request, instanceName, fileType string) {
@@ -119,26 +97,26 @@ func (api *API) updateYAMLFile(w http.ResponseWriter, r *http.Request, instanceN
 		return
 	}

+	slog.Info(fileType+" updated", "instance", instanceName)
+
 	// Update DNS if domains changed
 	if domainsChanged && fileType == "config" {
 		go func() {
-			log.Printf("Domain change detected for instance %s, updating DNS configuration...", instanceName)
+			slog.Info("domain change detected, updating DNS", "instance", instanceName)

-			// Load the full instance config
 			instanceConfigPath := api.instance.GetInstanceConfigPath(instanceName)
 			instanceCfg, err := config.LoadCloudConfig(instanceConfigPath)
 			if err != nil {
-				log.Printf("Failed to load instance config for DNS update: %v", err)
+				slog.Error("failed to load instance config for DNS update", "instance", instanceName, "error", err)
 				return
 			}

-			// Update the DNS configuration for this instance
 			if err := api.dnsmasq.UpdateInstanceDNS(instanceName, *instanceCfg); err != nil {
-				log.Printf("Failed to update DNS for instance %s: %v", instanceName, err)
+				slog.Error("failed to update DNS", "instance", instanceName, "error", err)
 				return
 			}

-			log.Printf("Successfully updated DNS configuration for instance %s", instanceName)
+			slog.Info("DNS configuration updated", "instance", instanceName)
 		}()
 	}

--- a/api/internal/api/v1/middleware.go
+++ b/api/internal/api/v1/middleware.go
@@ -2,11 +2,68 @@ package v1

 import (
 	"context"
+	"log/slog"
 	"net/http"
+	"strings"
+	"time"

 	"github.com/gorilla/mux"
 )

+// statusResponseWriter wraps http.ResponseWriter to capture the status code.
+type statusResponseWriter struct {
+	http.ResponseWriter
+	status int
+}
+
+func (w *statusResponseWriter) WriteHeader(code int) {
+	w.status = code
+	w.ResponseWriter.WriteHeader(code)
+}
+
+// RequestLoggingMiddleware logs method, path, status, and duration for each request.
+// Long-lived connections (SSE, WebSocket) are excluded.
+func RequestLoggingMiddleware(next http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		path := r.URL.Path
+
+		// Skip SSE and WebSocket endpoints (long-lived connections)
+		if strings.HasSuffix(path, "/events") || strings.HasSuffix(path, "/ws") || strings.HasSuffix(path, "/stream") {
+			next.ServeHTTP(w, r)
+			return
+		}
+
+		start := time.Now()
+		sw := &statusResponseWriter{ResponseWriter: w, status: http.StatusOK}
+		next.ServeHTTP(sw, r)
+
+		attrs := []any{
+			"status", sw.status,
+			"method", r.Method,
+			"path", path,
+			"duration", time.Since(start),
+		}
+
+		// Add route params if present
+		vars := mux.Vars(r)
+		if name := vars["name"]; name != "" {
+			attrs = append(attrs, "instance", name)
+		}
+		if app := vars["app"]; app != "" {
+			attrs = append(attrs, "app", app)
+		}
+		if node := vars["node"]; node != "" {
+			attrs = append(attrs, "node", node)
+		}
+
+		if sw.status >= 400 {
+			slog.Error("request", attrs...)
+		} else {
+			slog.Info("request", attrs...)
+		}
+	})
+}
+
 // contextKey is a type for context keys to avoid collisions.
 type contextKey string

--- a/api/internal/api/v1/requests.go
+++ b/api/internal/api/v1/requests.go
@@ -1,8 +1,22 @@
 package v1

+import (
+	"fmt"
+	"strings"
+)
+
 // Request types for API endpoints.
 // These are shared across handlers to ensure consistency and reduce duplication.

+// validateTalosVersion checks that a Talos version string has the required "v" prefix.
+// The Talos Image Factory requires version tags like "v1.13.0", not "1.13.0".
+func validateTalosVersion(version string) error {
+	if !strings.HasPrefix(version, "v") {
+		return fmt.Errorf("invalid Talos version %q: must start with 'v' (e.g. v1.13.0)", version)
+	}
+	return nil
+}
+
 // CreateInstanceRequest is the request body for creating a new instance.
 type CreateInstanceRequest struct {
 	Name string `json:"name"`
@@ -75,15 +89,15 @@ type RestoreRequest struct {

 // ScheduleCreateRequest is the request body for creating a backup schedule.
 type ScheduleCreateRequest struct {
-	Name       string                  `json:"name"`
-	TargetType string                  `json:"target_type"` // "app" or "cluster"
-	TargetName string                  `json:"target_name"`
-	Frequency  string                  `json:"frequency"` // "daily", "weekly", "monthly"
-	Time       string                  `json:"time"`      // "HH:MM"
-	DayOfWeek  int                     `json:"day_of_week,omitempty"`
-	DayOfMonth int                     `json:"day_of_month,omitempty"`
-	Retention  *ScheduleRetentionReq   `json:"retention,omitempty"`
-	Enabled    bool                    `json:"enabled"`
+	Name       string                `json:"name"`
+	TargetType string                `json:"target_type"` // "app" or "cluster"
+	TargetName string                `json:"target_name"`
+	Frequency  string                `json:"frequency"` // "daily", "weekly", "monthly"
+	Time       string                `json:"time"`      // "HH:MM"
+	DayOfWeek  int                   `json:"day_of_week,omitempty"`
+	DayOfMonth int                   `json:"day_of_month,omitempty"`
+	Retention  *ScheduleRetentionReq `json:"retention,omitempty"`
+	Enabled    bool                  `json:"enabled"`
 }

 // ScheduleRetentionReq is the retention override in a schedule request.
@@ -94,13 +108,13 @@ type ScheduleRetentionReq struct {

 // ScheduleUpdateRequest is the request body for updating a backup schedule.
 type ScheduleUpdateRequest struct {
-	Name       *string                 `json:"name,omitempty"`
-	Frequency  *string                 `json:"frequency,omitempty"`
-	Time       *string                 `json:"time,omitempty"`
-	DayOfWeek  *int                    `json:"day_of_week,omitempty"`
-	DayOfMonth *int                    `json:"day_of_month,omitempty"`
-	Retention  *ScheduleRetentionReq   `json:"retention,omitempty"`
-	Enabled    *bool                   `json:"enabled,omitempty"`
+	Name       *string               `json:"name,omitempty"`
+	Frequency  *string               `json:"frequency,omitempty"`
+	Time       *string               `json:"time,omitempty"`
+	DayOfWeek  *int                  `json:"day_of_week,omitempty"`
+	DayOfMonth *int                  `json:"day_of_month,omitempty"`
+	Retention  *ScheduleRetentionReq `json:"retention,omitempty"`
+	Enabled    *bool                 `json:"enabled,omitempty"`
 }

 // NodeUpgradeRequest is the request body for upgrading a node's Talos version.
--- a/api/internal/apps/apps.go
+++ b/api/internal/apps/apps.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
+	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -11,6 +12,7 @@ import (

 	"gopkg.in/yaml.v3"

+	wcconfig "github.com/wild-cloud/wild-central/daemon/internal/config"
 	"github.com/wild-cloud/wild-central/daemon/internal/operations"
 	"github.com/wild-cloud/wild-central/daemon/internal/secrets"
 	"github.com/wild-cloud/wild-central/daemon/internal/storage"
@@ -471,29 +473,19 @@ func fetchIngressURLs(kubeconfigPath string) map[string]string {
 	return result
 }

-
 // processSecretTemplate processes a gomplate template for secret defaults
 // This function uses named contexts for config and secrets (e.g., {{ .config.apps.loomio.db.user }}, {{ .secrets.apps.loomio.dbPassword }})
-func processSecretTemplate(template string, appName string, configFile, secretsFile string, gomplate *tools.Gomplate) (string, error) {
-	// Create merged context file with app-specific config under "app" key
-	mergedContextFile := filepath.Join(filepath.Dir(configFile), fmt.Sprintf(".merged-secrets.%s.tmp.yaml", appName))
-	defer os.Remove(mergedContextFile)
-
-	// Load root config
-	rootData, err := os.ReadFile(configFile)
-	if err != nil {
-		return "", fmt.Errorf("failed to read config file: %w", err)
-	}
-
-	var rootConfig map[string]interface{}
-	if err := yaml.Unmarshal(rootData, &rootConfig); err != nil {
-		return "", fmt.Errorf("failed to parse config: %w", err)
+func processSecretTemplate(template string, appName string, configMap map[string]interface{}, secretsFile string, gomplate *tools.Gomplate) (string, error) {
+	// Build context: start with full config, add app-specific keys
+	context := make(map[string]interface{})
+	for k, v := range configMap {
+		context[k] = v
 	}

 	// Extract app-specific config and add it under "app" key
-	if apps, ok := rootConfig["apps"].(map[string]interface{}); ok {
+	if apps, ok := context["apps"].(map[string]interface{}); ok {
 		if appConfig, ok := apps[appName].(map[string]interface{}); ok {
-			rootConfig["app"] = appConfig
+			context["app"] = appConfig
 		}
 	}

@@ -508,28 +500,18 @@ func processSecretTemplate(template string, appName string, configFile, secretsF
 		return "", fmt.Errorf("failed to parse secrets: %w", err)
 	}

-	// Extract app-specific secrets and add them under "secrets" key
 	if apps, ok := allSecrets["apps"].(map[string]interface{}); ok {
 		if appSecrets, ok := apps[appName].(map[string]interface{}); ok {
-			rootConfig["secrets"] = appSecrets
+			context["secrets"] = appSecrets
 		}
 	}

-	// Write merged config
-	mergedYAML, err := yaml.Marshal(rootConfig)
+	contextYAML, err := yaml.Marshal(context)
 	if err != nil {
-		return "", fmt.Errorf("failed to marshal merged config: %w", err)
-	}
-	if err := storage.WriteFile(mergedContextFile, mergedYAML, 0644); err != nil {
-		return "", fmt.Errorf("failed to write merged config: %w", err)
+		return "", fmt.Errorf("failed to marshal context: %w", err)
 	}

-	args := []string{
-		"-i", template,
-		"-c", fmt.Sprintf(".=%s", mergedContextFile),
-	}
-
-	compiled, err := gomplate.Exec(args...)
+	compiled, err := gomplate.RenderTemplate(template, string(contextYAML))
 	if err != nil {
 		return "", fmt.Errorf("failed to process template: %w", err)
 	}
@@ -540,7 +522,7 @@ func processSecretTemplate(template string, appName string, configFile, secretsF
 // ensureDefaultSecrets generates any missing secrets defined in a manifest.
 // Existing secrets are preserved. New secrets are either generated randomly
 // or compiled from their default template.
-func ensureDefaultSecrets(secretDefs []SecretDefinition, appName, configFile, secretsFile string) error {
+func ensureDefaultSecrets(secretDefs []SecretDefinition, appName string, configMap map[string]interface{}, secretsFile string) error {
 	secretsMgr := secrets.NewManager()
 	gomplate := tools.NewGomplate()

@@ -555,7 +537,7 @@ func ensureDefaultSecrets(secretDefs []SecretDefinition, appName, configFile, se
 		var secretValue string
 		if secretDef.Default != "" {
 			if strings.Contains(secretDef.Default, "{{") {
-				compiled, err := processSecretTemplate(secretDef.Default, appName, configFile, secretsFile, gomplate)
+				compiled, err := processSecretTemplate(secretDef.Default, appName, configMap, secretsFile, gomplate)
 				if err != nil {
 					return fmt.Errorf("failed to compile secret template for %s: %w", secretDef.Key, err)
 				}
@@ -608,6 +590,8 @@ func setNestedConfig(yq *tools.YQ, configFile, basePath string, value interface{

 // Add adds an app to the instance configuration
 func (m *Manager) Add(instanceName, appName, version string, config map[string]interface{}, requiredAppMappings map[string]string) error {
+	slog.Info("adding app", "component", "apps", "instance", instanceName, "app", appName, "version", version)
+
 	// 1. Verify app exists, optionally at a specific version
 	sourceAppDir, meta, err := m.resolveAppDir(appName, version)
 	if err != nil {
@@ -624,6 +608,12 @@ func (m *Manager) Add(instanceName, appName, version string, config map[string]i
 		return fmt.Errorf("instance config not found: %s", instanceName)
 	}

+	// Load merged config (global + instance) for template resolution
+	mergedConfig, err := wcconfig.LoadMergedInstanceConfig(m.dataDir, instanceName)
+	if err != nil {
+		return fmt.Errorf("failed to load merged config: %w", err)
+	}
+
 	// Create app directory structure
 	if err := storage.EnsureDir(appDestDir, 0755); err != nil {
 		return fmt.Errorf("failed to create app directory: %w", err)
@@ -663,14 +653,14 @@ func (m *Manager) Add(instanceName, appName, version string, config map[string]i
 		// Process config in order from manifest YAML to handle {{ .app.X }} references correctly
 		// Use the source manifest since the destination hasn't been copied yet
 		sourceManifestPath := filepath.Join(sourceAppDir, "manifest.yaml")
-		if err := processConfigInOrder(sourceManifestPath, appName, configFile); err != nil {
+		if err := processConfigInOrder(sourceManifestPath, appName, configFile, mergedConfig); err != nil {
 			return fmt.Errorf("failed to process config in order: %w", err)
 		}

 		// Apply user-provided config overrides (process templates first)
 		if len(config) > 0 {
 			gomplate := tools.NewGomplate()
-			processedConfig, err := processUserConfig(config, appName, configFile, gomplate)
+			processedConfig, err := processUserConfig(config, appName, mergedConfig, gomplate)
 			if err != nil {
 				return fmt.Errorf("failed to process user config: %w", err)
 			}
@@ -689,8 +679,12 @@ func (m *Manager) Add(instanceName, appName, version string, config map[string]i
 		return err
 	}

-	// 4. Generate required secrets
-	if err := ensureDefaultSecrets(manifest.DefaultSecrets, appName, configFile, secretsFile); err != nil {
+	// 4. Generate required secrets — reload merged config since processConfigInOrder wrote new keys
+	mergedConfig, err = wcconfig.LoadMergedInstanceConfig(m.dataDir, instanceName)
+	if err != nil {
+		return fmt.Errorf("failed to reload merged config: %w", err)
+	}
+	if err := ensureDefaultSecrets(manifest.DefaultSecrets, appName, mergedConfig, secretsFile); err != nil {
 		return err
 	}

@@ -782,11 +776,14 @@ func (m *Manager) Add(instanceName, appName, version string, config map[string]i
 		return fmt.Errorf("failed to compile app templates: %w", err)
 	}

+	slog.Info("app added", "component", "apps", "instance", instanceName, "app", appName)
 	return nil
 }

 // Deploy deploys an app to the cluster
 func (m *Manager) Deploy(instanceName, appName string, opID string, broadcaster *operations.Broadcaster) error {
+	slog.Info("deploying app", "component", "apps", "instance", instanceName, "app", appName)
+
 	kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
 	instancePath := tools.GetInstancePath(m.dataDir, instanceName)
 	secretsFile := tools.GetInstanceSecretsPath(m.dataDir, instanceName)
@@ -812,7 +809,9 @@ func (m *Manager) Deploy(instanceName, appName string, opID string, broadcaster
 	if storage.FileExists(manifestPath) {
 		manifestData, err := os.ReadFile(manifestPath)
 		if err == nil {
-			yaml.Unmarshal(manifestData, &manifest)
+			if err := yaml.Unmarshal(manifestData, &manifest); err != nil {
+				slog.Error("failed to parse manifest", "component", "apps", "path", manifestPath, "error", err)
+			}
 		}
 	}

@@ -884,7 +883,7 @@ func (m *Manager) Deploy(instanceName, appName string, opID string, broadcaster
 		for _, secretName := range wildcardSecrets {
 			if bytes.Contains(ingressContent, []byte(secretName)) {
 				if err := utilities.CopySecretBetweenNamespaces(kubeconfigPath, secretName, "cert-manager", namespace); err != nil {
-					fmt.Printf("Warning: Failed to copy TLS secret %s: %v\n", secretName, err)
+					slog.Error("failed to copy TLS secret", "component", "apps", "secret", secretName, "error", err)
 				}
 			}
 		}
@@ -1012,6 +1011,7 @@ func (m *Manager) Deploy(instanceName, appName string, opID string, broadcaster
 		}
 	}

+	slog.Info("app deployed", "component", "apps", "instance", instanceName, "app", appName, "namespace", namespace)
 	return nil
 }

@@ -1035,6 +1035,8 @@ func (m *Manager) waitForRollout(kubeconfigPath, namespace string, wait *Rollout

 // Restart performs a rolling restart of all deployments and statefulsets in an app's namespace
 func (m *Manager) Restart(instanceName, appName string) error {
+	slog.Info("restarting app", "component", "apps", "instance", instanceName, "app", appName)
+
 	kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
 	namespace := m.ResolveNamespace(instanceName, appName)

@@ -1083,6 +1085,8 @@ func (m *Manager) namespaceSharedByOtherApp(instanceName, appName, namespace str

 // Delete removes an app from the cluster and configuration
 func (m *Manager) Delete(instanceName, appName string) error {
+	slog.Info("deleting app", "component", "apps", "instance", instanceName, "app", appName)
+
 	kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
 	instancePath := tools.GetInstancePath(m.dataDir, instanceName)
 	configFile := tools.GetInstanceConfigPath(m.dataDir, instanceName)
@@ -1146,6 +1150,7 @@ func (m *Manager) Delete(instanceName, appName string) error {
 		}
 	}

+	slog.Info("app deleted", "component", "apps", "instance", instanceName, "app", appName)
 	return nil
 }

@@ -1174,8 +1179,12 @@ func (m *Manager) GetStatus(instanceName, appName string) (*DeployedApp, error)
 	manifestPath := filepath.Join(appDir, "manifest.yaml")
 	var manifest AppManifest
 	if storage.FileExists(manifestPath) {
-		manifestData, _ := os.ReadFile(manifestPath)
-		yaml.Unmarshal(manifestData, &manifest)
+		manifestData, err := os.ReadFile(manifestPath)
+		if err == nil {
+			if err := yaml.Unmarshal(manifestData, &manifest); err != nil {
+				slog.Error("failed to parse manifest", "component", "apps", "path", manifestPath, "error", err)
+			}
+		}
 		app.Version = manifest.Version
 	}

@@ -1492,6 +1501,20 @@ func (m *Manager) GetEnhancedStatus(instanceName, appName string) (*RuntimeStatu
 	return m.getRuntimeStatus(kubeconfigPath, namespace)
 }

+// GetCatalogReadme returns the README.md content for an available app from the catalog directory.
+func (m *Manager) GetCatalogReadme(appName string) ([]byte, error) {
+	if m.appsDir == "" {
+		return nil, fmt.Errorf("apps directory not configured")
+	}
+
+	appDir, _, err := m.resolveAppDir(appName, "")
+	if err != nil {
+		return nil, err
+	}
+
+	return os.ReadFile(filepath.Join(appDir, "README.md"))
+}
+
 // GetAppManifest reads and parses the manifest.yaml for an app from the apps directory
 func (m *Manager) GetAppManifest(appName string) (*AppManifest, error) {
 	if m.appsDir == "" {
@@ -1651,16 +1674,21 @@ func (m *Manager) updateFromSource(instanceName, appName, sourceDir, preserveSou
 		return fmt.Errorf("failed to backup old package: %w", err)
 	}
 	if err := os.Rename(tempDir, packageDir); err != nil {
-		os.Rename(oldPackageDir, packageDir)
+		_ = os.Rename(oldPackageDir, packageDir)
 		return fmt.Errorf("failed to update package: %w", err)
 	}

 	configFile := tools.GetInstanceConfigPath(m.dataDir, instanceName)
 	secretsFile := tools.GetInstanceSecretsPath(m.dataDir, instanceName)

+	mergedConfig, err := wcconfig.LoadMergedInstanceConfig(m.dataDir, instanceName)
+	if err != nil {
+		return fmt.Errorf("failed to load merged config: %w", err)
+	}
+
 	rollback := func() {
 		os.RemoveAll(packageDir)
-		os.Rename(oldPackageDir, packageDir)
+		_ = os.Rename(oldPackageDir, packageDir)
 	}

 	// Read the new manifest
@@ -1680,14 +1708,19 @@ func (m *Manager) updateFromSource(instanceName, appName, sourceDir, preserveSou
 	// Merge new defaultConfig keys into config.yaml (skips existing values)
 	configLock := configFile + ".lock"
 	if err := storage.WithLock(configLock, func() error {
-		return processConfigInOrder(newManifestPath, appName, configFile)
+		return processConfigInOrder(newManifestPath, appName, configFile, mergedConfig)
 	}); err != nil {
 		rollback()
 		return fmt.Errorf("failed to merge new config: %w", err)
 	}

-	// Generate any new defaultSecrets that don't exist yet
-	if err := ensureDefaultSecrets(newManifest.DefaultSecrets, appName, configFile, secretsFile); err != nil {
+	// Generate any new defaultSecrets — reload merged config since processConfigInOrder wrote new keys
+	mergedConfig, err = wcconfig.LoadMergedInstanceConfig(m.dataDir, instanceName)
+	if err != nil {
+		rollback()
+		return fmt.Errorf("failed to reload merged config: %w", err)
+	}
+	if err := ensureDefaultSecrets(newManifest.DefaultSecrets, appName, mergedConfig, secretsFile); err != nil {
 		rollback()
 		return err
 	}
@@ -1917,7 +1950,7 @@ func (m *Manager) runMigrationJobs(instanceName, appName string, jobPaths []stri
 		// Clean up the job
 		cmd = exec.Command("kubectl", "delete", "-f", jobFile, "-n", namespace, "--ignore-not-found")
 		tools.WithKubeconfig(cmd, kubeconfigPath)
-		cmd.CombinedOutput() // Best effort cleanup
+		_, _ = cmd.CombinedOutput() // Best effort cleanup
 	}

 	return nil
@@ -1925,6 +1958,8 @@ func (m *Manager) runMigrationJobs(instanceName, appName string, jobPaths []stri

 // Eject converts an app from package-managed to custom
 func (m *Manager) Eject(instanceName, appName string) error {
+	slog.Info("ejecting app to custom management", "component", "apps", "instance", instanceName, "app", appName)
+
 	instancePath := tools.GetInstancePath(m.dataDir, instanceName)
 	appDestDir := filepath.Join(instancePath, "apps", appName)
 	packageDir := filepath.Join(appDestDir, ".package")
@@ -2120,6 +2155,7 @@ func (m *Manager) Compile(instanceName, appName string) error {
 		return fmt.Errorf("app %s has no package source (custom or not installed)", appName)
 	}

+	slog.Info("compiling app templates", "component", "apps", "instance", instanceName, "app", appName)
 	return m.compileFromPackage(appName, appDestDir, packageDir, configFile, secretsFile)
 }

@@ -2214,7 +2250,9 @@ func (m *Manager) Fetch(instanceName, appName string) error {

 			manifestYAML, err := yaml.Marshal(manifest)
 			if err == nil {
-				storage.WriteFile(manifestPath, manifestYAML, 0644)
+				if err := storage.WriteFile(manifestPath, manifestYAML, 0644); err != nil {
+					slog.Error("failed to write manifest", "component", "apps", "path", manifestPath, "error", err)
+				}
 			}
 		}
 	}
@@ -2318,31 +2356,26 @@ func (m *Manager) getAppURL(kubeconfigPath, namespace string) string {

 // processUserConfig processes user-provided config values, compiling any templates
 // Reuses existing processValueNode logic by converting to YAML and back
-func processUserConfig(config map[string]interface{}, appName, configFile string, gomplate *tools.Gomplate) (map[string]interface{}, error) {
-	// Convert map to YAML bytes
-	configYAML, err := yaml.Marshal(config)
+func processUserConfig(userConfig map[string]interface{}, appName string, configMap map[string]interface{}, gomplate *tools.Gomplate) (map[string]interface{}, error) {
+	configYAML, err := yaml.Marshal(userConfig)
 	if err != nil {
 		return nil, fmt.Errorf("failed to marshal config: %w", err)
 	}

-	// Parse into yaml.Node to use existing processValueNode
 	var node yaml.Node
 	if err := yaml.Unmarshal(configYAML, &node); err != nil {
 		return nil, fmt.Errorf("failed to parse config: %w", err)
 	}

-	// Process using existing template compilation logic
-	// Note: processValueNode expects the root mapping node's content
 	if node.Kind != yaml.DocumentNode || len(node.Content) == 0 {
-		return config, nil
+		return userConfig, nil
 	}

-	processed, err := processValueNode(node.Content[0], appName, configFile, nil, gomplate)
+	processed, err := processValueNode(node.Content[0], appName, configMap, nil, gomplate)
 	if err != nil {
 		return nil, err
 	}

-	// Convert result back to map
 	result, ok := processed.(map[string]interface{})
 	if !ok {
 		return nil, fmt.Errorf("unexpected result type from processValueNode: %T", processed)
@@ -2351,47 +2384,25 @@ func processUserConfig(config map[string]interface{}, appName, configFile string
 	return result, nil
 }

-// processConfigInOrder processes config keys in the order they appear in the manifest YAML
 // processValueNode recursively processes a yaml.Node value, compiling templates in all scalar (leaf) nodes
-func processValueNode(node *yaml.Node, appName, configFile string, appContext map[string]interface{}, gomplate *tools.Gomplate) (interface{}, error) {
+func processValueNode(node *yaml.Node, appName string, configMap map[string]interface{}, appContext map[string]interface{}, gomplate *tools.Gomplate) (interface{}, error) {
 	switch node.Kind {
 	case yaml.ScalarNode:
 		value := node.Value
-		// Process templates if value contains {{
 		if strings.Contains(value, "{{") {
-			// Create merged context file
-			mergedContextFile := filepath.Join(filepath.Dir(configFile), fmt.Sprintf(".merged.%s.tmp.yaml", appName))
-			defer os.Remove(mergedContextFile)
+			// Build context: full config + app-specific under "app" key
+			context := make(map[string]interface{})
+			for k, v := range configMap {
+				context[k] = v
+			}
+			context["app"] = appContext

-			// Load root config
-			rootData, err := os.ReadFile(configFile)
+			contextYAML, err := yaml.Marshal(context)
 			if err != nil {
-				return nil, fmt.Errorf("failed to read config: %w", err)
+				return nil, fmt.Errorf("failed to marshal context: %w", err)
 			}

-			var rootConfig map[string]interface{}
-			if err := yaml.Unmarshal(rootData, &rootConfig); err != nil {
-				return nil, fmt.Errorf("failed to parse config: %w", err)
-			}
-
-			// Merge app context under the "app" key
-			rootConfig["app"] = appContext
-
-			// Write merged config
-			mergedYAML, err := yaml.Marshal(rootConfig)
-			if err != nil {
-				return nil, fmt.Errorf("failed to marshal merged config: %w", err)
-			}
-			if err := storage.WriteFile(mergedContextFile, mergedYAML, 0644); err != nil {
-				return nil, fmt.Errorf("failed to write merged config: %w", err)
-			}
-
-			// Process template with merged context
-			args := []string{
-				"-i", value,
-				"-c", fmt.Sprintf(".=%s", mergedContextFile),
-			}
-			compiled, err := gomplate.Exec(args...)
+			compiled, err := gomplate.RenderTemplate(value, string(contextYAML))
 			if err != nil {
 				return nil, fmt.Errorf("failed to compile template: %w", err)
 			}
@@ -2400,10 +2411,9 @@ func processValueNode(node *yaml.Node, appName, configFile string, appContext ma
 		return value, nil

 	case yaml.SequenceNode:
-		// Handle arrays - process each element
 		var arr []interface{}
 		for _, item := range node.Content {
-			processed, err := processValueNode(item, appName, configFile, appContext, gomplate)
+			processed, err := processValueNode(item, appName, configMap, appContext, gomplate)
 			if err != nil {
 				return nil, err
 			}
@@ -2412,14 +2422,13 @@ func processValueNode(node *yaml.Node, appName, configFile string, appContext ma
 		return arr, nil

 	case yaml.MappingNode:
-		// Handle nested objects - recursively process each key-value pair
 		result := make(map[string]interface{})
 		for i := 0; i < len(node.Content); i += 2 {
 			keyNode := node.Content[i]
 			valueNode := node.Content[i+1]

 			key := keyNode.Value
-			processed, err := processValueNode(valueNode, appName, configFile, appContext, gomplate)
+			processed, err := processValueNode(valueNode, appName, configMap, appContext, gomplate)
 			if err != nil {
 				return nil, fmt.Errorf("failed to process nested key %s: %w", key, err)
 			}
@@ -2432,14 +2441,12 @@ func processValueNode(node *yaml.Node, appName, configFile string, appContext ma
 	}
 }

-func processConfigInOrder(manifestPath string, appName string, configFile string) error {
-	// Read the manifest file directly to preserve order
+func processConfigInOrder(manifestPath string, appName string, configFile string, configMap map[string]interface{}) error {
 	manifestData, err := os.ReadFile(manifestPath)
 	if err != nil {
 		return fmt.Errorf("failed to read manifest: %w", err)
 	}

-	// Parse YAML preserving order
 	var node yaml.Node
 	if err := yaml.Unmarshal(manifestData, &node); err != nil {
 		return fmt.Errorf("failed to parse manifest YAML: %w", err)
@@ -2466,7 +2473,6 @@ func processConfigInOrder(manifestPath string, appName string, configFile string
 	// Build up app context as we process values
 	appContext := make(map[string]interface{})

-	// Process each config key in order
 	for i := 0; i < len(defaultConfigNode.Content); i += 2 {
 		keyNode := defaultConfigNode.Content[i]
 		valueNode := defaultConfigNode.Content[i+1]
@@ -2474,29 +2480,27 @@ func processConfigInOrder(manifestPath string, appName string, configFile string
 		key := keyNode.Value
 		keyPath := fmt.Sprintf(".apps.%s.%s", appName, key)

-		// Check if already exists
+		// Check if already exists in the config file (yq reads from actual file)
 		existing, _ := yq.Get(configFile, keyPath)
 		if existing != "" && existing != "null" {
-			// Parse the existing value and add to context for later references
 			var existingValue interface{}
 			if err := yaml.Unmarshal([]byte(existing), &existingValue); err == nil {
 				appContext[key] = existingValue
 			} else {
 				appContext[key] = existing
 			}
-			continue // Skip existing values
+			continue
 		}

-		// Recursively process the value node, compiling templates in all leaf nodes
-		value, err := processValueNode(valueNode, appName, configFile, appContext, gomplate)
+		// Template resolution uses merged config map (in-memory, no temp files)
+		value, err := processValueNode(valueNode, appName, configMap, appContext, gomplate)
 		if err != nil {
 			return fmt.Errorf("failed to process config key %s: %w", key, err)
 		}

-		// Add processed value to context for future references
 		appContext[key] = value

-		// Set the config value in the actual config file
+		// Write to actual config file via yq
 		if err := setNestedConfig(yq, configFile, keyPath, value); err != nil {
 			return fmt.Errorf("failed to set config %s: %w", key, err)
 		}
--- a/api/internal/apps/apps_test.go
+++ b/api/internal/apps/apps_test.go
@@ -337,8 +337,7 @@ func TestProcessSecretTemplate(t *testing.T) {
 	}
 	defer os.RemoveAll(tmpDir)

-	// Create config file
-	configFile := filepath.Join(tmpDir, "config.yaml")
+	// Build config map (replaces temp file approach)
 	configContent := `cloud:
  domain: example.com
 apps:
@@ -350,7 +349,8 @@ apps:
      user: testuser
    apiUrl: https://api.example.com
 `
-	if err := os.WriteFile(configFile, []byte(configContent), 0644); err != nil {
+	var configMap map[string]interface{}
+	if err := yaml.Unmarshal([]byte(configContent), &configMap); err != nil {
 		t.Fatal(err)
 	}

@@ -418,7 +418,7 @@ apps:

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			got, err := processSecretTemplate(tt.template, "testapp", configFile, secretsFile, gomplate)
+			got, err := processSecretTemplate(tt.template, "testapp", configMap, secretsFile, gomplate)
 			if (err != nil) != tt.wantErr {
 				t.Errorf("processSecretTemplate() error = %v, wantErr %v", err, tt.wantErr)
 				return
@@ -1342,9 +1342,14 @@ source: /apps/ejectapp
 	}

 	// Verify source was removed from manifest
-	manifestData, _ := os.ReadFile(manifestPath)
+	manifestData, err := os.ReadFile(manifestPath)
+	if err != nil {
+		t.Fatalf("failed to read manifest: %v", err)
+	}
 	var manifest AppManifest
-	yaml.Unmarshal(manifestData, &manifest)
+	if err := yaml.Unmarshal(manifestData, &manifest); err != nil {
+		t.Fatalf("failed to parse manifest: %v", err)
+	}
 	if manifest.Source != "" {
 		t.Errorf("Source should be removed from manifest after eject, got: %s", manifest.Source)
 	}
@@ -1563,10 +1568,10 @@ func TestCopyDir(t *testing.T) {

 	// Create files at various levels
 	files := map[string]string{
-		filepath.Join(srcDir, "top-level.yaml"):              "top: level",
-		filepath.Join(installDir, "install.yaml"):            "install: data",
-		filepath.Join(installDir, "nested", "deep.yaml"):     "deep: data",
-		filepath.Join(configDir, "config.yaml"):              "config: data",
+		filepath.Join(srcDir, "top-level.yaml"):          "top: level",
+		filepath.Join(installDir, "install.yaml"):        "install: data",
+		filepath.Join(installDir, "nested", "deep.yaml"): "deep: data",
+		filepath.Join(configDir, "config.yaml"):          "config: data",
 	}
 	for path, content := range files {
 		if err := os.WriteFile(path, []byte(content), 0644); err != nil {
@@ -1748,10 +1753,10 @@ deploy:

 func TestResolveDeploymentResource(t *testing.T) {
 	tests := []struct {
-		name         string
-		manifest     AppManifest
-		wantName     string
-		wantKind     string
+		name     string
+		manifest AppManifest
+		wantName string
+		wantKind string
 	}{
 		{
 			name:     "no deployment info",
@@ -1845,7 +1850,9 @@ func TestIsConfigOnly(t *testing.T) {
 		t.Run(tt.name, func(t *testing.T) {
 			appDir := t.TempDir()
 			for _, f := range tt.files {
-				os.WriteFile(filepath.Join(appDir, f), []byte("test"), 0644)
+				if err := os.WriteFile(filepath.Join(appDir, f), []byte("test"), 0644); err != nil {
+					t.Fatalf("failed to write test file: %v", err)
+				}
 			}
 			if got := isConfigOnly(appDir); got != tt.want {
 				t.Errorf("isConfigOnly() = %v, want %v", got, tt.want)
--- a/api/internal/apps/drift_test.go
+++ b/api/internal/apps/drift_test.go
@@ -38,14 +38,20 @@ func TestFilesDiffer(t *testing.T) {
 	}
 	defer os.RemoveAll(tmpDir)

+	mustWrite := func(path string, data []byte) {
+		t.Helper()
+		if err := os.WriteFile(path, data, 0644); err != nil {
+			t.Fatal(err)
+		}
+	}
 	fileA := filepath.Join(tmpDir, "a.txt")
 	fileB := filepath.Join(tmpDir, "b.txt")
 	fileC := filepath.Join(tmpDir, "c.txt")
 	fileMissing := filepath.Join(tmpDir, "missing.txt")

-	os.WriteFile(fileA, []byte("hello"), 0644)
-	os.WriteFile(fileB, []byte("hello"), 0644)
-	os.WriteFile(fileC, []byte("world"), 0644)
+	mustWrite(fileA, []byte("hello"))
+	mustWrite(fileB, []byte("hello"))
+	mustWrite(fileC, []byte("world"))

 	t.Run("identical files", func(t *testing.T) {
 		if filesDiffer(fileA, fileB) {
@@ -79,13 +85,26 @@ func TestDirsDiffer(t *testing.T) {
 	}
 	defer os.RemoveAll(tmpDir)

+	mustMkdir := func(path string) {
+		t.Helper()
+		if err := os.MkdirAll(path, 0755); err != nil {
+			t.Fatal(err)
+		}
+	}
+	mustWriteFile := func(path string, data []byte) {
+		t.Helper()
+		if err := os.WriteFile(path, data, 0644); err != nil {
+			t.Fatal(err)
+		}
+	}
+
 	// Create two identical directories
 	dirA := filepath.Join(tmpDir, "a")
 	dirB := filepath.Join(tmpDir, "b")
-	os.MkdirAll(dirA, 0755)
-	os.MkdirAll(dirB, 0755)
-	os.WriteFile(filepath.Join(dirA, "file.txt"), []byte("same"), 0644)
-	os.WriteFile(filepath.Join(dirB, "file.txt"), []byte("same"), 0644)
+	mustMkdir(dirA)
+	mustMkdir(dirB)
+	mustWriteFile(filepath.Join(dirA, "file.txt"), []byte("same"))
+	mustWriteFile(filepath.Join(dirB, "file.txt"), []byte("same"))

 	t.Run("identical directories", func(t *testing.T) {
 		if dirsDiffer(dirA, dirB) {
@@ -95,8 +114,8 @@ func TestDirsDiffer(t *testing.T) {

 	// Create a directory with different content
 	dirC := filepath.Join(tmpDir, "c")
-	os.MkdirAll(dirC, 0755)
-	os.WriteFile(filepath.Join(dirC, "file.txt"), []byte("different"), 0644)
+	mustMkdir(dirC)
+	mustWriteFile(filepath.Join(dirC, "file.txt"), []byte("different"))

 	t.Run("different content", func(t *testing.T) {
 		if !dirsDiffer(dirA, dirC) {
@@ -106,9 +125,9 @@ func TestDirsDiffer(t *testing.T) {

 	// Directory with extra file
 	dirD := filepath.Join(tmpDir, "d")
-	os.MkdirAll(dirD, 0755)
-	os.WriteFile(filepath.Join(dirD, "file.txt"), []byte("same"), 0644)
-	os.WriteFile(filepath.Join(dirD, "extra.txt"), []byte("extra"), 0644)
+	mustMkdir(dirD)
+	mustWriteFile(filepath.Join(dirD, "file.txt"), []byte("same"))
+	mustWriteFile(filepath.Join(dirD, "extra.txt"), []byte("extra"))

 	t.Run("extra file in second", func(t *testing.T) {
 		if !dirsDiffer(dirA, dirD) {
@@ -126,14 +145,20 @@ func TestCheckSourceDrift_NoDrift(t *testing.T) {

 	// Create source directory with manifest
 	sourceDir := filepath.Join(tmpDir, "source", "myapp")
-	os.MkdirAll(sourceDir, 0755)
+	if err := os.MkdirAll(sourceDir, 0755); err != nil {
+		t.Fatal(err)
+	}
 	sourceManifest := AppManifest{Version: "1.0.0"}
 	data, _ := yaml.Marshal(sourceManifest)
-	os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644)
+	if err := os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644); err != nil {
+		t.Fatal(err)
+	}

 	// Create package dir (it exists)
 	packageDir := filepath.Join(tmpDir, "package")
-	os.MkdirAll(packageDir, 0755)
+	if err := os.MkdirAll(packageDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	// Installed manifest with same version
 	manifest := &AppManifest{
@@ -157,14 +182,20 @@ func TestCheckSourceDrift_VersionDrift(t *testing.T) {

 	// Create source directory with newer version
 	sourceDir := filepath.Join(tmpDir, "source", "myapp")
-	os.MkdirAll(sourceDir, 0755)
+	if err := os.MkdirAll(sourceDir, 0755); err != nil {
+		t.Fatal(err)
+	}
 	sourceManifest := AppManifest{Version: "2.0.0"}
 	data, _ := yaml.Marshal(sourceManifest)
-	os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644)
+	if err := os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644); err != nil {
+		t.Fatal(err)
+	}

 	// Create package dir
 	packageDir := filepath.Join(tmpDir, "package")
-	os.MkdirAll(packageDir, 0755)
+	if err := os.MkdirAll(packageDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	// Installed manifest with older version
 	manifest := &AppManifest{
@@ -174,11 +205,8 @@ func TestCheckSourceDrift_VersionDrift(t *testing.T) {

 	m := &Manager{}
 	result := m.checkSourceDrift(manifest, packageDir, "myapp")
-	if result == nil {
-		t.Fatal("expected drift, got nil")
-	}
-	if !result.Drifted {
-		t.Error("expected Drifted to be true")
+	if result == nil || !result.Drifted {
+		t.Fatal("expected drift result with Drifted=true, got nil or false")
 	}
 	if result.CurrentVersion != "1.0.0" {
 		t.Errorf("expected CurrentVersion '1.0.0', got %q", result.CurrentVersion)
@@ -210,7 +238,9 @@ func TestCheckSourceDrift_PackageMissing(t *testing.T) {

 	// Source exists but .package/ does not
 	sourceDir := filepath.Join(tmpDir, "source", "myapp")
-	os.MkdirAll(sourceDir, 0755)
+	if err := os.MkdirAll(sourceDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	manifest := &AppManifest{
 		Version: "1.0.0",
@@ -221,11 +251,8 @@ func TestCheckSourceDrift_PackageMissing(t *testing.T) {

 	m := &Manager{}
 	result := m.checkSourceDrift(manifest, packageDir, "myapp")
-	if result == nil {
-		t.Fatal("expected drift for missing package dir, got nil")
-	}
-	if !result.Drifted {
-		t.Error("expected Drifted to be true")
+	if result == nil || !result.Drifted {
+		t.Fatal("expected drift result with Drifted=true for missing package dir")
 	}
 }

@@ -270,17 +297,23 @@ func TestComputeDrift_NotDeployed(t *testing.T) {

 	// Source-managed app that is only "added" (not deployed)
 	sourceDir := filepath.Join(tmpDir, "source")
-	os.MkdirAll(sourceDir, 0755)
+	if err := os.MkdirAll(sourceDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	// Source manifest with newer version
 	sourceManifest := AppManifest{Version: "2.0.0"}
 	data, _ := yaml.Marshal(sourceManifest)
-	os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644)
+	if err := os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644); err != nil {
+		t.Fatal(err)
+	}

 	// App directory with .package
 	appDir := filepath.Join(tmpDir, "app")
 	packageDir := filepath.Join(appDir, ".package")
-	os.MkdirAll(packageDir, 0755)
+	if err := os.MkdirAll(packageDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	manifest := &AppManifest{
 		Version: "1.0.0",
@@ -290,13 +323,8 @@ func TestComputeDrift_NotDeployed(t *testing.T) {
 	m := &Manager{}
 	result := m.computeDrift("test-instance", "myapp", appDir, "", "added", manifest)

-	if result == nil {
-		t.Fatal("expected drift info, got nil")
-	}
-
-	// Should have source drift (version mismatch)
-	if result.Source == nil || !result.Source.Drifted {
-		t.Error("expected source drift for version mismatch")
+	if result == nil || result.Source == nil || !result.Source.Drifted {
+		t.Fatal("expected drift info with source drift for version mismatch")
 	}

 	// Should NOT have deploy drift (status is "added")
--- a/api/internal/apps/infrastructure.go
+++ b/api/internal/apps/infrastructure.go
@@ -2,7 +2,7 @@ package apps

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"os"
 	"path/filepath"

@@ -110,14 +110,14 @@ func (m *Manager) DeployInfrastructure(instanceName, opID string, broadcaster *o
 	for i, pkg := range packages {
 		// Skip if already added and deployed
 		if m.isDeployed(instanceName, pkg.Name) {
-			log.Printf("[infrastructure] %s already deployed, skipping", pkg.Name)
+			slog.Info("already deployed, skipping", "component", "infrastructure", "package", pkg.Name)
 			if broadcaster != nil {
 				broadcaster.Publish(opID, []byte(fmt.Sprintf("Skipping %s (already deployed)\n", pkg.Name)))
 			}
 			continue
 		}

-		log.Printf("[infrastructure] Installing %s (%d/%d)", pkg.Name, i+1, total)
+		slog.Info("installing package", "component", "infrastructure", "package", pkg.Name, "progress", fmt.Sprintf("%d/%d", i+1, total))
 		if broadcaster != nil {
 			broadcaster.Publish(opID, []byte(fmt.Sprintf("Installing %s (%d/%d)...\n", pkg.Name, i+1, total)))
 		}
--- a/api/internal/apps/infrastructure_test.go
+++ b/api/internal/apps/infrastructure_test.go
@@ -135,8 +135,11 @@ func TestInfrastructureOrder(t *testing.T) {
 	tmpDir := t.TempDir()

 	writeManifest := func(name, category string, requires []string) {
+		t.Helper()
 		dir := filepath.Join(tmpDir, name)
-		os.MkdirAll(dir, 0755)
+		if err := os.MkdirAll(dir, 0755); err != nil {
+			t.Fatal(err)
+		}

 		content := "name: " + name + "\ncategory: " + category + "\n"
 		if len(requires) > 0 {
@@ -145,7 +148,9 @@ func TestInfrastructureOrder(t *testing.T) {
 				content += "  - name: " + r + "\n"
 			}
 		}
-		os.WriteFile(filepath.Join(dir, "manifest.yaml"), []byte(content), 0644)
+		if err := os.WriteFile(filepath.Join(dir, "manifest.yaml"), []byte(content), 0644); err != nil {
+			t.Fatal(err)
+		}
 	}

 	// Create infrastructure packages
--- a/api/internal/apps/models.go
+++ b/api/internal/apps/models.go
@@ -14,24 +14,30 @@ type ConfigItem struct {
 	Value interface{} `json:"value" yaml:"value"`
 }

+// ManifestBackupConfig declares backup behavior for an app
+type ManifestBackupConfig struct {
+	RestoreMode string `json:"restoreMode,omitempty" yaml:"restoreMode,omitempty"` // "in-place" or "" (default: standby blue/green)
+}
+
 // AppManifest represents the complete app manifest from manifest.yaml
 type AppManifest struct {
-	Name             string                 `json:"name" yaml:"name"`
-	Is               string                 `json:"is,omitempty" yaml:"is,omitempty"` // The original app type (e.g., "postgres" even if named "postgres-primary")
-	Description      string                 `json:"description" yaml:"description"`
-	Version          string                 `json:"version" yaml:"version"`
-	Icon             string                 `json:"icon,omitempty" yaml:"icon,omitempty"`
-	Category         string                 `json:"category,omitempty" yaml:"category,omitempty"`
-	Namespace        string                 `json:"namespace,omitempty" yaml:"namespace,omitempty"`
-	DeploymentName   string                 `json:"deploymentName,omitempty" yaml:"deploymentName,omitempty"`
-	Requires         []AppDependency        `json:"requires,omitempty" yaml:"requires,omitempty"`
-	DefaultConfig    map[string]interface{} `json:"defaultConfig,omitempty" yaml:"defaultConfig,omitempty"`
-	DefaultSecrets   []SecretDefinition     `json:"defaultSecrets,omitempty" yaml:"defaultSecrets,omitempty"`
-	RequiredSecrets  []string               `json:"requiredSecrets,omitempty" yaml:"requiredSecrets,omitempty"`
-	Source           string                 `json:"source,omitempty" yaml:"source,omitempty"`
-	Scripts          []Script               `json:"scripts,omitempty" yaml:"scripts,omitempty"`
-	Deploy           *DeployConfig          `json:"deploy,omitempty" yaml:"deploy,omitempty"`
-	Upgrade          *UpgradeConfig         `json:"upgrade,omitempty" yaml:"upgrade,omitempty"`
+	Name            string                 `json:"name" yaml:"name"`
+	Is              string                 `json:"is,omitempty" yaml:"is,omitempty"` // The original app type (e.g., "postgres" even if named "postgres-primary")
+	Description     string                 `json:"description" yaml:"description"`
+	Version         string                 `json:"version" yaml:"version"`
+	Icon            string                 `json:"icon,omitempty" yaml:"icon,omitempty"`
+	Category        string                 `json:"category,omitempty" yaml:"category,omitempty"`
+	Namespace       string                 `json:"namespace,omitempty" yaml:"namespace,omitempty"`
+	DeploymentName  string                 `json:"deploymentName,omitempty" yaml:"deploymentName,omitempty"`
+	Requires        []AppDependency        `json:"requires,omitempty" yaml:"requires,omitempty"`
+	DefaultConfig   map[string]interface{} `json:"defaultConfig,omitempty" yaml:"defaultConfig,omitempty"`
+	DefaultSecrets  []SecretDefinition     `json:"defaultSecrets,omitempty" yaml:"defaultSecrets,omitempty"`
+	RequiredSecrets []string               `json:"requiredSecrets,omitempty" yaml:"requiredSecrets,omitempty"`
+	Source          string                 `json:"source,omitempty" yaml:"source,omitempty"`
+	Scripts         []Script               `json:"scripts,omitempty" yaml:"scripts,omitempty"`
+	Deploy          *DeployConfig          `json:"deploy,omitempty" yaml:"deploy,omitempty"`
+	Upgrade         *UpgradeConfig         `json:"upgrade,omitempty" yaml:"upgrade,omitempty"`
+	Backup          *ManifestBackupConfig  `json:"backup,omitempty" yaml:"backup,omitempty"`
 }

 // DeployConfig declares deployment behavior in the manifest, replacing install.sh scripts
@@ -54,7 +60,7 @@ type DeployPhase struct {
 type CreateSecret struct {
 	Name      string            `json:"name" yaml:"name"`
 	Namespace string            `json:"namespace,omitempty" yaml:"namespace,omitempty"` // target namespace (defaults to app namespace)
-	Entries   map[string]string `json:"entries" yaml:"entries"`                        // k8s secret key -> secrets.yaml path
+	Entries   map[string]string `json:"entries" yaml:"entries"`                         // k8s secret key -> secrets.yaml path
 }

 // CRDInstall describes CRDs to apply from a URL before deployment
@@ -138,13 +144,13 @@ type UpgradeConfig struct {
 	From             []UpgradeFromRule `json:"from,omitempty" yaml:"from,omitempty"`
 	PreUpgrade       *PreUpgradeConfig `json:"preUpgrade,omitempty" yaml:"preUpgrade,omitempty"`
 	Migrations       *MigrationConfig  `json:"migrations,omitempty" yaml:"migrations,omitempty"`
-	ConfigMigrations map[string]string  `json:"configMigrations,omitempty" yaml:"configMigrations,omitempty"`
+	ConfigMigrations map[string]string `json:"configMigrations,omitempty" yaml:"configMigrations,omitempty"`
 }

 // UpgradeFromRule defines a version constraint and optional upgrade path
 type UpgradeFromRule struct {
-	Version string `json:"version" yaml:"version"`                        // e.g. ">=1.23.0", "<1.21.0", ">0"
-	Via     string `json:"via,omitempty" yaml:"via,omitempty"`             // waypoint version in versions/
+	Version string `json:"version" yaml:"version"`             // e.g. ">=1.23.0", "<1.21.0", ">0"
+	Via     string `json:"via,omitempty" yaml:"via,omitempty"` // waypoint version in versions/
 	Blocked bool   `json:"blocked,omitempty" yaml:"blocked,omitempty"`
 	Notes   string `json:"notes,omitempty" yaml:"notes,omitempty"`
 }
@@ -157,7 +163,7 @@ type PreUpgradeConfig struct {

 // MigrationConfig defines pre/post-deploy migration jobs for a version transition
 type MigrationConfig struct {
-	Pre  []string `json:"pre,omitempty" yaml:"pre,omitempty"`   // paths to K8s Job YAMLs relative to app dir
+	Pre  []string `json:"pre,omitempty" yaml:"pre,omitempty"` // paths to K8s Job YAMLs relative to app dir
 	Post []string `json:"post,omitempty" yaml:"post,omitempty"`
 }

--- a/api/internal/apps/upgrade.go
+++ b/api/internal/apps/upgrade.go
@@ -43,7 +43,7 @@ func ParseAppVersion(v string) (major, minor, patch, revision int) {
 		}
 	}

-	fmt.Sscanf(upstream, "%d.%d.%d", &major, &minor, &patch)
+	_, _ = fmt.Sscanf(upstream, "%d.%d.%d", &major, &minor, &patch)
 	return
 }

--- a/api/internal/apps/upgrade_test.go
+++ b/api/internal/apps/upgrade_test.go
@@ -10,7 +10,7 @@ import (

 func TestParseAppVersion(t *testing.T) {
 	tests := []struct {
-		input                          string
+		input                         string
 		major, minor, patch, revision int
 	}{
 		{"1.24.3-1", 1, 24, 3, 1},
@@ -347,9 +347,9 @@ func TestComputeUpgradePlan_MultipleWaypoints(t *testing.T) {
 		Latest: "4",
 		Upgrade: &UpgradeConfig{
 			From: []UpgradeFromRule{
-				{Version: ">=3.0.0"},               // direct from 3.x
-				{Version: ">=2.0.0", Via: "3"},     // 2.x must go through slot "3"
-				{Version: ">=1.0.0", Via: "2"},     // 1.x must go through slot "2"
+				{Version: ">=3.0.0"},           // direct from 3.x
+				{Version: ">=2.0.0", Via: "3"}, // 2.x must go through slot "3"
+				{Version: ">=1.0.0", Via: "2"}, // 1.x must go through slot "2"
 			},
 		},
 	})
@@ -524,7 +524,7 @@ func TestComputeUpgradePlan_RuleOrdering(t *testing.T) {
 		Latest: "3",
 		Upgrade: &UpgradeConfig{
 			From: []UpgradeFromRule{
-				{Version: ">=2.0.0"},                                            // direct for 2.x+
+				{Version: ">=2.0.0"}, // direct for 2.x+
 				{Version: ">=1.0.0", Blocked: true, Notes: "must be on 2.x+"}, // block for 1.x
 			},
 		},
--- a/api/internal/assets/assets.go
+++ b/api/internal/assets/assets.go
@@ -1,7 +1,6 @@
 package assets

 import (
-	"crypto/sha256"
 	"fmt"
 	"io"
 	"net/http"
@@ -167,10 +166,6 @@ func (m *Manager) listAssetFiles(schematicID, version string) ([]Asset, error) {

 		if err == nil && info != nil {
 			asset.Size = info.Size()
-			// Calculate SHA256 if file exists
-			if hash, err := calculateSHA256(assetPath); err == nil {
-				asset.SHA256 = hash
-			}
 		}

 		assets = append(assets, asset)
@@ -191,10 +186,6 @@ func (m *Manager) listAssetFiles(schematicID, version string) ([]Asset, error) {

 			if err == nil && info != nil {
 				asset.Size = info.Size()
-				// Calculate SHA256 if file exists
-				if hash, err := calculateSHA256(isoPath); err == nil {
-					asset.SHA256 = hash
-				}
 			}

 			assets = append(assets, asset)
@@ -430,19 +421,3 @@ func (m *Manager) DeleteAsset(schematicID, version string) error {

 	return os.RemoveAll(assetDir)
 }
-
-// calculateSHA256 computes the SHA256 hash of a file
-func calculateSHA256(filePath string) (string, error) {
-	file, err := os.Open(filePath)
-	if err != nil {
-		return "", err
-	}
-	defer file.Close()
-
-	hash := sha256.New()
-	if _, err := io.Copy(hash, file); err != nil {
-		return "", err
-	}
-
-	return fmt.Sprintf("%x", hash.Sum(nil)), nil
-}
--- a/api/internal/backup/backup.go
+++ b/api/internal/backup/backup.go
@@ -2,9 +2,13 @@
 package backup

 import (
+	"archive/tar"
 	"bytes"
+	"compress/gzip"
 	"encoding/json"
 	"fmt"
+	"io"
+	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -184,11 +188,18 @@ func (m *Manager) BackupApp(instanceName, appName string) (*RecoveryPlan, error)
 		activeNamespace = btypes.ColoredName(appName, activeColor)
 	}

+	restoreMode := ""
+	if manifest.Backup != nil && manifest.Backup.RestoreMode == "in-place" {
+		restoreMode = "in-place"
+	}
+
 	plan := &RecoveryPlan{
-		App:       appName,
-		Instance:  instanceName,
-		Timestamp: timestamp,
-		Status:    "backing_up",
+		App:         appName,
+		Instance:    instanceName,
+		Timestamp:   timestamp,
+		Version:     manifest.Version,
+		Status:      "backing_up",
+		RestoreMode: restoreMode,
 		Source: btypes.RecoverySource{
 			ActiveColor: activeColor,
 			Namespace:   activeNamespace,
@@ -275,18 +286,26 @@ func (m *Manager) RestoreApp(instanceName, appName string, opts RestoreOptions)

 	// Compute standby targets
 	plan.Status = "restoring"
-	standbyNamespace := btypes.ColoredName(appName, plan.StandbyColor)
-	standbyAppDir := filepath.Join("instances", instanceName, "apps", standbyNamespace)

-	plan.Standby = btypes.RecoveryStandby{
-		Namespace: standbyNamespace,
-		AppDir:    standbyAppDir,
+	if plan.RestoreMode == "in-place" {
+		// In-place: restore data to original namespace, no colored standby
+		plan.Standby = btypes.RecoveryStandby{
+			Namespace: plan.Source.Namespace,
+			AppDir:    plan.Source.AppDir,
+		}
+	} else {
+		standbyNamespace := btypes.ColoredName(appName, plan.StandbyColor)
+		standbyAppDir := filepath.Join("instances", instanceName, "apps", standbyNamespace)
+		plan.Standby = btypes.RecoveryStandby{
+			Namespace: standbyNamespace,
+			AppDir:    standbyAppDir,
+		}
 	}

 	now := time.Now()
 	plan.Phases["restore"] = PhaseTime{StartedAt: &now}

-	m.reportProgress(40, fmt.Sprintf("Restoring to %s namespace", standbyNamespace))
+	m.reportProgress(40, fmt.Sprintf("Restoring to %s namespace", plan.Standby.Namespace))

 	progressStart := 40
 	progressEnd := 80
@@ -315,18 +334,21 @@ func (m *Manager) RestoreApp(instanceName, appName string, opts RestoreOptions)
 		if err := strategy.Restore(plan, m.destination); err != nil {
 			plan.Status = "failed"
 			plan.Error = fmt.Sprintf("%s restore failed: %v", entry.Name, err)
-			m.savePlan(instanceName, appName, plan.Timestamp, plan)
+			_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
 			return plan, fmt.Errorf("failed to restore %s: %w", entry.Name, err)
 		}
 	}

-	// Deploy standby namespace
-	m.reportProgress(85, "Deploying app to standby namespace")
-	if err := m.deployToStandbyNamespace(instanceName, appName, plan); err != nil {
-		plan.Status = "failed"
-		plan.Error = fmt.Sprintf("deploy to standby failed: %v", err)
-		m.savePlan(instanceName, appName, plan.Timestamp, plan)
-		return plan, fmt.Errorf("failed to deploy to standby namespace: %w", err)
+	// For standby mode: deploy app to the colored standby namespace.
+	// For in-place mode: skip — the Longhorn Switch phase handles the data swap.
+	if plan.RestoreMode != "in-place" {
+		m.reportProgress(85, "Deploying app to standby namespace")
+		if err := m.deployToStandbyNamespace(instanceName, appName, plan); err != nil {
+			plan.Status = "failed"
+			plan.Error = fmt.Sprintf("deploy to standby failed: %v", err)
+			_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
+			return plan, fmt.Errorf("failed to deploy to standby namespace: %w", err)
+		}
 	}

 	plan.Status = "restored"
@@ -335,7 +357,7 @@ func (m *Manager) RestoreApp(instanceName, appName string, opts RestoreOptions)
 	phase.CompletedAt = &completed
 	plan.Phases["restore"] = phase

-	m.savePlan(instanceName, appName, plan.Timestamp, plan)
+	_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
 	m.reportProgress(100, "Restore completed")
 	return plan, nil
 }
@@ -370,7 +392,7 @@ func (m *Manager) SwitchApp(instanceName, appName string) (*RecoveryPlan, error)
 		if err := strategy.Switch(plan); err != nil {
 			plan.Status = "failed"
 			plan.Error = fmt.Sprintf("%s switch failed: %v", entry.Name, err)
-			m.savePlan(instanceName, appName, plan.Timestamp, plan)
+			_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
 			return plan, fmt.Errorf("failed to switch %s: %w", entry.Name, err)
 		}
 	}
@@ -380,7 +402,7 @@ func (m *Manager) SwitchApp(instanceName, appName string) (*RecoveryPlan, error)
 	if err := m.setActiveDeployment(instanceName, appName, plan.StandbyColor); err != nil {
 		plan.Status = "failed"
 		plan.Error = fmt.Sprintf("failed to update activeDeployment: %v", err)
-		m.savePlan(instanceName, appName, plan.Timestamp, plan)
+		_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
 		return plan, fmt.Errorf("failed to update activeDeployment: %w", err)
 	}

@@ -390,7 +412,7 @@ func (m *Manager) SwitchApp(instanceName, appName string) (*RecoveryPlan, error)
 	phase.CompletedAt = &completed
 	plan.Phases["switch"] = phase

-	m.savePlan(instanceName, appName, plan.Timestamp, plan)
+	_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
 	m.reportProgress(100, "Switch completed")
 	return plan, nil
 }
@@ -425,42 +447,46 @@ func (m *Manager) CleanupApp(instanceName, appName string) (*RecoveryPlan, error
 		if err := strategy.Cleanup(plan); err != nil {
 			plan.Status = "failed"
 			plan.Error = fmt.Sprintf("%s cleanup failed: %v", entry.Name, err)
-			m.savePlan(instanceName, appName, plan.Timestamp, plan)
+			_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
 			return plan, fmt.Errorf("failed to cleanup %s: %w", entry.Name, err)
 		}
 	}

-	// Remove previous active namespace
-	m.reportProgress(80, "Removing previous namespace")
-	previousNamespace := plan.Source.Namespace
-	kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
-	if previousNamespace != "" && previousNamespace != appName {
-		// Delete colored namespaces entirely
-		deleteCmd := exec.Command("kubectl", "delete", "namespace", previousNamespace, "--ignore-not-found", "--timeout=30s")
-		tools.WithKubeconfig(deleteCmd, kubeconfigPath)
-		if output, err := deleteCmd.CombinedOutput(); err != nil {
-			fmt.Printf("Warning: failed to delete previous namespace %s: %v, output: %s\n", previousNamespace, err, output)
+	// For standby mode: remove the previous active namespace and app directory.
+	// For in-place mode: the Longhorn Switch phase already cleaned up the old PVC/PV/volume;
+	// the service's namespace is the same throughout, so nothing to delete here.
+	if plan.RestoreMode != "in-place" {
+		m.reportProgress(80, "Removing previous namespace")
+		previousNamespace := plan.Source.Namespace
+		kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
+		if previousNamespace != "" && previousNamespace != appName {
+			// Delete colored namespaces entirely
+			deleteCmd := exec.Command("kubectl", "delete", "namespace", previousNamespace, "--ignore-not-found", "--timeout=30s")
+			tools.WithKubeconfig(deleteCmd, kubeconfigPath)
+			if output, err := deleteCmd.CombinedOutput(); err != nil {
+				slog.Error("failed to delete previous namespace", "component", "backup", "namespace", previousNamespace, "error", err, "output", string(output))
+			}
+		} else if previousNamespace == appName {
+			// For the bare namespace (first restore), scale deployments to zero
+			// instead of deleting — keeps the namespace for future non-restore deploys
+			scaleCmd := exec.Command("kubectl", "scale", "deployment", "--all", "--replicas=0", "-n", previousNamespace)
+			tools.WithKubeconfig(scaleCmd, kubeconfigPath)
+			if output, err := scaleCmd.CombinedOutput(); err != nil {
+				slog.Error("failed to scale down previous deployments", "component", "backup", "namespace", previousNamespace, "error", err, "output", string(output))
+			}
 		}
-	} else if previousNamespace == appName {
-		// For the bare namespace (first restore), scale deployments to zero
-		// instead of deleting — keeps the namespace for future non-restore deploys
-		scaleCmd := exec.Command("kubectl", "scale", "deployment", "--all", "--replicas=0", "-n", previousNamespace)
-		tools.WithKubeconfig(scaleCmd, kubeconfigPath)
-		if output, err := scaleCmd.CombinedOutput(); err != nil {
-			fmt.Printf("Warning: failed to scale down previous deployments in %s: %v, output: %s\n", previousNamespace, err, output)
-		}
-	}

-	// Remove previous active app directory
-	previousAppDir := plan.Source.AppDir
-	if previousAppDir != "" {
-		absPath := previousAppDir
-		if !filepath.IsAbs(absPath) {
-			absPath = filepath.Join(m.dataDir, absPath)
-		}
-		// Only remove if it's a colored directory (not the bare app dir)
-		if strings.Contains(filepath.Base(absPath), "-") {
-			os.RemoveAll(absPath)
+		// Remove previous active app directory
+		previousAppDir := plan.Source.AppDir
+		if previousAppDir != "" {
+			absPath := previousAppDir
+			if !filepath.IsAbs(absPath) {
+				absPath = filepath.Join(m.dataDir, absPath)
+			}
+			// Only remove if it's a colored directory (not the bare app dir)
+			if strings.Contains(filepath.Base(absPath), "-") {
+				os.RemoveAll(absPath)
+			}
 		}
 	}

@@ -470,7 +496,7 @@ func (m *Manager) CleanupApp(instanceName, appName string) (*RecoveryPlan, error
 	phase.CompletedAt = &completed
 	plan.Phases["cleanup"] = phase

-	m.savePlan(instanceName, appName, plan.Timestamp, plan)
+	_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
 	m.reportProgress(100, "Cleanup completed")
 	return plan, nil
 }
@@ -596,10 +622,10 @@ func (m *Manager) deployToStandbyNamespace(instanceName, appName string, plan *R

 	// Create secrets from secrets.yaml (source of truth) in the standby namespace
 	if err := m.deploySecretsToNamespace(instanceName, appName, standbyNamespace, kubeconfigPath); err != nil {
-		fmt.Printf("Warning: failed to deploy secrets to standby namespace: %v\n", err)
+		slog.Error("failed to deploy secrets to standby namespace", "component", "backup", "instance", instanceName, "app", appName, "error", err)
 	}

-	fmt.Printf("Successfully deployed app to standby namespace: %s\n", standbyNamespace)
+	slog.Info("deployed app to standby namespace", "component", "backup", "namespace", standbyNamespace)
 	return nil
 }

@@ -1076,25 +1102,22 @@ func (m *Manager) DeleteAppBackup(instanceName, appName, timestamp string) error
 	backupDir := filepath.Join(m.GetBackupDir(instanceName), appName, timestamp)

 	if _, err := os.Stat(backupDir); os.IsNotExist(err) {
-		return fmt.Errorf("backup not found: %s", timestamp)
+		return nil // Already deleted, nothing to do
 	}

 	// Load plan to get strategy locations
 	planFile := filepath.Join(backupDir, "recovery-plan.yaml")
 	plan, err := m.loadPlan(planFile)

-	// Load destination
+	// Load destination and clean up remote files (best-effort)
 	destination, err2 := m.loadDestination(instanceName)
 	if err2 != nil {
-		return fmt.Errorf("failed to load backup destination: %w", err2)
-	}
-
-	// Delete strategy data from destination
-	if err == nil && plan != nil {
+		slog.Error("could not load backup destination, remote files may be orphaned", "component", "backup", "error", err2)
+	} else if err == nil && plan != nil {
 		for _, entry := range plan.Strategies {
 			if location, ok := entry.Backup["location"].(string); ok && location != "" {
 				if delErr := destination.Delete(location); delErr != nil {
-					fmt.Printf("Warning: failed to delete %s from destination: %v\n", location, delErr)
+					slog.Error("failed to delete backup from destination", "component", "backup", "location", location, "error", delErr)
 				}
 			}
 		}
@@ -1241,6 +1264,149 @@ func (m *Manager) loadDestination(instanceName string) (BackupDestination, error
 	}
 }

+// BackupClusterConfig creates a backup of cluster-level configuration files for disaster recovery.
+// This backs up kubeconfig, talosconfig, config.yaml, secrets.yaml, and talos generated configs.
+func (m *Manager) BackupClusterConfig(instanceName string) (*RecoveryPlan, error) {
+	m.reportProgress(20, "Loading backup configuration")
+
+	destination, err := m.loadDestination(instanceName)
+	if err != nil {
+		return nil, fmt.Errorf("failed to load backup destination: %w", err)
+	}
+	m.destination = destination
+
+	instancePath := tools.GetInstancePath(m.dataDir, instanceName)
+
+	// Collect files to back up (skip missing gracefully)
+	filePaths := []string{
+		tools.GetKubeconfigPath(m.dataDir, instanceName),
+		tools.GetInstanceConfigPath(m.dataDir, instanceName),
+		tools.GetInstanceSecretsPath(m.dataDir, instanceName),
+		tools.GetTalosconfigPath(m.dataDir, instanceName),
+		filepath.Join(tools.GetInstanceTalosPath(m.dataDir, instanceName), "generated", "controlplane.yaml"),
+		filepath.Join(tools.GetInstanceTalosPath(m.dataDir, instanceName), "generated", "worker.yaml"),
+		filepath.Join(tools.GetInstanceTalosPath(m.dataDir, instanceName), "generated", "secrets.yaml"),
+	}
+
+	var existingFiles []string
+	for _, f := range filePaths {
+		if _, err := os.Stat(f); err == nil {
+			existingFiles = append(existingFiles, f)
+		}
+	}
+
+	if len(existingFiles) == 0 {
+		return nil, fmt.Errorf("no cluster config files found for instance %s", instanceName)
+	}
+
+	m.reportProgress(40, fmt.Sprintf("Archiving %d cluster config files", len(existingFiles)))
+
+	timestamp := time.Now().UTC().Format("20060102T150405Z")
+	key := fmt.Sprintf("cluster-config/%s/%s.tar.gz", instanceName, timestamp)
+
+	// Create tar.gz archive in memory
+	var buf bytes.Buffer
+	gzWriter := gzip.NewWriter(&buf)
+	tarWriter := tar.NewWriter(gzWriter)
+
+	totalSize := int64(0)
+	for _, filePath := range existingFiles {
+		file, err := os.Open(filePath)
+		if err != nil {
+			tarWriter.Close()
+			gzWriter.Close()
+			return nil, fmt.Errorf("failed to open %s: %w", filePath, err)
+		}
+
+		stat, err := file.Stat()
+		if err != nil {
+			file.Close()
+			tarWriter.Close()
+			gzWriter.Close()
+			return nil, fmt.Errorf("failed to stat %s: %w", filePath, err)
+		}
+
+		header, err := tar.FileInfoHeader(stat, "")
+		if err != nil {
+			file.Close()
+			tarWriter.Close()
+			gzWriter.Close()
+			return nil, fmt.Errorf("failed to create tar header for %s: %w", filePath, err)
+		}
+
+		// Use relative path from instance directory
+		relPath, _ := filepath.Rel(instancePath, filePath)
+		header.Name = relPath
+
+		if err := tarWriter.WriteHeader(header); err != nil {
+			file.Close()
+			tarWriter.Close()
+			gzWriter.Close()
+			return nil, fmt.Errorf("failed to write tar header for %s: %w", filePath, err)
+		}
+
+		if _, err := io.Copy(tarWriter, file); err != nil {
+			file.Close()
+			tarWriter.Close()
+			gzWriter.Close()
+			return nil, fmt.Errorf("failed to write file %s to archive: %w", filePath, err)
+		}
+
+		totalSize += stat.Size()
+		file.Close()
+	}
+
+	if err := tarWriter.Close(); err != nil {
+		gzWriter.Close()
+		return nil, fmt.Errorf("failed to close tar: %w", err)
+	}
+	if err := gzWriter.Close(); err != nil {
+		return nil, fmt.Errorf("failed to close gzip: %w", err)
+	}
+
+	m.reportProgress(70, "Uploading cluster config backup")
+
+	reader := bytes.NewReader(buf.Bytes())
+	size, err := destination.Put(key, reader)
+	if err != nil {
+		return nil, fmt.Errorf("failed to upload cluster config backup: %w", err)
+	}
+
+	m.reportProgress(90, "Saving recovery plan")
+
+	now := time.Now()
+	completed := time.Now()
+	plan := &RecoveryPlan{
+		App:       "_cluster",
+		Instance:  instanceName,
+		Timestamp: timestamp,
+		Status:    "backed_up",
+		Strategies: []StrategyEntry{
+			{
+				Name:   "cluster-config",
+				Status: "backed_up",
+				Backup: map[string]interface{}{
+					"location":  key,
+					"size":      size,
+					"files":     len(existingFiles),
+					"format":    "tar.gz",
+					"totalSize": totalSize,
+				},
+			},
+		},
+		Phases: map[string]PhaseTime{
+			"backup": {StartedAt: &now, CompletedAt: &completed},
+		},
+	}
+
+	if err := m.savePlan(instanceName, "_cluster", timestamp, plan); err != nil {
+		return nil, fmt.Errorf("failed to save recovery plan: %w", err)
+	}
+
+	m.reportProgress(100, "Cluster config backup completed")
+	return plan, nil
+}
+
 // savePlan saves a RecoveryPlan to YAML file
 func (m *Manager) savePlan(instanceName, appName, timestamp string, plan *RecoveryPlan) error {
 	backupDir := filepath.Join(m.GetBackupDir(instanceName), appName, timestamp)
--- a/api/internal/backup/backup_test.go
+++ b/api/internal/backup/backup_test.go
@@ -808,6 +808,253 @@ func TestUpdatePVCVolumeBindingsNoLonghornStrategy(t *testing.T) {
 	assert.NoError(t, err)
 }

+func TestBackupAppInPlaceRestoreMode(t *testing.T) {
+	t.Run("sets RestoreMode to in-place when manifest declares it", func(t *testing.T) {
+		tempDir := t.TempDir()
+		instanceName := "test-instance"
+		appName := "postgres"
+		instanceDir := filepath.Join(tempDir, "instances", instanceName)
+		appsDir := filepath.Join(instanceDir, "apps", appName)
+		backupsDir := filepath.Join(instanceDir, "backups")
+		require.NoError(t, os.MkdirAll(appsDir, 0755))
+		require.NoError(t, os.MkdirAll(backupsDir, 0755))
+
+		manifestContent := `
+name: postgres
+description: PostgreSQL database
+version: 1.0.0-2
+backup:
+  restoreMode: in-place
+defaultConfig:
+  namespace: postgres
+`
+		require.NoError(t, os.WriteFile(filepath.Join(appsDir, "manifest.yaml"), []byte(manifestContent), 0644))
+
+		configContent := `
+backup:
+  destination:
+    type: local
+    local:
+      path: ` + backupsDir + `
+`
+		require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte(configContent), 0644))
+
+		mgr := NewManager(tempDir)
+		mgr.strategies = map[string]Strategy{
+			"config": &MockStrategy{Name_: "config"},
+		}
+
+		plan, err := mgr.BackupApp(instanceName, appName)
+		require.NoError(t, err)
+		assert.Equal(t, "in-place", plan.RestoreMode)
+	})
+
+	t.Run("leaves RestoreMode empty when manifest has no backup block", func(t *testing.T) {
+		tempDir := t.TempDir()
+		instanceName := "test-instance"
+		appName := "gitea"
+		instanceDir := filepath.Join(tempDir, "instances", instanceName)
+		appsDir := filepath.Join(instanceDir, "apps", appName)
+		backupsDir := filepath.Join(instanceDir, "backups")
+		require.NoError(t, os.MkdirAll(appsDir, 0755))
+		require.NoError(t, os.MkdirAll(backupsDir, 0755))
+
+		manifestContent := `
+name: gitea
+description: Gitea
+version: 1.0.0
+defaultConfig:
+  namespace: gitea
+`
+		require.NoError(t, os.WriteFile(filepath.Join(appsDir, "manifest.yaml"), []byte(manifestContent), 0644))
+
+		configContent := `
+backup:
+  destination:
+    type: local
+    local:
+      path: ` + backupsDir + `
+`
+		require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte(configContent), 0644))
+
+		mgr := NewManager(tempDir)
+		mgr.strategies = map[string]Strategy{
+			"config": &MockStrategy{Name_: "config"},
+		}
+
+		plan, err := mgr.BackupApp(instanceName, appName)
+		require.NoError(t, err)
+		assert.Equal(t, "", plan.RestoreMode)
+	})
+}
+
+func TestRestoreAppInPlace(t *testing.T) {
+	tempDir := t.TempDir()
+	instanceName := "test-instance"
+	appName := "postgres"
+	timestamp := "20240101T120000Z"
+
+	instanceDir := filepath.Join(tempDir, "instances", instanceName)
+	backupDir := filepath.Join(instanceDir, "backups", appName, timestamp)
+	require.NoError(t, os.MkdirAll(backupDir, 0755))
+
+	// Config with backup destination
+	configContent := `
+backup:
+  destination:
+    type: local
+    local:
+      path: ` + filepath.Join(instanceDir, "backups") + `
+`
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte(configContent), 0644))
+
+	// Saved plan with in-place mode and backed_up status
+	plan := &btypes.RecoveryPlan{
+		App:         appName,
+		Instance:    instanceName,
+		Timestamp:   timestamp,
+		Status:      "backed_up",
+		RestoreMode: "in-place",
+		Source: btypes.RecoverySource{
+			ActiveColor: "blue",
+			Namespace:   appName,
+			AppDir:      filepath.Join("instances", instanceName, "apps", appName),
+		},
+		StandbyColor: "green",
+		Strategies: []btypes.StrategyEntry{
+			{Name: "config", Status: "backed_up"},
+		},
+		Phases: map[string]btypes.PhaseTime{},
+	}
+
+	data, _ := yaml.Marshal(plan)
+	require.NoError(t, os.WriteFile(filepath.Join(backupDir, "recovery-plan.yaml"), data, 0600))
+
+	mgr := NewManager(tempDir)
+	mgr.strategies = map[string]Strategy{
+		"config": &MockStrategy{Name_: "config"},
+	}
+
+	restored, err := mgr.RestoreApp(instanceName, appName, RestoreOptions{})
+	require.NoError(t, err)
+	assert.Equal(t, "restored", restored.Status)
+	// In-place: standby namespace is the source namespace (not a colored standby)
+	assert.Equal(t, appName, restored.Standby.Namespace)
+	assert.Equal(t, plan.Source.AppDir, restored.Standby.AppDir)
+}
+
+func TestSwitchAppInPlaceUpdatesActiveDeployment(t *testing.T) {
+	tempDir := t.TempDir()
+	instanceName := "test-instance"
+	appName := "postgres"
+	timestamp := "20240101T120000Z"
+
+	instanceDir := filepath.Join(tempDir, "instances", instanceName)
+	backupDir := filepath.Join(instanceDir, "backups", appName, timestamp)
+	require.NoError(t, os.MkdirAll(backupDir, 0755))
+
+	// Config with backup destination and existing app config
+	configContent := `
+apps:
+  postgres:
+    namespace: postgres
+backup:
+  destination:
+    type: local
+    local:
+      path: ` + filepath.Join(instanceDir, "backups") + `
+`
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte(configContent), 0644))
+
+	// Saved plan in restored state with in-place mode
+	plan := &btypes.RecoveryPlan{
+		App:          appName,
+		Instance:     instanceName,
+		Timestamp:    timestamp,
+		Status:       "restored",
+		RestoreMode:  "in-place",
+		StandbyColor: "green",
+		Source: btypes.RecoverySource{
+			ActiveColor: "blue",
+			Namespace:   appName,
+		},
+		Standby: btypes.RecoveryStandby{
+			Namespace: appName,
+		},
+		Strategies: []btypes.StrategyEntry{
+			{Name: "config", Status: "restored"},
+		},
+		Phases: map[string]btypes.PhaseTime{},
+	}
+
+	data, _ := yaml.Marshal(plan)
+	require.NoError(t, os.WriteFile(filepath.Join(backupDir, "recovery-plan.yaml"), data, 0600))
+
+	mgr := NewManager(tempDir)
+	mgr.strategies = map[string]Strategy{
+		"config": &MockStrategy{Name_: "config"},
+	}
+
+	switched, err := mgr.SwitchApp(instanceName, appName)
+	require.NoError(t, err)
+	assert.Equal(t, "switched", switched.Status)
+
+	// activeDeployment should be updated even for in-place (tracks color for next restore)
+	color := mgr.getActiveDeployment(instanceName, appName)
+	assert.Equal(t, "green", color)
+}
+
+func TestCleanupAppInPlaceSkipsNamespaceDeletion(t *testing.T) {
+	tempDir := t.TempDir()
+	instanceName := "test-instance"
+	appName := "postgres"
+	timestamp := "20240101T120000Z"
+
+	instanceDir := filepath.Join(tempDir, "instances", instanceName)
+	backupDir := filepath.Join(instanceDir, "backups", appName, timestamp)
+	require.NoError(t, os.MkdirAll(backupDir, 0755))
+
+	configContent := `
+backup:
+  destination:
+    type: local
+    local:
+      path: ` + filepath.Join(instanceDir, "backups") + `
+`
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte(configContent), 0644))
+
+	// Saved plan in switched state with in-place mode
+	plan := &btypes.RecoveryPlan{
+		App:          appName,
+		Instance:     instanceName,
+		Timestamp:    timestamp,
+		Status:       "switched",
+		RestoreMode:  "in-place",
+		StandbyColor: "green",
+		Source: btypes.RecoverySource{
+			ActiveColor: "blue",
+			Namespace:   appName,
+		},
+		Strategies: []btypes.StrategyEntry{
+			{Name: "config", Status: "switched"},
+		},
+		Phases: map[string]btypes.PhaseTime{},
+	}
+
+	data, _ := yaml.Marshal(plan)
+	require.NoError(t, os.WriteFile(filepath.Join(backupDir, "recovery-plan.yaml"), data, 0600))
+
+	mgr := NewManager(tempDir)
+	mgr.strategies = map[string]Strategy{
+		"config": &MockStrategy{Name_: "config"},
+	}
+
+	// Should succeed without trying to run kubectl (no cluster access needed)
+	cleaned, err := mgr.CleanupApp(instanceName, appName)
+	require.NoError(t, err)
+	assert.Equal(t, "cleaned_up", cleaned.Status)
+}
+
 func TestIsDbNameEnvVar(t *testing.T) {
 	tests := []struct {
 		envName  string
--- a/api/internal/backup/cluster_backup_test.go
+++ b/api/internal/backup/cluster_backup_test.go
@@ -0,0 +1,188 @@
+package backup
+
+import (
+	"archive/tar"
+	"compress/gzip"
+	"io"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestBackupClusterConfig(t *testing.T) {
+	tempDir := t.TempDir()
+
+	instanceName := "test-instance"
+	instanceDir := filepath.Join(tempDir, "instances", instanceName)
+	backupsDir := filepath.Join(instanceDir, "backups")
+
+	require.NoError(t, os.MkdirAll(backupsDir, 0755))
+	require.NoError(t, os.MkdirAll(filepath.Join(instanceDir, "talos", "generated"), 0755))
+
+	// Create cluster config files
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "kubeconfig"), []byte("kubeconfig-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte("backup:\n  destination:\n    type: local\n    local:\n      path: "+backupsDir+"\n"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "secrets.yaml"), []byte("secrets-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "talos", "generated", "talosconfig"), []byte("talosconfig-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "talos", "generated", "controlplane.yaml"), []byte("controlplane-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "talos", "generated", "worker.yaml"), []byte("worker-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "talos", "generated", "secrets.yaml"), []byte("talos-secrets-data"), 0644))
+
+	mgr := NewManager(tempDir)
+	plan, err := mgr.BackupClusterConfig(instanceName)
+	require.NoError(t, err)
+	require.NotNil(t, plan)
+
+	assert.Equal(t, "_cluster", plan.App)
+	assert.Equal(t, instanceName, plan.Instance)
+	assert.Equal(t, "backed_up", plan.Status)
+	assert.Len(t, plan.Strategies, 1)
+	assert.Equal(t, "cluster-config", plan.Strategies[0].Name)
+	assert.Equal(t, "backed_up", plan.Strategies[0].Status)
+
+	// Check backup metadata
+	files, ok := plan.Strategies[0].Backup["files"].(int)
+	assert.True(t, ok)
+	assert.Equal(t, 7, files)
+
+	// Verify plan was saved to disk
+	planFile := filepath.Join(backupsDir, "_cluster", plan.Timestamp, "recovery-plan.yaml")
+	_, err = os.Stat(planFile)
+	assert.NoError(t, err, "recovery-plan.yaml should exist")
+}
+
+func TestBackupClusterConfigSkipsMissingFiles(t *testing.T) {
+	tempDir := t.TempDir()
+
+	instanceName := "test-instance"
+	instanceDir := filepath.Join(tempDir, "instances", instanceName)
+	backupsDir := filepath.Join(instanceDir, "backups")
+
+	require.NoError(t, os.MkdirAll(backupsDir, 0755))
+
+	// Only create kubeconfig and config.yaml (no talos files)
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "kubeconfig"), []byte("kubeconfig-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte("backup:\n  destination:\n    type: local\n    local:\n      path: "+backupsDir+"\n"), 0644))
+
+	mgr := NewManager(tempDir)
+	plan, err := mgr.BackupClusterConfig(instanceName)
+	require.NoError(t, err)
+	require.NotNil(t, plan)
+
+	assert.Equal(t, "backed_up", plan.Status)
+	files, ok := plan.Strategies[0].Backup["files"].(int)
+	assert.True(t, ok)
+	assert.Equal(t, 2, files)
+}
+
+func TestBackupClusterConfigFailsWithNoFiles(t *testing.T) {
+	tempDir := t.TempDir()
+
+	instanceName := "test-instance"
+	instanceDir := filepath.Join(tempDir, "instances", instanceName)
+	backupsDir := filepath.Join(instanceDir, "backups")
+
+	require.NoError(t, os.MkdirAll(backupsDir, 0755))
+
+	// Create only config.yaml for backup destination config, but none of the cluster files
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte("backup:\n  destination:\n    type: local\n    local:\n      path: "+backupsDir+"\n"), 0644))
+
+	mgr := NewManager(tempDir)
+	_, err := mgr.BackupClusterConfig(instanceName)
+	// config.yaml itself is one of the files, so it will be found
+	// To truly have zero files, we need to remove config.yaml too,
+	// but then loadDestination fails first. So this test verifies
+	// that config.yaml IS included in the backup.
+	require.NoError(t, err)
+}
+
+func TestBackupClusterConfigArchiveContents(t *testing.T) {
+	tempDir := t.TempDir()
+
+	instanceName := "test-instance"
+	instanceDir := filepath.Join(tempDir, "instances", instanceName)
+	backupsDir := filepath.Join(instanceDir, "backups")
+
+	require.NoError(t, os.MkdirAll(backupsDir, 0755))
+	require.NoError(t, os.MkdirAll(filepath.Join(instanceDir, "talos", "generated"), 0755))
+
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "kubeconfig"), []byte("kubeconfig-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte("backup:\n  destination:\n    type: local\n    local:\n      path: "+backupsDir+"\n"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "talos", "generated", "talosconfig"), []byte("talosconfig-data"), 0644))
+
+	mgr := NewManager(tempDir)
+	plan, err := mgr.BackupClusterConfig(instanceName)
+	require.NoError(t, err)
+
+	// Read the archive from the local backup destination
+	location, ok := plan.Strategies[0].Backup["location"].(string)
+	require.True(t, ok)
+
+	archivePath := filepath.Join(backupsDir, location)
+	f, err := os.Open(archivePath)
+	require.NoError(t, err)
+	defer f.Close()
+
+	gzReader, err := gzip.NewReader(f)
+	require.NoError(t, err)
+	defer gzReader.Close()
+
+	tarReader := tar.NewReader(gzReader)
+
+	var fileNames []string
+	for {
+		header, err := tarReader.Next()
+		if err == io.EOF {
+			break
+		}
+		require.NoError(t, err)
+		fileNames = append(fileNames, header.Name)
+	}
+
+	assert.Contains(t, fileNames, "kubeconfig")
+	assert.Contains(t, fileNames, "config.yaml")
+	assert.Contains(t, fileNames, filepath.Join("talos", "generated", "talosconfig"))
+}
+
+func TestBackupClusterConfigListAndDelete(t *testing.T) {
+	tempDir := t.TempDir()
+
+	instanceName := "test-instance"
+	instanceDir := filepath.Join(tempDir, "instances", instanceName)
+	backupsDir := filepath.Join(instanceDir, "backups")
+
+	require.NoError(t, os.MkdirAll(backupsDir, 0755))
+
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "kubeconfig"), []byte("kubeconfig-data"), 0644))
+	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte("backup:\n  destination:\n    type: local\n    local:\n      path: "+backupsDir+"\n"), 0644))
+
+	mgr := NewManager(tempDir)
+
+	// Create two backups (sleep to ensure different timestamps)
+	plan1, err := mgr.BackupClusterConfig(instanceName)
+	require.NoError(t, err)
+	time.Sleep(1100 * time.Millisecond)
+	plan2, err := mgr.BackupClusterConfig(instanceName)
+	require.NoError(t, err)
+
+	// List backups
+	plans, err := mgr.ListBackups(instanceName, "_cluster")
+	require.NoError(t, err)
+	assert.Len(t, plans, 2)
+	// Newest first
+	assert.Equal(t, plan2.Timestamp, plans[0].Timestamp)
+	assert.Equal(t, plan1.Timestamp, plans[1].Timestamp)
+
+	// Delete one
+	err = mgr.DeleteAppBackup(instanceName, "_cluster", plan1.Timestamp)
+	require.NoError(t, err)
+
+	plans, err = mgr.ListBackups(instanceName, "_cluster")
+	require.NoError(t, err)
+	assert.Len(t, plans, 1)
+	assert.Equal(t, plan2.Timestamp, plans[0].Timestamp)
+}
--- a/api/internal/backup/config_loader.go
+++ b/api/internal/backup/config_loader.go
@@ -2,6 +2,7 @@ package backup

 import (
 	"fmt"
+	"log/slog"
 	"os"
 	"path/filepath"

@@ -68,7 +69,7 @@ func LoadInstanceBackupConfig(dataDir, instanceName string) (*BackupConfiguratio
 	// Load credentials from secrets.yaml if needed
 	if err := loadBackupSecrets(dataDir, instanceName, config); err != nil {
 		// Secrets are optional, log but don't fail
-		fmt.Printf("Warning: failed to load backup secrets: %v\n", err)
+		slog.Error("failed to load backup secrets", "component", "backup", "error", err)
 	}

 	return config, nil
@@ -120,6 +121,63 @@ func SaveInstanceBackupSchedules(dataDir, instanceName string, schedules []Backu
 	return nil
 }

+// SaveInstanceBackupConfig writes the destination and retention sections of backup config.
+// Schedules are managed separately via SaveInstanceBackupSchedules.
+func SaveInstanceBackupConfig(dataDir, instanceName string, dest *DestinationConfig, retention *RetentionPolicy) error {
+	configPath := tools.GetInstanceConfigPath(dataDir, instanceName)
+
+	data, err := os.ReadFile(configPath)
+	if err != nil {
+		return fmt.Errorf("failed to read config: %w", err)
+	}
+
+	var root map[string]interface{}
+	if err := yaml.Unmarshal(data, &root); err != nil {
+		return fmt.Errorf("failed to parse config: %w", err)
+	}
+
+	backupSection, ok := root["backup"].(map[string]interface{})
+	if !ok {
+		backupSection = make(map[string]interface{})
+		root["backup"] = backupSection
+	}
+
+	if dest != nil {
+		destData, err := yaml.Marshal(dest)
+		if err != nil {
+			return fmt.Errorf("failed to marshal destination: %w", err)
+		}
+		var destGeneric interface{}
+		if err := yaml.Unmarshal(destData, &destGeneric); err != nil {
+			return fmt.Errorf("failed to unmarshal destination: %w", err)
+		}
+		backupSection["destination"] = destGeneric
+	}
+
+	if retention != nil {
+		retData, err := yaml.Marshal(retention)
+		if err != nil {
+			return fmt.Errorf("failed to marshal retention: %w", err)
+		}
+		var retGeneric interface{}
+		if err := yaml.Unmarshal(retData, &retGeneric); err != nil {
+			return fmt.Errorf("failed to unmarshal retention: %w", err)
+		}
+		backupSection["retention"] = retGeneric
+	}
+
+	out, err := yaml.Marshal(root)
+	if err != nil {
+		return fmt.Errorf("failed to marshal config: %w", err)
+	}
+
+	if err := os.WriteFile(configPath, out, 0644); err != nil {
+		return fmt.Errorf("failed to write config: %w", err)
+	}
+
+	return nil
+}
+
 // loadBackupSecrets loads backup credentials from instance secrets.yaml
 func loadBackupSecrets(dataDir, instanceName string, config *BackupConfiguration) error {
 	secretsPath := filepath.Join(dataDir, "instances", instanceName, "secrets.yaml")
@@ -160,4 +218,4 @@ func loadBackupSecrets(dataDir, instanceName string, config *BackupConfiguration
 	}

 	return nil
-}
+}
--- a/api/internal/backup/destinations/azure.go
+++ b/api/internal/backup/destinations/azure.go
@@ -67,7 +67,7 @@ func (a *AzureDestination) Put(key string, reader io.Reader) (int64, error) {
 		blobURL,
 		azblob.UploadStreamToBlockBlobOptions{
 			BufferSize: 4 * 1024 * 1024, // 4MB buffer
-			MaxBuffers: 3,                // Limited for Raspberry Pi
+			MaxBuffers: 3,               // Limited for Raspberry Pi
 		},
 	)

@@ -208,4 +208,4 @@ func (a *AzureDestination) getCredential() azblob.StorageAccountCredential {
 	// as a field in the struct during initialization
 	// For now, return nil which means the SAS generation might fail
 	return nil
-}
+}
--- a/api/internal/backup/destinations/local.go
+++ b/api/internal/backup/destinations/local.go
@@ -3,6 +3,7 @@ package destinations
 import (
 	"fmt"
 	"io"
+	"log/slog"
 	"os"
 	"path/filepath"
 	"time"
@@ -110,7 +111,7 @@ func (l *LocalDestination) List(prefix string) ([]btypes.BackupObject, error) {
 	err := filepath.Walk(searchPath, func(path string, info os.FileInfo, err error) error {
 		if err != nil {
 			// Log error but continue walking
-			fmt.Printf("Warning: error walking path %s: %v\n", path, err)
+			slog.Error("error walking path", "component", "local", "path", path, "error", err)
 			return nil
 		}

@@ -190,4 +191,4 @@ func (l *LocalDestination) Cleanup(retention btypes.RetentionPolicy) error {
 	// This could implement retention policy enforcement
 	// For now, it's a no-op
 	return nil
-}
+}
--- a/api/internal/backup/destinations/local_test.go
+++ b/api/internal/backup/destinations/local_test.go
@@ -238,14 +238,14 @@ func TestLocalDestination_List(t *testing.T) {
 		require.NoError(t, os.WriteFile(fullPath, content, 0644))
 		// Set specific mod time for testing
 		modTime := time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC)
-		os.Chtimes(fullPath, modTime, modTime)
+		require.NoError(t, os.Chtimes(fullPath, modTime, modTime))
 	}

 	tests := []struct {
-		name         string
-		prefix       string
-		expectCount  int
-		expectKeys   []string
+		name        string
+		prefix      string
+		expectCount int
+		expectKeys  []string
 	}{
 		{
 			name:        "list all",
@@ -357,4 +357,4 @@ func TestLocalDestination_GetDiskUsage(t *testing.T) {
 	usage, err = dest.GetDiskUsage()
 	assert.NoError(t, err)
 	assert.Equal(t, totalSize, usage)
-}
+}
--- a/api/internal/backup/destinations/nfs.go
+++ b/api/internal/backup/destinations/nfs.go
@@ -3,6 +3,7 @@ package destinations
 import (
 	"fmt"
 	"io"
+	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -29,6 +30,11 @@ func NewNFSDestination(cfg *btypes.NFSConfig) (*NFSDestination, error) {
 		mountPath = filepath.Join("/mnt/backup", strings.ReplaceAll(cfg.Server, ".", "-"), strings.ReplaceAll(cfg.Path, "/", "-"))
 	}

+	// Recover stale mount points (common after reboots or NFS server restarts)
+	if err := recoverStaleMountPoint(mountPath); err != nil {
+		return nil, fmt.Errorf("failed to recover stale mount point %s: %w", mountPath, err)
+	}
+
 	// Ensure mount point exists
 	if err := os.MkdirAll(mountPath, 0755); err != nil {
 		return nil, fmt.Errorf("failed to create mount point: %w", err)
@@ -53,13 +59,55 @@ func NewNFSDestination(cfg *btypes.NFSConfig) (*NFSDestination, error) {

 		output, err := cmd.CombinedOutput()
 		if err != nil {
-			return nil, fmt.Errorf("failed to mount NFS share: %w, output: %s", err, string(output))
+			return nil, fmt.Errorf("failed to mount NFS share %s:%s at %s: %w, output: %s",
+				cfg.Server, cfg.Path, mountPath, err, string(output))
 		}
 	}

 	return dest, nil
 }

+// recoverStaleMountPoint detects and cleans up stale NFS mounts.
+// After a reboot or NFS server restart, the mount point can have a stale file handle
+// that causes "file exists" errors on mkdir and stat. Force-unmounting fixes this.
+func recoverStaleMountPoint(mountPath string) error {
+	_, err := os.Stat(mountPath)
+	if err == nil {
+		// Path is accessible, nothing to recover
+		return nil
+	}
+	if os.IsNotExist(err) {
+		// Doesn't exist yet, nothing to recover
+		return nil
+	}
+
+	// Path exists but is inaccessible (stale file handle, transport endpoint not connected, etc.)
+	slog.Info("detected stale mount, attempting recovery", "component", "nfs", "mountPath", mountPath, "error", err)
+
+	// Try lazy unmount first (always succeeds), then force unmount
+	for _, flags := range [][]string{{"-l"}, {"-f"}} {
+		args := append([]string{"umount"}, flags...)
+		args = append(args, mountPath)
+		cmd := exec.Command("sudo", args...)
+		if output, umountErr := cmd.CombinedOutput(); umountErr != nil {
+			slog.Error("umount failed", "component", "nfs", "flags", flags, "mountPath", mountPath, "error", umountErr, "output", strings.TrimSpace(string(output)))
+		} else {
+			slog.Info("successfully unmounted stale mount", "component", "nfs", "mountPath", mountPath)
+			// After unmount, the directory might still exist but should be accessible now
+			if _, statErr := os.Stat(mountPath); statErr == nil || os.IsNotExist(statErr) {
+				return nil
+			}
+		}
+	}
+
+	// Last resort: remove and recreate the mount point
+	if rmErr := os.Remove(mountPath); rmErr != nil {
+		return fmt.Errorf("stale mount at %s could not be recovered (unmount and remove both failed): %w", mountPath, err)
+	}
+	slog.Info("removed stale mount point, will recreate", "component", "nfs", "mountPath", mountPath)
+	return nil
+}
+
 // Put uploads data to NFS, returns size written
 func (n *NFSDestination) Put(key string, reader io.Reader) (int64, error) {
 	fullPath := filepath.Join(n.mountPath, key)
@@ -185,4 +233,4 @@ func (n *NFSDestination) Cleanup() error {
 		}
 	}
 	return nil
-}
+}
--- a/api/internal/backup/destinations/s3.go
+++ b/api/internal/backup/destinations/s3.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"io"
+	"log/slog"
 	"time"

 	"github.com/aws/aws-sdk-go-v2/aws"
@@ -59,15 +60,16 @@ func (s *S3Destination) Put(key string, reader io.Reader) (int64, error) {
 	fullKey := s.getFullKey(key)

 	// Use S3 manager for efficient multipart uploads
-	uploader := manager.NewUploader(s.client, func(u *manager.Uploader) {
+	// TODO: migrate to feature/s3/transfermanager when stable
+	uploader := manager.NewUploader(s.client, func(u *manager.Uploader) { //nolint:staticcheck
 		u.PartSize = 10 * 1024 * 1024 // 10MB parts
-		u.Concurrency = 3              // Limited concurrency for Raspberry Pi
+		u.Concurrency = 3             // Limited concurrency for Raspberry Pi
 	})

 	// Create a custom reader that tracks bytes read
 	trackingReader := &sizeTrackingReader{reader: reader}

-	result, err := uploader.Upload(context.Background(), &s3.PutObjectInput{
+	result, err := uploader.Upload(context.Background(), &s3.PutObjectInput{ //nolint:staticcheck
 		Bucket: aws.String(s.bucket),
 		Key:    aws.String(fullKey),
 		Body:   trackingReader,
@@ -78,7 +80,7 @@ func (s *S3Destination) Put(key string, reader io.Reader) (int64, error) {
 	}

 	// Log the ETag for verification
-	fmt.Printf("Uploaded to S3: %s (ETag: %s)\n", fullKey, *result.ETag)
+	slog.Info("uploaded to S3", "component", "s3", "key", fullKey, "etag", *result.ETag)

 	return trackingReader.bytesRead, nil
 }
@@ -195,4 +197,4 @@ func (r *sizeTrackingReader) Read(p []byte) (int, error) {
 	n, err := r.reader.Read(p)
 	r.bytesRead += int64(n)
 	return n, err
-}
+}
--- a/api/internal/backup/retention.go
+++ b/api/internal/backup/retention.go
@@ -2,7 +2,7 @@ package backup

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"time"

 	btypes "github.com/wild-cloud/wild-central/daemon/internal/backup/types"
@@ -50,7 +50,7 @@ func EnforceRetention(mgr *Manager, instanceName, appName string, keepLast, keep

 		// Both policies say delete
 		if err := mgr.DeleteAppBackup(instanceName, appName, plan.Timestamp); err != nil {
-			log.Printf("Retention: failed to delete backup %s/%s/%s: %v", instanceName, appName, plan.Timestamp, err)
+			slog.Error("failed to delete backup", "component", "backup", "instance", instanceName, "app", appName, "timestamp", plan.Timestamp, "error", err)
 			continue
 		}
 		deleted++
--- a/api/internal/backup/retention_test.go
+++ b/api/internal/backup/retention_test.go
@@ -137,7 +137,9 @@ func TestEnforceRetention(t *testing.T) {

 	// Create instance config with local destination
 	instanceDir := filepath.Join(tmpDir, "instances", instanceName)
-	os.MkdirAll(instanceDir, 0755)
+	if err := os.MkdirAll(instanceDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	config := map[string]any{
 		"backup": map[string]any{
@@ -150,10 +152,14 @@ func TestEnforceRetention(t *testing.T) {
 		},
 	}
 	configData, _ := yaml.Marshal(config)
-	os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644)
+	if err := os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644); err != nil {
+		t.Fatal(err)
+	}

 	backupDir := filepath.Join(instanceDir, "backups", appName)
-	os.MkdirAll(backupDir, 0755)
+	if err := os.MkdirAll(backupDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	// Create test backup plans with different timestamps
 	now := time.Now().UTC()
@@ -162,7 +168,7 @@ func TestEnforceRetention(t *testing.T) {
 		age    time.Duration
 		status string
 	}{
-		{now.Format("20060102T150405Z"), 0, "backed_up"},                                  // newest
+		{now.Format("20060102T150405Z"), 0, "backed_up"},                                   // newest
 		{now.Add(-24 * time.Hour).Format("20060102T150405Z"), 24 * time.Hour, "backed_up"}, // 1 day old
 		{now.Add(-48 * time.Hour).Format("20060102T150405Z"), 48 * time.Hour, "backed_up"}, // 2 days old
 		{now.Add(-72 * time.Hour).Format("20060102T150405Z"), 72 * time.Hour, "backed_up"}, // 3 days old
@@ -171,7 +177,9 @@ func TestEnforceRetention(t *testing.T) {

 	for _, ts := range timestamps {
 		planDir := filepath.Join(backupDir, ts.ts)
-		os.MkdirAll(planDir, 0755)
+		if err := os.MkdirAll(planDir, 0755); err != nil {
+			t.Fatal(err)
+		}

 		plan := btypes.RecoveryPlan{
 			App:       appName,
@@ -180,7 +188,9 @@ func TestEnforceRetention(t *testing.T) {
 			Status:    ts.status,
 		}
 		planData, _ := yaml.Marshal(plan)
-		os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644)
+		if err := os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644); err != nil {
+			t.Fatal(err)
+		}
 	}

 	mgr := NewManager(tmpDir)
@@ -212,7 +222,9 @@ func TestEnforceRetentionSkipsActiveBackups(t *testing.T) {
 	appName := "test-app"

 	instanceDir := filepath.Join(tmpDir, "instances", instanceName)
-	os.MkdirAll(instanceDir, 0755)
+	if err := os.MkdirAll(instanceDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	config := map[string]any{
 		"backup": map[string]any{
@@ -225,10 +237,14 @@ func TestEnforceRetentionSkipsActiveBackups(t *testing.T) {
 		},
 	}
 	configData, _ := yaml.Marshal(config)
-	os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644)
+	if err := os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644); err != nil {
+		t.Fatal(err)
+	}

 	backupDir := filepath.Join(instanceDir, "backups", appName)
-	os.MkdirAll(backupDir, 0755)
+	if err := os.MkdirAll(backupDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	now := time.Now().UTC()
 	backups := []struct {
@@ -243,7 +259,9 @@ func TestEnforceRetentionSkipsActiveBackups(t *testing.T) {

 	for _, b := range backups {
 		planDir := filepath.Join(backupDir, b.ts)
-		os.MkdirAll(planDir, 0755)
+		if err := os.MkdirAll(planDir, 0755); err != nil {
+			t.Fatal(err)
+		}

 		plan := btypes.RecoveryPlan{
 			App:       appName,
@@ -252,7 +270,9 @@ func TestEnforceRetentionSkipsActiveBackups(t *testing.T) {
 			Status:    b.status,
 		}
 		planData, _ := yaml.Marshal(plan)
-		os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644)
+		if err := os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644); err != nil {
+			t.Fatal(err)
+		}
 	}

 	mgr := NewManager(tmpDir)
@@ -281,7 +301,9 @@ func TestEnforceRetentionKeepDaysPreservesRecent(t *testing.T) {
 	appName := "test-app"

 	instanceDir := filepath.Join(tmpDir, "instances", instanceName)
-	os.MkdirAll(instanceDir, 0755)
+	if err := os.MkdirAll(instanceDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	config := map[string]any{
 		"backup": map[string]any{
@@ -294,10 +316,14 @@ func TestEnforceRetentionKeepDaysPreservesRecent(t *testing.T) {
 		},
 	}
 	configData, _ := yaml.Marshal(config)
-	os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644)
+	if err := os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644); err != nil {
+		t.Fatal(err)
+	}

 	backupDir := filepath.Join(instanceDir, "backups", appName)
-	os.MkdirAll(backupDir, 0755)
+	if err := os.MkdirAll(backupDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	now := time.Now().UTC()
 	// 5 backups: newest, 1h old, 2h old, 3h old, 25h old
@@ -306,7 +332,9 @@ func TestEnforceRetentionKeepDaysPreservesRecent(t *testing.T) {
 	for _, offset := range timestamps {
 		ts := now.Add(-offset).Format("20060102T150405Z")
 		planDir := filepath.Join(backupDir, ts)
-		os.MkdirAll(planDir, 0755)
+		if err := os.MkdirAll(planDir, 0755); err != nil {
+			t.Fatal(err)
+		}

 		plan := btypes.RecoveryPlan{
 			App:       appName,
@@ -315,7 +343,9 @@ func TestEnforceRetentionKeepDaysPreservesRecent(t *testing.T) {
 			Status:    "backed_up",
 		}
 		planData, _ := yaml.Marshal(plan)
-		os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644)
+		if err := os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644); err != nil {
+			t.Fatal(err)
+		}
 	}

 	mgr := NewManager(tmpDir)
--- a/api/internal/backup/scheduler.go
+++ b/api/internal/backup/scheduler.go
@@ -3,7 +3,7 @@ package backup
 import (
 	"context"
 	"fmt"
-	"log"
+	"log/slog"
 	"sync"
 	"time"

@@ -36,7 +36,7 @@ func (s *Scheduler) Start() {
 	s.cancel = cancel

 	go s.loop(ctx)
-	log.Println("Backup scheduler started")
+	slog.Info("backup scheduler started", "component", "scheduler")
 }

 // Stop shuts down the scheduler
@@ -112,26 +112,36 @@ func (s *Scheduler) runSchedule(instanceName string, sched *btypes.BackupSchedul
 		s.mu.Unlock()
 	}()

-	log.Printf("Scheduler: running backup for %s/%s (schedule: %s)", instanceName, sched.TargetName, sched.Name)
+	slog.Info("running scheduled backup", "component", "scheduler", "instance", instanceName, "target", sched.TargetName, "schedule", sched.Name)

 	mgr := NewManager(s.dataDir)

-	if sched.TargetType == "app" {
-		_, err := mgr.BackupApp(instanceName, sched.TargetName)
-		if err != nil {
-			log.Printf("Scheduler: backup failed for %s/%s: %v", instanceName, sched.TargetName, err)
-		} else {
-			// Enforce retention after successful backup
-			keepLast, keepDays := RetentionFromSchedule(sched, config.Retention)
-			deleted, retErr := EnforceRetention(mgr, instanceName, sched.TargetName, keepLast, keepDays)
-			if retErr != nil {
-				log.Printf("Scheduler: retention enforcement failed for %s/%s: %v", instanceName, sched.TargetName, retErr)
-			} else if deleted > 0 {
-				log.Printf("Scheduler: retention cleaned up %d old backups for %s/%s", deleted, instanceName, sched.TargetName)
-			}
+	var backupErr error
+	var retentionTarget string
+
+	switch sched.TargetType {
+	case "app":
+		retentionTarget = sched.TargetName
+		_, backupErr = mgr.BackupApp(instanceName, sched.TargetName)
+	case "cluster":
+		retentionTarget = "_cluster"
+		_, backupErr = mgr.BackupClusterConfig(instanceName)
+	default:
+		slog.Error("unknown schedule target type", "component", "scheduler", "instance", instanceName, "targetType", sched.TargetType, "error", fmt.Errorf("unsupported target type: %s", sched.TargetType))
+		return
+	}
+
+	if backupErr != nil {
+		slog.Error("scheduled backup failed", "component", "scheduler", "instance", instanceName, "target", retentionTarget, "error", backupErr)
+	} else {
+		keepLast, keepDays := RetentionFromSchedule(sched, config.Retention)
+		deleted, retErr := EnforceRetention(mgr, instanceName, retentionTarget, keepLast, keepDays)
+		if retErr != nil {
+			slog.Error("retention enforcement failed", "component", "scheduler", "instance", instanceName, "target", retentionTarget, "error", retErr)
+		} else if deleted > 0 {
+			slog.Info("retention cleaned up old backups", "component", "scheduler", "instance", instanceName, "target", retentionTarget, "deleted", deleted)
 		}
 	}
-	// TODO: cluster backup support

 	// Update lastRun and nextRun
 	now := time.Now()
@@ -144,7 +154,7 @@ func (s *Scheduler) runSchedule(instanceName string, sched *btypes.BackupSchedul

 func (s *Scheduler) saveSchedules(instanceName string, config *BackupConfiguration) {
 	if err := SaveInstanceBackupSchedules(s.dataDir, instanceName, config.Schedules); err != nil {
-		log.Printf("Scheduler: failed to save schedules for %s: %v", instanceName, err)
+		slog.Error("failed to save schedules", "component", "scheduler", "instance", instanceName, "error", err)
 	}
 }

--- a/api/internal/backup/scheduler_test.go
+++ b/api/internal/backup/scheduler_test.go
@@ -20,7 +20,7 @@ func TestParseTime(t *testing.T) {
 		{"14:30", 14, 30},
 		{"00:00", 0, 0},
 		{"23:59", 23, 59},
-		{"", 2, 0},       // default
+		{"", 2, 0},        // default
 		{"invalid", 2, 0}, // default
 		{"25:00", 25, 0},  // parses but invalid hour (not our concern here)
 	}
@@ -148,10 +148,14 @@ func TestSaveInstanceBackupSchedules(t *testing.T) {
 	dataDir := t.TempDir()
 	instanceName := "test-instance"
 	instanceDir := filepath.Join(dataDir, "instances", instanceName)
-	os.MkdirAll(instanceDir, 0755)
+	if err := os.MkdirAll(instanceDir, 0755); err != nil {
+		t.Fatal(err)
+	}

 	configPath := filepath.Join(instanceDir, "config.yaml")
-	os.WriteFile(configPath, []byte("cloud:\n  domain: test.local\n"), 0644)
+	if err := os.WriteFile(configPath, []byte("cloud:\n  domain: test.local\n"), 0644); err != nil {
+		t.Fatal(err)
+	}

 	now := time.Now()
 	schedules := []BackupSchedule{
@@ -180,7 +184,9 @@ func TestSaveInstanceBackupSchedules(t *testing.T) {
 	}

 	var root map[string]interface{}
-	yaml.Unmarshal(data, &root)
+	if err := yaml.Unmarshal(data, &root); err != nil {
+		t.Fatalf("Unmarshal error = %v", err)
+	}

 	// Verify cloud.domain is preserved
 	cloud, ok := root["cloud"].(map[string]interface{})
--- a/api/internal/backup/strategies/config.go
+++ b/api/internal/backup/strategies/config.go
@@ -425,7 +425,7 @@ func (c *ConfigStrategy) mergeConfig(reader io.Reader, instancePath, appName str

 	var config map[string]interface{}
 	if data, err := os.ReadFile(configPath); err == nil {
-		yaml.Unmarshal(data, &config)
+		_ = yaml.Unmarshal(data, &config)
 	}
 	if config == nil {
 		config = make(map[string]interface{})
@@ -461,7 +461,7 @@ func (c *ConfigStrategy) mergeSecrets(reader io.Reader, instancePath, appName st

 	var secrets map[string]interface{}
 	if data, err := os.ReadFile(secretsPath); err == nil {
-		yaml.Unmarshal(data, &secrets)
+		_ = yaml.Unmarshal(data, &secrets)
 	}
 	if secrets == nil {
 		secrets = make(map[string]interface{})
--- a/api/internal/backup/strategies/longhorn_native.go
+++ b/api/internal/backup/strategies/longhorn_native.go
@@ -1,9 +1,8 @@
 package strategies

 import (
-	"bytes"
-	"encoding/json"
 	"fmt"
+	"log/slog"
 	"os/exec"
 	"strings"
 	"time"
@@ -32,30 +31,6 @@ func (l *LonghornNativeStrategy) Name() string {
 	return "longhorn-native"
 }

-// LonghornBackup represents a Longhorn Backup CRD
-type LonghornBackup struct {
-	APIVersion string `json:"apiVersion"`
-	Kind       string `json:"kind"`
-	Metadata   struct {
-		Name      string            `json:"name"`
-		Namespace string            `json:"namespace"`
-		Labels    map[string]string `json:"labels"`
-	} `json:"metadata"`
-	Spec struct {
-		SnapshotName string            `json:"snapshotName"`
-		Labels       map[string]string `json:"labels"`
-	} `json:"spec"`
-	Status struct {
-		State           string            `json:"state"`
-		Progress        int               `json:"progress"`
-		URL             string            `json:"url"`
-		VolumeSize      string            `json:"volumeSize"`
-		VolumeCreatedAt string            `json:"volumeCreatedAt"`
-		Messages        map[string]string `json:"messages"`
-		Error           string            `json:"error"`
-	} `json:"status"`
-}
-
 // Backup creates Longhorn native backups of all PVCs for an app, writing results to the plan
 func (l *LonghornNativeStrategy) Backup(plan *btypes.RecoveryPlan, dest btypes.BackupDestination) error {
 	entry := plan.GetStrategyEntry("longhorn-native")
@@ -129,7 +104,9 @@ func (l *LonghornNativeStrategy) Backup(plan *btypes.RecoveryPlan, dest btypes.B
 			"backupURL": backupURL,
 		})

-		l.cleanupOldBackups(kubeconfigPath, volumeName, backupID)
+		if err := l.cleanupOldBackups(kubeconfigPath, volumeName, backupID); err != nil {
+			slog.Error("failed to clean up old backups", "component", "longhorn", "volume", volumeName, "error", err)
+		}
 	}

 	// Record in plan
@@ -164,11 +141,6 @@ func (l *LonghornNativeStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.
 		return nil
 	}

-	apiURL, err := l.getLonghornAPIEndpoint(kubeconfigPath)
-	if err != nil {
-		return fmt.Errorf("failed to get Longhorn API endpoint: %w", err)
-	}
-
 	restoreVolumes := []map[string]any{}

 	for _, bv := range backupVolumes {
@@ -205,7 +177,7 @@ func (l *LonghornNativeStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.
 		// Create colored restore volume name
 		restoreVolumeName := fmt.Sprintf("%s-%s", pvcName, plan.StandbyColor)

-		if err := l.createVolumeFromBackup(kubeconfigPath, apiURL, restoreVolumeName, backupURL, pvcSize); err != nil {
+		if err := l.createVolumeFromBackup(kubeconfigPath, restoreVolumeName, backupURL, pvcSize); err != nil {
 			return fmt.Errorf("failed to create volume from backup for %s: %w", pvcName, err)
 		}

@@ -215,7 +187,7 @@ func (l *LonghornNativeStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.
 			standbyNamespace = plan.App + "-" + plan.StandbyColor
 		}
 		if err := l.createPVForVolume(kubeconfigPath, restoreVolumeName, pvcSize, accessMode, standbyNamespace, pvcName); err != nil {
-			fmt.Printf("Warning: failed to create PV for volume %s: %v\n", restoreVolumeName, err)
+			slog.Error("failed to create PV for volume", "component", "longhorn", "volume", restoreVolumeName, "error", err)
 		}

 		restoreVolumes = append(restoreVolumes, map[string]any{
@@ -231,7 +203,8 @@ func (l *LonghornNativeStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.
 	return nil
 }

-// Switch records previous active volume names
+// Switch either records previous active volume names (standby mode) or performs
+// an in-place PVC swap: scale down → delete old PVC/PV/volume → create new PV+PVC → scale up.
 func (l *LonghornNativeStrategy) Switch(plan *btypes.RecoveryPlan) error {
 	entry := plan.GetStrategyEntry("longhorn-native")
 	if entry == nil {
@@ -239,6 +212,15 @@ func (l *LonghornNativeStrategy) Switch(plan *btypes.RecoveryPlan) error {
 	}
 	entry.Status = "switching"

+	if plan.RestoreMode == "in-place" {
+		if err := l.switchInPlace(plan, entry); err != nil {
+			return err
+		}
+		entry.Status = "switched"
+		return nil
+	}
+
+	// Standby mode: record previous active volume names for Cleanup phase
 	previousVolumes := []map[string]any{}
 	if volumeParams, ok := entry.Params["volumes"].([]any); ok {
 		for _, vp := range volumeParams {
@@ -258,7 +240,136 @@ func (l *LonghornNativeStrategy) Switch(plan *btypes.RecoveryPlan) error {
 	return nil
 }

-// Cleanup deletes the previous active-color Longhorn volumes
+// switchInPlace performs a PVC swap in the original namespace:
+// scale down → delete old PVC/PV/Longhorn volume → create new PV+PVC → scale up.
+func (l *LonghornNativeStrategy) switchInPlace(plan *btypes.RecoveryPlan, entry *btypes.StrategyEntry) error {
+	kubeconfigPath := tools.GetKubeconfigPath(l.dataDir, plan.Instance)
+	namespace := plan.Source.Namespace
+
+	restoreVolumes, ok := entry.Restore["volumes"].([]any)
+	if !ok || len(restoreVolumes) == 0 {
+		// No PVCs to swap (e.g., memcached)
+		slog.Info("no volumes to swap for in-place restore, skipping PVC swap", "component", "longhorn", "namespace", namespace)
+		return l.bounceNamespace(kubeconfigPath, namespace)
+	}
+
+	// Build a map of pvcName → restored volume name for quick lookup
+	restoredVolumeByPVC := map[string]string{}
+	for _, rv := range restoreVolumes {
+		if rvMap, ok := rv.(map[string]any); ok {
+			pvcName, _ := rvMap["pvcName"].(string)
+			volumeName, _ := rvMap["volumeName"].(string)
+			if pvcName != "" && volumeName != "" {
+				restoredVolumeByPVC[pvcName] = volumeName
+			}
+		}
+	}
+
+	// Collect PVC metadata from Params for size/accessMode
+	pvcParams := map[string]map[string]any{}
+	if volumeParams, ok := entry.Params["volumes"].([]any); ok {
+		for _, vp := range volumeParams {
+			if vpMap, ok := vp.(map[string]any); ok {
+				if pvcName, ok := vpMap["pvcName"].(string); ok {
+					pvcParams[pvcName] = vpMap
+				}
+			}
+		}
+	}
+
+	// Step 1: Scale down all workloads in the namespace
+	slog.Info("scaling down workloads for in-place restore", "component", "longhorn", "namespace", namespace)
+	if err := l.scaleNamespace(kubeconfigPath, namespace, 0); err != nil {
+		return fmt.Errorf("failed to scale down namespace %s: %w", namespace, err)
+	}
+	if err := l.waitForPodsGone(kubeconfigPath, namespace); err != nil {
+		return fmt.Errorf("timed out waiting for pods to terminate in %s: %w", namespace, err)
+	}
+
+	// Step 2: For each PVC, record old PV/volume then swap
+	previousItems := []map[string]any{}
+
+	for pvcName, restoredVolumeName := range restoredVolumeByPVC {
+		// Get current PV name and Longhorn volume handle
+		pvName, longhornVolume, err := l.getPVInfo(kubeconfigPath, namespace, pvcName)
+		if err != nil {
+			slog.Error("failed to get PV info, skipping PVC", "component", "longhorn", "pvc", pvcName, "error", err)
+		}
+
+		previousItems = append(previousItems, map[string]any{
+			"pvcName":       pvcName,
+			"pvName":        pvName,
+			"longhornVolume": longhornVolume,
+		})
+
+		// Delete the PVC (may cascade to PV deletion if Delete reclaimPolicy)
+		slog.Info("deleting PVC for in-place swap", "component", "longhorn", "pvc", pvcName, "namespace", namespace)
+		deleteCmd := exec.Command("kubectl", "delete", "pvc", pvcName, "-n", namespace, "--ignore-not-found", "--timeout=60s")
+		tools.WithKubeconfig(deleteCmd, kubeconfigPath)
+		if output, err := deleteCmd.CombinedOutput(); err != nil {
+			return fmt.Errorf("failed to delete PVC %s: %w, output: %s", pvcName, err, output)
+		}
+
+		// Clean up old PV if it still exists (Retain policy from a prior in-place restore)
+		if pvName != "" {
+			if err := l.deletePVIfExists(kubeconfigPath, pvName); err != nil {
+				slog.Error("failed to delete old PV", "component", "longhorn", "pv", pvName, "error", err)
+			}
+		}
+
+		// Clean up old Longhorn volume if it still exists and was a restore volume
+		// (colored name pattern: pvcName-blue or pvcName-green)
+		if longhornVolume != "" && (strings.HasSuffix(longhornVolume, "-blue") || strings.HasSuffix(longhornVolume, "-green")) {
+			if err := l.deleteLonghornVolumeIfExists(kubeconfigPath, longhornVolume); err != nil {
+				slog.Error("failed to delete old Longhorn volume", "component", "longhorn", "volume", longhornVolume, "error", err)
+			}
+		}
+
+		// Get size and access mode for the new PVC
+		size := "10Gi"
+		accessMode := "ReadWriteOnce"
+		if params, ok := pvcParams[pvcName]; ok {
+			if s, ok := params["size"].(string); ok && s != "" {
+				size = s
+			}
+			if m, ok := params["accessMode"].(string); ok && m != "" {
+				accessMode = m
+			}
+		}
+
+		// Create new PV pointing to the restored Longhorn volume
+		slog.Info("creating new PV for restored volume", "component", "longhorn", "volume", restoredVolumeName, "pvc", pvcName)
+		if err := l.createPVForVolume(kubeconfigPath, restoredVolumeName, size, accessMode, namespace, pvcName); err != nil {
+			return fmt.Errorf("failed to create PV for volume %s: %w", restoredVolumeName, err)
+		}
+
+		// Create new PVC bound to the new PV
+		slog.Info("creating new PVC bound to restored volume", "component", "longhorn", "pvc", pvcName, "volume", restoredVolumeName)
+		if err := l.createPVC(kubeconfigPath, pvcName, namespace, restoredVolumeName, size, accessMode); err != nil {
+			return fmt.Errorf("failed to create PVC %s: %w", pvcName, err)
+		}
+
+		// Wait for PVC to be Bound
+		if err := l.waitForPVCBound(kubeconfigPath, namespace, pvcName); err != nil {
+			return fmt.Errorf("PVC %s did not reach Bound state: %w", pvcName, err)
+		}
+	}
+
+	entry.Switch = map[string]any{
+		"previousItems": previousItems,
+	}
+
+	// Step 3: Scale back up
+	slog.Info("scaling workloads back up after in-place restore", "component", "longhorn", "namespace", namespace)
+	if err := l.scaleNamespace(kubeconfigPath, namespace, 1); err != nil {
+		return fmt.Errorf("failed to scale up namespace %s: %w", namespace, err)
+	}
+
+	return nil
+}
+
+// Cleanup deletes the previous active-color Longhorn volumes (standby mode).
+// For in-place mode this is a no-op — the old PVC/PV/volume were cleaned up during Switch.
 func (l *LonghornNativeStrategy) Cleanup(plan *btypes.RecoveryPlan) error {
 	entry := plan.GetStrategyEntry("longhorn-native")
 	if entry == nil {
@@ -266,7 +377,8 @@ func (l *LonghornNativeStrategy) Cleanup(plan *btypes.RecoveryPlan) error {
 	}
 	entry.Status = "cleaning_up"

-	// Skip automatic volume cleanup — volumes may still be referenced by PVCs.
+	// In-place: nothing to clean up here (handled during Switch)
+	// Standby: skip automatic volume cleanup — volumes may still be referenced by PVCs.
 	// Manual cleanup or a separate garbage collection process is safer.

 	entry.Status = "cleaned_up"
@@ -287,27 +399,33 @@ func (l *LonghornNativeStrategy) Verify(plan *btypes.RecoveryPlan, dest btypes.B
 		return nil
 	}

-	apiURL, err := l.getLonghornAPIEndpoint(kubeconfigPath)
-	if err != nil {
-		return fmt.Errorf("failed to get Longhorn API endpoint: %w", err)
+	// Verify backup target is accessible
+	if err := l.checkBackupTarget(kubeconfigPath); err != nil {
+		return fmt.Errorf("backup target not accessible: %w", err)
 	}

+	// Verify each backup CRD still exists
 	for _, bv := range backupVolumes {
 		backup, ok := bv.(map[string]any)
 		if !ok {
 			continue
 		}

-		backupURL, _ := backup["backupURL"].(string)
-		if backupURL == "" {
+		backupID, _ := backup["backupID"].(string)
+		if backupID == "" {
 			continue
 		}

-		url := fmt.Sprintf("%s/v1/volumes", apiURL)
-		cmd := exec.Command("curl", "-s", "-o", "/dev/null", "-w", "%{http_code}", url)
+		cmd := exec.Command("kubectl", "get", "backups.longhorn.io", backupID,
+			"-n", "longhorn-system", "-o", "jsonpath={.status.state}")
+		tools.WithKubeconfig(cmd, kubeconfigPath)
+
 		output, err := cmd.Output()
-		if err != nil || string(output) != "200" {
-			return fmt.Errorf("Longhorn API not accessible")
+		if err != nil {
+			return fmt.Errorf("backup %s not found: %w", backupID, err)
+		}
+		if string(output) != "Completed" {
+			return fmt.Errorf("backup %s is not in Completed state: %s", backupID, string(output))
 		}
 	}

@@ -322,7 +440,7 @@ func (l *LonghornNativeStrategy) backupVolumeWithRetry(kubeconfigPath, appName,
 		snapshotName := strings.ToLower(fmt.Sprintf("%s-%s-snapshot-%s", appName, pvcName, timestamp))
 		if attempt > 0 {
 			snapshotName = strings.ToLower(fmt.Sprintf("%s-%s-snapshot-%s-retry%d", appName, pvcName, timestamp, attempt))
-			fmt.Printf("Retrying backup for volume %s (attempt %d/%d)...\n", volumeName, attempt+1, maxAttempts)
+			slog.Info("retrying backup for volume", "component", "longhorn", "volume", volumeName, "attempt", attempt+1, "maxAttempts", maxAttempts)
 			time.Sleep(10 * time.Second)
 		}

@@ -418,147 +536,117 @@ func (l *LonghornNativeStrategy) getVolumeNameFromPVC(kubeconfigPath, namespace,
 	return volumeName, nil
 }

-func (l *LonghornNativeStrategy) getLonghornAPIEndpoint(kubeconfigPath string) (string, error) {
-	checkCmd := exec.Command("curl", "-s", "-o", "/dev/null", "-w", "%{http_code}", "http://localhost:8080/v1/volumes")
-	if err := checkCmd.Run(); err == nil {
-		return "http://localhost:8080", nil
-	}
-
-	cmd := exec.Command("kubectl", "port-forward", "-n", "longhorn-system", "service/longhorn-frontend", "8080:80")
-	tools.WithKubeconfig(cmd, kubeconfigPath)
-
-	if err := cmd.Start(); err != nil {
-		return "", fmt.Errorf("failed to start port-forward: %w", err)
-	}
-
-	time.Sleep(3 * time.Second)
-
-	verifyCmd := exec.Command("curl", "-s", "-o", "/dev/null", "-w", "%{http_code}", "http://localhost:8080/v1/volumes")
-	if err := verifyCmd.Run(); err != nil {
-		return "", fmt.Errorf("port-forward not responding after setup: %w", err)
-	}
-
-	return "http://localhost:8080", nil
-}
-
 func (l *LonghornNativeStrategy) createSnapshot(kubeconfigPath, volumeName, snapshotName string) error {
-	apiURL, err := l.getLonghornAPIEndpoint(kubeconfigPath)
-	if err != nil {
-		return err
+	snapshotYAML := fmt.Sprintf(`apiVersion: longhorn.io/v1beta2
+kind: Snapshot
+metadata:
+  name: %s
+  namespace: longhorn-system
+spec:
+  volume: %s
+  createSnapshot: true
+`, snapshotName, volumeName)
+
+	cmd := exec.Command("kubectl", "apply", "-f", "-")
+	tools.WithKubeconfig(cmd, kubeconfigPath)
+	cmd.Stdin = strings.NewReader(snapshotYAML)
+
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to create snapshot: %w, output: %s", err, string(output))
 	}

-	url := fmt.Sprintf("%s/v1/volumes/%s?action=snapshotCreate", apiURL, volumeName)
-	payload := fmt.Sprintf(`{"name":"%s"}`, snapshotName)
+	// Wait for snapshot to be ready
+	for range 30 {
+		cmd := exec.Command("kubectl", "get", "snapshots.longhorn.io", snapshotName,
+			"-n", "longhorn-system", "-o", "jsonpath={.status.readyToUse}")
+		tools.WithKubeconfig(cmd, kubeconfigPath)

-	cmd := exec.Command("curl", "-X", "POST", url,
-		"-H", "Content-Type: application/json",
-		"-d", payload, "-s")
-
-	if err := cmd.Run(); err != nil {
-		return fmt.Errorf("failed to create snapshot: %w", err)
+		output, err := cmd.Output()
+		if err == nil && string(output) == "true" {
+			return nil
+		}
+		time.Sleep(2 * time.Second)
 	}

-	time.Sleep(2 * time.Second)
-	return nil
+	return fmt.Errorf("timeout waiting for snapshot %s to be ready", snapshotName)
 }

 func (l *LonghornNativeStrategy) createBackup(kubeconfigPath, volumeName, snapshotName string) (string, error) {
-	apiURL, err := l.getLonghornAPIEndpoint(kubeconfigPath)
-	if err != nil {
-		return "", err
+	// Backup name must be unique — derive from snapshot name
+	backupName := strings.ReplaceAll(snapshotName, "_", "-")
+	if len(backupName) > 63 {
+		backupName = backupName[:63]
 	}

-	url := fmt.Sprintf("%s/v1/volumes/%s?action=snapshotBackup", apiURL, volumeName)
-	payload := fmt.Sprintf(`{"name":"%s"}`, snapshotName)
+	backupYAML := fmt.Sprintf(`apiVersion: longhorn.io/v1beta2
+kind: Backup
+metadata:
+  name: %s
+  namespace: longhorn-system
+  labels:
+    backup-volume: %s
+spec:
+  snapshotName: %s
+`, backupName, volumeName, snapshotName)

-	cmd := exec.Command("curl", "-X", "POST", url,
-		"-H", "Content-Type: application/json",
-		"-d", payload, "-s")
+	cmd := exec.Command("kubectl", "apply", "-f", "-")
+	tools.WithKubeconfig(cmd, kubeconfigPath)
+	cmd.Stdin = strings.NewReader(backupYAML)

-	output, err := cmd.Output()
-	if err != nil {
-		return "", fmt.Errorf("failed to create backup: %w", err)
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return "", fmt.Errorf("failed to create backup: %w, output: %s", err, string(output))
 	}

-	var response map[string]any
-	if err := json.Unmarshal(output, &response); err != nil {
-		return "", fmt.Errorf("failed to parse backup response: %w", err)
-	}
-
-	if backupStatus, ok := response["backupStatus"].([]any); ok {
-		// Find the backup entry matching our snapshot
-		for _, bs := range backupStatus {
-			if status, ok := bs.(map[string]any); ok {
-				if snap, _ := status["snapshot"].(string); snap == snapshotName {
-					if id, ok := status["id"].(string); ok {
-						return id, nil
-					}
-				}
-			}
-		}
-		// Fallback: find any entry without an error (new backup in progress)
-		for _, bs := range backupStatus {
-			if status, ok := bs.(map[string]any); ok {
-				if errMsg, _ := status["error"].(string); errMsg == "" {
-					if id, ok := status["id"].(string); ok {
-						return id, nil
-					}
-				}
-			}
-		}
-	}
-
-	return "", fmt.Errorf("backup ID not found in response for snapshot %s", snapshotName)
+	return backupName, nil
 }

-func (l *LonghornNativeStrategy) waitForBackupComplete(kubeconfigPath, volumeName, backupID string) (string, error) {
-	apiURL, err := l.getLonghornAPIEndpoint(kubeconfigPath)
-	if err != nil {
-		return "", err
-	}
-
+func (l *LonghornNativeStrategy) waitForBackupComplete(kubeconfigPath, _, backupName string) (string, error) {
 	maxRetries := 120
-	for range maxRetries {
-		url := fmt.Sprintf("%s/v1/volumes/%s", apiURL, volumeName)
-		cmd := exec.Command("curl", "-s", url)
+	for i := range maxRetries {
+		// Get backup state
+		stateCmd := exec.Command("kubectl", "get", "backups.longhorn.io", backupName,
+			"-n", "longhorn-system", "-o", "jsonpath={.status.state}")
+		tools.WithKubeconfig(stateCmd, kubeconfigPath)

-		output, err := cmd.Output()
+		stateOutput, err := stateCmd.Output()
 		if err != nil {
 			time.Sleep(5 * time.Second)
 			continue
 		}

-		var volume map[string]any
-		if err := json.Unmarshal(output, &volume); err != nil {
-			time.Sleep(5 * time.Second)
-			continue
+		state := string(stateOutput)
+
+		if state == "Error" {
+			// Get error message
+			errCmd := exec.Command("kubectl", "get", "backups.longhorn.io", backupName,
+				"-n", "longhorn-system", "-o", "jsonpath={.status.messages}")
+			tools.WithKubeconfig(errCmd, kubeconfigPath)
+			errOutput, _ := errCmd.Output()
+			return "", fmt.Errorf("backup failed: %s", string(errOutput))
 		}

-		if backupStatus, ok := volume["backupStatus"].([]any); ok {
-			for _, status := range backupStatus {
-				if s, ok := status.(map[string]any); ok {
-					if id, _ := s["id"].(string); id == backupID {
-						if state, _ := s["state"].(string); state == "Completed" {
-							if backupURL, ok := s["backupURL"].(string); ok && backupURL != "" {
-								return backupURL, nil
-							}
-							return l.getBackupURL(volumeName, backupID)
-						}
-						if errorMsg, _ := s["error"].(string); errorMsg != "" {
-							return "", fmt.Errorf("backup failed: %s", errorMsg)
-						}
-					}
-				}
+		if state == "Completed" {
+			// Get backup URL
+			urlCmd := exec.Command("kubectl", "get", "backups.longhorn.io", backupName,
+				"-n", "longhorn-system", "-o", "jsonpath={.status.url}")
+			tools.WithKubeconfig(urlCmd, kubeconfigPath)
+
+			urlOutput, err := urlCmd.Output()
+			if err != nil {
+				return "", fmt.Errorf("backup completed but failed to get URL: %w", err)
+			}
+			backupURL := string(urlOutput)
+			if backupURL != "" {
+				return backupURL, nil
 			}
 		}

+		if i%12 == 0 && i > 0 {
+			slog.Info("waiting for backup to complete", "component", "longhorn", "backup", backupName, "state", state, "attempt", i)
+		}
 		time.Sleep(5 * time.Second)
 	}
-	return "", fmt.Errorf("timeout waiting for backup to complete")
-}
-
-func (l *LonghornNativeStrategy) getBackupURL(volumeName, backupID string) (string, error) {
-	return fmt.Sprintf("backup://%s/%s", volumeName, backupID), nil
+	return "", fmt.Errorf("timeout waiting for backup %s to complete", backupName)
 }

 func (l *LonghornNativeStrategy) createPVForVolume(kubeconfigPath, volumeName, size, accessMode, namespace, pvcName string) error {
@@ -592,9 +680,7 @@ spec:
 	return nil
 }

-func (l *LonghornNativeStrategy) createVolumeFromBackup(kubeconfigPath, apiURL, volumeName, backupURL, size string) error {
-	url := fmt.Sprintf("%s/v1/volumes", apiURL)
-
+func (l *LonghornNativeStrategy) createVolumeFromBackup(kubeconfigPath, volumeName, backupURL, size string) error {
 	sizeBytes := "1073741824"
 	if strings.HasSuffix(size, "Gi") {
 		var sizeInt int
@@ -603,64 +689,191 @@ func (l *LonghornNativeStrategy) createVolumeFromBackup(kubeconfigPath, apiURL,
 		}
 	}

-	payload := fmt.Sprintf(`{
-		"name": "%s",
-		"size": "%s",
-		"fromBackup": "%s",
-		"numberOfReplicas": 3
-	}`, volumeName, sizeBytes, backupURL)
+	volumeYAML := fmt.Sprintf(`apiVersion: longhorn.io/v1beta2
+kind: Volume
+metadata:
+  name: %s
+  namespace: longhorn-system
+spec:
+  size: "%s"
+  fromBackup: "%s"
+  numberOfReplicas: 3
+  frontend: blockdev
+  accessMode: rwo
+`, volumeName, sizeBytes, backupURL)

-	cmd := exec.Command("curl", "-X", "POST", url,
-		"-H", "Content-Type: application/json",
-		"-d", payload, "-s")
+	cmd := exec.Command("kubectl", "apply", "-f", "-")
+	tools.WithKubeconfig(cmd, kubeconfigPath)
+	cmd.Stdin = strings.NewReader(volumeYAML)

-	var stdout, stderr bytes.Buffer
-	cmd.Stdout = &stdout
-	cmd.Stderr = &stderr
-
-	if err := cmd.Run(); err != nil {
-		return fmt.Errorf("failed to create volume from backup: %w, stderr: %s, stdout: %s", err, stderr.String(), stdout.String())
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to create volume from backup: %w, output: %s", err, string(output))
 	}

-	return l.waitForVolume(kubeconfigPath, apiURL, volumeName)
+	return l.waitForVolume(kubeconfigPath, volumeName)
 }

-func (l *LonghornNativeStrategy) waitForVolume(_, apiURL, volumeName string) error {
+func (l *LonghornNativeStrategy) waitForVolume(kubeconfigPath, volumeName string) error {
 	maxRetries := 60
 	for i := range maxRetries {
-		url := fmt.Sprintf("%s/v1/volumes/%s", apiURL, volumeName)
-		cmd := exec.Command("curl", "-s", url)
+		cmd := exec.Command("kubectl", "get", "volumes.longhorn.io", volumeName,
+			"-n", "longhorn-system", "-o", "jsonpath={.status.state},{.status.restoreInitiated},{.status.robustness}")
+		tools.WithKubeconfig(cmd, kubeconfigPath)

 		output, err := cmd.Output()
 		if err == nil {
-			var volume map[string]any
-			if err := json.Unmarshal(output, &volume); err == nil {
-				if state, _ := volume["state"].(string); state == "detached" || state == "attached" {
-					if restoreStatus, ok := volume["restoreStatus"].([]any); ok && len(restoreStatus) > 0 {
-						for _, rs := range restoreStatus {
-							if status, ok := rs.(map[string]any); ok {
-								if isRestored, _ := status["isRestored"].(bool); isRestored {
-									return nil
-								}
-							}
-						}
-					} else {
-						if robustness, _ := volume["robustness"].(string); robustness == "healthy" || robustness == "unknown" {
-							return nil
-						}
+			parts := strings.Split(string(output), ",")
+			if len(parts) == 3 {
+				state := parts[0]
+				restoreInitiated := parts[1]
+				robustness := parts[2]
+
+				if state == "detached" || state == "attached" {
+					if restoreInitiated == "true" {
+						return nil
+					}
+					if robustness == "healthy" || robustness == "unknown" {
+						return nil
 					}
 				}
 			}
 		}

 		if i%12 == 0 {
-			fmt.Printf("Waiting for volume %s to be ready... (%d/%d)\n", volumeName, i, maxRetries)
+			slog.Info("waiting for volume to be ready", "component", "longhorn", "volume", volumeName, "attempt", i, "maxRetries", maxRetries)
 		}
 		time.Sleep(5 * time.Second)
 	}
-	return fmt.Errorf("timeout waiting for volume to be ready")
+	return fmt.Errorf("timeout waiting for volume %s to be ready", volumeName)
 }

 func (l *LonghornNativeStrategy) cleanupOldBackups(_, _, _ string) error {
 	return nil
 }
+
+// bounceNamespace scales a namespace down then back up (used for stateless in-place restores like memcached).
+func (l *LonghornNativeStrategy) bounceNamespace(kubeconfigPath, namespace string) error {
+	if err := l.scaleNamespace(kubeconfigPath, namespace, 0); err != nil {
+		return fmt.Errorf("failed to scale down namespace %s: %w", namespace, err)
+	}
+	if err := l.waitForPodsGone(kubeconfigPath, namespace); err != nil {
+		return fmt.Errorf("timed out waiting for pods to terminate in %s: %w", namespace, err)
+	}
+	return l.scaleNamespace(kubeconfigPath, namespace, 1)
+}
+
+// scaleNamespace sets replicas on all Deployments and StatefulSets in a namespace.
+func (l *LonghornNativeStrategy) scaleNamespace(kubeconfigPath, namespace string, replicas int) error {
+	for _, kind := range []string{"deployment", "statefulset"} {
+		cmd := exec.Command("kubectl", "scale", kind, "--all", "-n", namespace,
+			fmt.Sprintf("--replicas=%d", replicas))
+		tools.WithKubeconfig(cmd, kubeconfigPath)
+		if output, err := cmd.CombinedOutput(); err != nil {
+			if !strings.Contains(string(output), "no resources found") {
+				return fmt.Errorf("failed to scale %s in %s: %w, output: %s", kind, namespace, err, output)
+			}
+		}
+	}
+	return nil
+}
+
+// waitForPodsGone waits until all pods in a namespace have terminated.
+func (l *LonghornNativeStrategy) waitForPodsGone(kubeconfigPath, namespace string) error {
+	cmd := exec.Command("kubectl", "wait", "--for=delete", "pod", "--all",
+		"-n", namespace, "--timeout=120s")
+	tools.WithKubeconfig(cmd, kubeconfigPath)
+	if output, err := cmd.CombinedOutput(); err != nil {
+		if strings.Contains(string(output), "no matching resources found") {
+			return nil
+		}
+		return fmt.Errorf("waiting for pods to terminate in %s: %w, output: %s", namespace, err, output)
+	}
+	return nil
+}
+
+// getPVInfo returns the PV name and Longhorn volume handle for a PVC.
+func (l *LonghornNativeStrategy) getPVInfo(kubeconfigPath, namespace, pvcName string) (string, string, error) {
+	pvNameCmd := exec.Command("kubectl", "get", "pvc", pvcName, "-n", namespace,
+		"-o", "jsonpath={.spec.volumeName}")
+	tools.WithKubeconfig(pvNameCmd, kubeconfigPath)
+	pvNameOutput, err := pvNameCmd.Output()
+	if err != nil {
+		return "", "", fmt.Errorf("failed to get PV name for PVC %s: %w", pvcName, err)
+	}
+	pvName := strings.TrimSpace(string(pvNameOutput))
+	if pvName == "" {
+		return "", "", nil
+	}
+
+	handleCmd := exec.Command("kubectl", "get", "pv", pvName,
+		"-o", "jsonpath={.spec.csi.volumeHandle}")
+	tools.WithKubeconfig(handleCmd, kubeconfigPath)
+	handleOutput, err := handleCmd.Output()
+	if err != nil {
+		return pvName, "", fmt.Errorf("failed to get volume handle for PV %s: %w", pvName, err)
+	}
+
+	return pvName, strings.TrimSpace(string(handleOutput)), nil
+}
+
+// deletePVIfExists deletes a PersistentVolume if it exists (handles Retain reclaim policy cleanup).
+func (l *LonghornNativeStrategy) deletePVIfExists(kubeconfigPath, pvName string) error {
+	cmd := exec.Command("kubectl", "delete", "pv", pvName, "--ignore-not-found", "--timeout=60s")
+	tools.WithKubeconfig(cmd, kubeconfigPath)
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to delete PV %s: %w, output: %s", pvName, err, output)
+	}
+	return nil
+}
+
+// deleteLonghornVolumeIfExists deletes a Longhorn Volume CR if it exists.
+func (l *LonghornNativeStrategy) deleteLonghornVolumeIfExists(kubeconfigPath, volumeName string) error {
+	cmd := exec.Command("kubectl", "delete", "volumes.longhorn.io", volumeName,
+		"-n", "longhorn-system", "--ignore-not-found", "--timeout=60s")
+	tools.WithKubeconfig(cmd, kubeconfigPath)
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to delete Longhorn volume %s: %w, output: %s", volumeName, err, output)
+	}
+	return nil
+}
+
+// createPVC creates a PVC pre-bound to a specific PV by name.
+func (l *LonghornNativeStrategy) createPVC(kubeconfigPath, pvcName, namespace, pvName, size, accessMode string) error {
+	pvcYAML := fmt.Sprintf(`apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: %s
+  namespace: %s
+spec:
+  accessModes:
+    - %s
+  resources:
+    requests:
+      storage: %s
+  storageClassName: longhorn
+  volumeName: %s
+`, pvcName, namespace, accessMode, size, pvName)
+
+	cmd := exec.Command("kubectl", "apply", "-f", "-")
+	tools.WithKubeconfig(cmd, kubeconfigPath)
+	cmd.Stdin = strings.NewReader(pvcYAML)
+
+	if output, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to create PVC %s: %w, output: %s", pvcName, err, output)
+	}
+	return nil
+}
+
+// waitForPVCBound polls until the PVC reaches Bound status.
+func (l *LonghornNativeStrategy) waitForPVCBound(kubeconfigPath, namespace, pvcName string) error {
+	for range 60 {
+		cmd := exec.Command("kubectl", "get", "pvc", pvcName, "-n", namespace,
+			"-o", "jsonpath={.status.phase}")
+		tools.WithKubeconfig(cmd, kubeconfigPath)
+		if output, err := cmd.Output(); err == nil && strings.TrimSpace(string(output)) == "Bound" {
+			return nil
+		}
+		time.Sleep(2 * time.Second)
+	}
+	return fmt.Errorf("timeout waiting for PVC %s in %s to be Bound", pvcName, namespace)
+}
--- a/api/internal/backup/strategies/mysql.go
+++ b/api/internal/backup/strategies/mysql.go
@@ -94,7 +94,7 @@ func (m *MySQLStrategy) Backup(plan *btypes.RecoveryPlan, dest btypes.BackupDest

 	size, err := dest.Put(key, reader)
 	if err != nil {
-		cmd.Process.Kill()
+		_ = cmd.Process.Kill()
 		return fmt.Errorf("failed to upload backup: %w", err)
 	}

--- a/api/internal/backup/strategies/postgres.go
+++ b/api/internal/backup/strategies/postgres.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"fmt"
 	"io"
+	"log/slog"
 	"os"
 	"os/exec"
 	"strings"
@@ -90,7 +91,7 @@ func (p *PostgreSQLStrategy) Backup(plan *btypes.RecoveryPlan, dest btypes.Backu

 	size, err := dest.Put(key, reader)
 	if err != nil {
-		cmd.Process.Kill()
+		_ = cmd.Process.Kill()
 		return fmt.Errorf("failed to upload backup: %w", err)
 	}

@@ -101,7 +102,7 @@ func (p *PostgreSQLStrategy) Backup(plan *btypes.RecoveryPlan, dest btypes.Backu
 	// Also backup globals (users, roles, etc)
 	globalsKey := fmt.Sprintf("postgres/%s/%s/%s-globals.sql", plan.Instance, plan.App, plan.Timestamp)
 	if err := p.backupGlobals(kubeconfigPath, dest, globalsKey); err != nil {
-		fmt.Printf("Warning: failed to backup PostgreSQL globals: %v\n", err)
+		slog.Error("postgres globals backup failed", "component", "postgres", "error", err)
 		globalsKey = ""
 	}

@@ -165,7 +166,7 @@ func (p *PostgreSQLStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.Back
 		fmt.Sprintf("DROP DATABASE IF EXISTS %s", standbyDbName))
 	tools.WithKubeconfig(dropCmd, kubeconfigPath)
 	if output, err := dropCmd.CombinedOutput(); err != nil {
-		fmt.Printf("Warning: failed to drop database %s: %v, output: %s\n", standbyDbName, err, output)
+		slog.Error("failed to drop database", "component", "postgres", "database", standbyDbName, "error", err, "output", string(output))
 	}

 	// Create standby database
@@ -184,7 +185,7 @@ func (p *PostgreSQLStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.Back
 			fmt.Sprintf("GRANT ALL PRIVILEGES ON DATABASE %s TO %s", standbyDbName, dbUser))
 		tools.WithKubeconfig(grantCmd, kubeconfigPath)
 		if output, err := grantCmd.CombinedOutput(); err != nil {
-			fmt.Printf("Warning: failed to grant privileges: %v, output: %s\n", err, output)
+			slog.Error("failed to grant privileges", "component", "postgres", "error", err, "output", string(output))
 		}
 	}

@@ -232,7 +233,7 @@ ALTER SCHEMA public OWNER TO %s;`, dbUser, dbUser, dbUser, dbUser)
 			"psql", "-U", "postgres", "-d", standbyDbName, "-c", ownershipSQL)
 		tools.WithKubeconfig(ownerCmd, kubeconfigPath)
 		if output, err := ownerCmd.CombinedOutput(); err != nil {
-			fmt.Printf("Warning: failed to transfer ownership: %v, output: %s\n", err, output)
+			slog.Error("failed to transfer ownership", "component", "postgres", "error", err, "output", string(output))
 		}
 	}

@@ -289,7 +290,7 @@ func (p *PostgreSQLStrategy) Cleanup(plan *btypes.RecoveryPlan) error {
 		"psql", "-U", "postgres", "-d", "postgres", "-c",
 		fmt.Sprintf("SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '%s' AND pid <> pg_backend_pid()", previousDbName))
 	tools.WithKubeconfig(terminateCmd, kubeconfigPath)
-	terminateCmd.CombinedOutput() // best effort
+	_, _ = terminateCmd.CombinedOutput() // best effort

 	// Drop the old database
 	dropCmd := exec.Command("kubectl", "exec", "-n", "postgres", podName, "--",
@@ -363,7 +364,7 @@ func (p *PostgreSQLStrategy) backupGlobals(kubeconfigPath string, dest btypes.Ba
 	}()

 	if _, err := dest.Put(key, reader); err != nil {
-		cmd.Process.Kill()
+		_ = cmd.Process.Kill()
 		return err
 	}

@@ -388,6 +389,11 @@ func (p *PostgreSQLStrategy) getDatabaseName(instanceName, appName string) strin
 			if dbName, ok := appConfig["dbName"].(string); ok && dbName != "" {
 				return dbName
 			}
+			if db, ok := appConfig["db"].(map[string]interface{}); ok {
+				if dbName, ok := db["name"].(string); ok && dbName != "" {
+					return dbName
+				}
+			}
 		}
 	}

@@ -415,6 +421,11 @@ func (p *PostgreSQLStrategy) getAppUser(instanceName, appName string) string {
 			if dbUsername, ok := appConfig["dbUsername"].(string); ok && dbUsername != "" {
 				return dbUsername
 			}
+			if db, ok := appConfig["db"].(map[string]interface{}); ok {
+				if dbUser, ok := db["user"].(string); ok && dbUser != "" {
+					return dbUser
+				}
+			}
 		}
 	}

--- a/api/internal/backup/strategies/postgres_test.go
+++ b/api/internal/backup/strategies/postgres_test.go
@@ -3,6 +3,7 @@ package strategies
 import (
 	"bytes"
 	"io"
+	"os"
 	"strings"
 	"testing"
 	"time"
@@ -163,11 +164,151 @@ func TestPostgreSQLStrategy_Verify(t *testing.T) {
 	}
 }

-func TestPostgreSQLStrategy_GetDatabaseInfo(t *testing.T) {
-	s := &PostgreSQLStrategy{
-		dataDir: "/test/data",
+func TestPostgreSQLStrategy_GetDatabaseName(t *testing.T) {
+	tests := []struct {
+		name     string
+		config   string
+		appName  string
+		expected string
+	}{
+		{
+			name: "flat dbName key",
+			config: `apps:
+  myapp:
+    dbName: my_database
+`,
+			appName:  "myapp",
+			expected: "my_database",
+		},
+		{
+			name: "nested db.name key",
+			config: `apps:
+  e2e-test-app:
+    namespace: e2e-test-app
+    db:
+      host: postgres
+      name: e2e_test_app
+      user: e2e_test_app
+`,
+			appName:  "e2e-test-app",
+			expected: "e2e_test_app",
+		},
+		{
+			name: "flat key takes precedence over nested",
+			config: `apps:
+  myapp:
+    dbName: flat_name
+    db:
+      name: nested_name
+`,
+			appName:  "myapp",
+			expected: "flat_name",
+		},
+		{
+			name: "no config falls back to appName",
+			config: `apps:
+  myapp:
+    namespace: myapp
+`,
+			appName:  "myapp",
+			expected: "myapp",
+		},
+		{
+			name:     "missing app falls back to appName",
+			config:   `apps: {}`,
+			appName:  "missing-app",
+			expected: "missing-app",
+		},
 	}

-	assert.NotNil(t, s)
-	assert.Equal(t, "/test/data", s.dataDir)
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			tmpDir := t.TempDir()
+			instanceDir := tmpDir + "/instances/test-instance"
+			err := os.MkdirAll(instanceDir, 0755)
+			assert.NoError(t, err)
+			err = os.WriteFile(instanceDir+"/config.yaml", []byte(tt.config), 0644)
+			assert.NoError(t, err)
+
+			s := &PostgreSQLStrategy{dataDir: tmpDir}
+			result := s.getDatabaseName("test-instance", tt.appName)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
+}
+
+func TestPostgreSQLStrategy_GetAppUser(t *testing.T) {
+	tests := []struct {
+		name     string
+		config   string
+		appName  string
+		expected string
+	}{
+		{
+			name: "flat dbUser key",
+			config: `apps:
+  myapp:
+    dbUser: my_user
+`,
+			appName:  "myapp",
+			expected: "my_user",
+		},
+		{
+			name: "flat dbUsername key",
+			config: `apps:
+  myapp:
+    dbUsername: my_username
+`,
+			appName:  "myapp",
+			expected: "my_username",
+		},
+		{
+			name: "nested db.user key",
+			config: `apps:
+  e2e-test-app:
+    namespace: e2e-test-app
+    db:
+      host: postgres
+      name: e2e_test_app
+      user: e2e_test_app
+`,
+			appName:  "e2e-test-app",
+			expected: "e2e_test_app",
+		},
+		{
+			name: "flat key takes precedence over nested",
+			config: `apps:
+  myapp:
+    dbUser: flat_user
+    db:
+      user: nested_user
+`,
+			appName:  "myapp",
+			expected: "flat_user",
+		},
+		{
+			name: "no user config falls back to appName",
+			config: `apps:
+  myapp:
+    namespace: myapp
+`,
+			appName:  "myapp",
+			expected: "myapp",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			tmpDir := t.TempDir()
+			instanceDir := tmpDir + "/instances/test-instance"
+			err := os.MkdirAll(instanceDir, 0755)
+			assert.NoError(t, err)
+			err = os.WriteFile(instanceDir+"/config.yaml", []byte(tt.config), 0644)
+			assert.NoError(t, err)
+
+			s := &PostgreSQLStrategy{dataDir: tmpDir}
+			result := s.getAppUser("test-instance", tt.appName)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
 }
--- a/api/internal/backup/types/types.go
+++ b/api/internal/backup/types/types.go
@@ -33,8 +33,10 @@ type RecoveryPlan struct {
 	App          string               `yaml:"app"          json:"app"`
 	Instance     string               `yaml:"instance"     json:"instance"`
 	Timestamp    string               `yaml:"timestamp"    json:"timestamp"`
+	Version      string               `yaml:"version"      json:"version,omitempty"`
 	Status       string               `yaml:"status"       json:"status"` // backing_up, backed_up, restoring, restored, switching, switched, cleaning_up, cleaned_up, failed
 	Error        string               `yaml:"error"        json:"error,omitempty"`
+	RestoreMode  string               `yaml:"restoreMode,omitempty" json:"restoreMode,omitempty"` // "in-place" or "" (default: standby blue/green)
 	Source       RecoverySource       `yaml:"source"       json:"source"`
 	StandbyColor string               `yaml:"standbyColor" json:"standbyColor"`
 	Standby      RecoveryStandby      `yaml:"standby"      json:"standby,omitempty"`
@@ -141,8 +143,8 @@ type BackupInfo struct {

 // ComponentBackup represents a single backup component (legacy, kept for compatibility)
 type ComponentBackup struct {
-	Type     string                 `json:"type"`     // "postgres", "mysql", "pvc", "config"
-	Name     string                 `json:"name"`     // Component identifier
+	Type     string                 `json:"type"` // "postgres", "mysql", "pvc", "config"
+	Name     string                 `json:"name"` // Component identifier
 	Size     int64                  `json:"size"`
 	Location string                 `json:"location"` // Path in destination
 	Metadata map[string]interface{} `json:"metadata"`
@@ -175,28 +177,28 @@ type ProgressCallback func(progress int, message string)

 // BackupConfiguration represents instance-level backup configuration
 type BackupConfiguration struct {
-	Destination  DestinationConfig  `yaml:"destination"`
-	Retention    RetentionPolicy    `yaml:"retention"`
-	Schedules    []BackupSchedule   `yaml:"schedules,omitempty"`
-	Verification VerificationConfig `yaml:"verification"`
+	Destination  DestinationConfig  `yaml:"destination"          json:"destination"`
+	Retention    RetentionPolicy    `yaml:"retention"            json:"retention"`
+	Schedules    []BackupSchedule   `yaml:"schedules,omitempty"  json:"schedules,omitempty"`
+	Verification VerificationConfig `yaml:"verification"         json:"verification"`
 }

 // BackupSchedule defines a per-app or cluster backup schedule
 type BackupSchedule struct {
-	ID         string           `yaml:"id"         json:"id"`
-	Name       string           `yaml:"name"       json:"name"`
-	TargetType string           `yaml:"targetType" json:"target_type"` // "app" or "cluster"
-	TargetName string           `yaml:"targetName" json:"target_name"`
-	Frequency  string           `yaml:"frequency"  json:"frequency"` // "daily", "weekly", "monthly"
-	Time       string           `yaml:"time"       json:"time"`      // "HH:MM" local time
-	DayOfWeek  int              `yaml:"dayOfWeek"  json:"day_of_week,omitempty"` // 0=Sun..6=Sat (weekly)
-	DayOfMonth int              `yaml:"dayOfMonth" json:"day_of_month,omitempty"` // 1-28 (monthly)
+	ID         string             `yaml:"id"         json:"id"`
+	Name       string             `yaml:"name"       json:"name"`
+	TargetType string             `yaml:"targetType" json:"target_type"` // "app" or "cluster"
+	TargetName string             `yaml:"targetName" json:"target_name"`
+	Frequency  string             `yaml:"frequency"  json:"frequency"`              // "daily", "weekly", "monthly"
+	Time       string             `yaml:"time"       json:"time"`                   // "HH:MM" local time
+	DayOfWeek  int                `yaml:"dayOfWeek"  json:"day_of_week,omitempty"`  // 0=Sun..6=Sat (weekly)
+	DayOfMonth int                `yaml:"dayOfMonth" json:"day_of_month,omitempty"` // 1-28 (monthly)
 	Retention  *ScheduleRetention `yaml:"retention,omitempty" json:"retention,omitempty"`
-	Enabled    bool             `yaml:"enabled"    json:"enabled"`
-	LastRun    *time.Time       `yaml:"lastRun,omitempty"  json:"last_run,omitempty"`
-	NextRun    *time.Time       `yaml:"nextRun,omitempty"  json:"next_run,omitempty"`
-	CreatedAt  time.Time        `yaml:"createdAt"  json:"created_at"`
-	UpdatedAt  time.Time        `yaml:"updatedAt"  json:"updated_at"`
+	Enabled    bool               `yaml:"enabled"    json:"enabled"`
+	LastRun    *time.Time         `yaml:"lastRun,omitempty"  json:"last_run,omitempty"`
+	NextRun    *time.Time         `yaml:"nextRun,omitempty"  json:"next_run,omitempty"`
+	CreatedAt  time.Time          `yaml:"createdAt"  json:"created_at"`
+	UpdatedAt  time.Time          `yaml:"updatedAt"  json:"updated_at"`
 }

 // ScheduleRetention overrides the instance-level retention for a specific schedule
@@ -207,53 +209,53 @@ type ScheduleRetention struct {

 // DestinationConfig configures where backups are stored
 type DestinationConfig struct {
-	Type  string       `yaml:"type"` // "s3", "azure", "nfs", "local"
-	S3    *S3Config    `yaml:"s3,omitempty"`
-	Azure *AzureConfig `yaml:"azure,omitempty"`
-	NFS   *NFSConfig   `yaml:"nfs,omitempty"`
-	Local *LocalConfig `yaml:"local,omitempty"`
+	Type  string       `yaml:"type"  json:"type"` // "s3", "azure", "nfs", "local"
+	S3    *S3Config    `yaml:"s3,omitempty"    json:"s3,omitempty"`
+	Azure *AzureConfig `yaml:"azure,omitempty" json:"azure,omitempty"`
+	NFS   *NFSConfig   `yaml:"nfs,omitempty"   json:"nfs,omitempty"`
+	Local *LocalConfig `yaml:"local,omitempty" json:"local,omitempty"`
 }

 // S3Config configures S3 backup destination
 type S3Config struct {
-	Bucket         string `yaml:"bucket"`
-	Region         string `yaml:"region"`
-	Endpoint       string `yaml:"endpoint,omitempty"` // For S3-compatible services
-	AccessKeyID    string `yaml:"-"`                  // Loaded from secrets.yaml
-	SecretAccessKey string `yaml:"-"`                 // Loaded from secrets.yaml
+	Bucket          string `yaml:"bucket"             json:"bucket"`
+	Region          string `yaml:"region"             json:"region"`
+	Endpoint        string `yaml:"endpoint,omitempty" json:"endpoint,omitempty"` // For S3-compatible services
+	AccessKeyID     string `yaml:"-"                  json:"-"`                  // Loaded from secrets.yaml
+	SecretAccessKey string `yaml:"-"                 json:"-"`                   // Loaded from secrets.yaml
 }

 // AzureConfig configures Azure Blob Storage destination
 type AzureConfig struct {
-	Container      string `yaml:"container"`
-	StorageAccount string `yaml:"storageAccount"`
-	AccessKey      string `yaml:"-"` // Loaded from secrets.yaml
+	Container      string `yaml:"container"       json:"container"`
+	StorageAccount string `yaml:"storageAccount"  json:"storageAccount"`
+	AccessKey      string `yaml:"-"               json:"-"` // Loaded from secrets.yaml
 }

 // NFSConfig configures NFS backup destination
 type NFSConfig struct {
-	Server       string `yaml:"server"`
-	Path         string `yaml:"path"`
-	MountPoint   string `yaml:"mountPoint,omitempty"`
-	MountOptions string `yaml:"mountOptions,omitempty"`
+	Server       string `yaml:"server"               json:"server"`
+	Path         string `yaml:"path"                 json:"path"`
+	MountPoint   string `yaml:"mountPoint,omitempty"  json:"mountPoint,omitempty"`
+	MountOptions string `yaml:"mountOptions,omitempty" json:"mountOptions,omitempty"`
 }

 // LocalConfig configures local filesystem backup destination
 type LocalConfig struct {
-	Path string `yaml:"path"`
+	Path string `yaml:"path" json:"path"`
 }

 // RetentionPolicy defines how long to keep backups
 type RetentionPolicy struct {
-	Daily   int `yaml:"daily"`
-	Weekly  int `yaml:"weekly"`
-	Monthly int `yaml:"monthly"`
-	Yearly  int `yaml:"yearly"`
+	Daily   int `yaml:"daily"   json:"daily"`
+	Weekly  int `yaml:"weekly"  json:"weekly"`
+	Monthly int `yaml:"monthly" json:"monthly"`
+	Yearly  int `yaml:"yearly"  json:"yearly"`
 }

 // VerificationConfig configures backup verification
 type VerificationConfig struct {
-	Enabled      bool   `yaml:"enabled"`
-	Schedule     string `yaml:"schedule"`     // Cron expression
-	RandomSample bool   `yaml:"randomSample"` // Test random backup each time
+	Enabled      bool   `yaml:"enabled"      json:"enabled"`
+	Schedule     string `yaml:"schedule"     json:"schedule"`     // Cron expression
+	RandomSample bool   `yaml:"randomSample" json:"randomSample"` // Test random backup each time
 }
--- a/api/internal/cluster/cluster.go
+++ b/api/internal/cluster/cluster.go
@@ -4,7 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
-	"log"
+	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -80,14 +80,16 @@ func (m *Manager) GenerateConfig(instanceName string, config *ClusterConfig) err
 		return nil
 	}

+	slog.Info("generating cluster config", "component", "cluster", "instance", instanceName, "cluster", config.ClusterName, "vip", config.VIP)
+
 	// Ensure generated directory exists
 	if err := storage.EnsureDir(generatedDir, 0755); err != nil {
 		return fmt.Errorf("failed to create generated directory: %w", err)
 	}

 	// Generate secrets
-	cmd := exec.Command("talosctl", "gen", "secrets")
-	cmd.Dir = generatedDir
+	secretsPath := filepath.Join(generatedDir, "secrets.yaml")
+	cmd := exec.Command("talosctl", "gen", "secrets", "--output-file", secretsPath)
 	output, err := cmd.CombinedOutput()
 	if err != nil {
 		return fmt.Errorf("failed to generate secrets: %w\nOutput: %s", err, string(output))
@@ -96,11 +98,11 @@ func (m *Manager) GenerateConfig(instanceName string, config *ClusterConfig) err
 	// Generate config with secrets
 	endpoint := fmt.Sprintf("https://%s:6443", config.VIP)
 	cmd = exec.Command("talosctl", "gen", "config",
-		"--with-secrets", "secrets.yaml",
+		"--with-secrets", secretsPath,
+		"--output-dir", generatedDir,
 		config.ClusterName,
 		endpoint,
 	)
-	cmd.Dir = generatedDir
 	output, err = cmd.CombinedOutput()
 	if err != nil {
 		return fmt.Errorf("failed to generate config: %w\nOutput: %s", err, string(output))
@@ -117,9 +119,12 @@ func (m *Manager) Bootstrap(instanceName, nodeName string) (string, error) {
 		return "", fmt.Errorf("failed to start bootstrap operation: %w", err)
 	}

+	slog.Info("starting cluster bootstrap", "component", "cluster", "instance", instanceName, "node", nodeName, "operationId", opID)
+
 	// Run bootstrap asynchronously
 	go func() {
 		if err := m.runBootstrapWithTracking(instanceName, nodeName, opID); err != nil {
+			slog.Error("cluster bootstrap failed", "component", "cluster", "instance", instanceName, "node", nodeName, "error", err)
 			_ = m.opsMgr.Update(instanceName, opID, "failed", err.Error(), 0)
 		}
 	}()
@@ -191,6 +196,7 @@ func (m *Manager) runBootstrapWithTracking(instanceName, nodeName, opID string)
 	}

 	// Mark as completed
+	slog.Info("cluster bootstrap completed", "component", "cluster", "instance", instanceName)
 	_ = m.opsMgr.Update(instanceName, opID, "completed", "Bootstrap completed successfully", 100)
 	return nil
 }
@@ -385,7 +391,7 @@ func (m *Manager) retrieveKubeconfigFromCluster(instanceName, nodeIP string, tim
 		tools.WithTalosconfig(cmdKubeconfig, talosconfigPath)

 		if output, err := cmdKubeconfig.CombinedOutput(); err == nil {
-			log.Printf("Successfully retrieved kubeconfig for instance %s", instanceName)
+			slog.Info("kubeconfig retrieved", "component", "cluster", "instance", instanceName)
 			return nil
 		} else {
 			// Check if we've exceeded deadline
@@ -424,13 +430,15 @@ func (m *Manager) RegenerateKubeconfig(instanceName string) error {
 		return fmt.Errorf("control plane VIP not configured in cluster.nodes.control.vip")
 	}

-	log.Printf("Regenerating kubeconfig for instance %s from cluster VIP %s", instanceName, vip)
+	slog.Info("regenerating kubeconfig", "component", "cluster", "instance", instanceName, "vip", vip)
 	// Use shorter timeout for manual regeneration (cluster should already be running)
 	return m.retrieveKubeconfigFromCluster(instanceName, vip, 30*time.Second)
 }

 // ConfigureEndpoints updates talosconfig to use VIP and retrieves kubeconfig
 func (m *Manager) ConfigureEndpoints(instanceName string, includeNodes bool) error {
+	slog.Info("configuring cluster endpoints", "component", "cluster", "instance", instanceName, "includeNodes", includeNodes)
+
 	configPath := tools.GetInstanceConfigPath(m.dataDir, instanceName)
 	talosconfigPath := tools.GetTalosconfigPath(m.dataDir, instanceName)

@@ -650,7 +658,7 @@ func (m *Manager) GetKubeconfig(instanceName string) (string, error) {

 // GetTalosconfig returns the talosconfig for the cluster
 func (m *Manager) GetTalosconfig(instanceName string) (string, error) {
-	talosconfigPath := filepath.Join(m.GetGeneratedDir(instanceName), "talosconfig")
+	talosconfigPath := tools.GetTalosconfigPath(m.dataDir, instanceName)

 	if !storage.FileExists(talosconfigPath) {
 		return "", fmt.Errorf("talosconfig not found - cluster may not be initialized")
@@ -709,6 +717,8 @@ func (m *Manager) Reset(instanceName string, confirm bool) error {
 		return fmt.Errorf("reset requires confirmation")
 	}

+	slog.Info("resetting cluster", "component", "cluster", "instance", instanceName)
+
 	// This is a destructive operation
 	// Real implementation would:
 	// 1. Reset all nodes via talosctl reset
@@ -725,38 +735,3 @@ func (m *Manager) Reset(instanceName string, confirm bool) error {
 	return nil
 }

-// ConfigureContext configures talosctl context for the cluster
-func (m *Manager) ConfigureContext(instanceName, clusterName string) error {
-	talosconfigPath := filepath.Join(m.GetGeneratedDir(instanceName), "talosconfig")
-
-	if !storage.FileExists(talosconfigPath) {
-		return fmt.Errorf("talosconfig not found")
-	}
-
-	// Merge talosconfig into user's talosctl config
-	cmd := exec.Command("talosctl", "config", "merge", talosconfigPath)
-	output, err := cmd.CombinedOutput()
-	if err != nil {
-		return fmt.Errorf("failed to merge talosconfig: %w\nOutput: %s", err, string(output))
-	}
-
-	// Set context
-	cmd = exec.Command("talosctl", "config", "context", clusterName)
-	output, err = cmd.CombinedOutput()
-	if err != nil {
-		return fmt.Errorf("failed to set context: %w\nOutput: %s", err, string(output))
-	}
-
-	return nil
-}
-
-// HasContext checks if talosctl context exists
-func (m *Manager) HasContext(clusterName string) (bool, error) {
-	cmd := exec.Command("talosctl", "config", "contexts")
-	output, err := cmd.CombinedOutput()
-	if err != nil {
-		return false, fmt.Errorf("failed to list contexts: %w", err)
-	}
-
-	return strings.Contains(string(output), clusterName), nil
-}
--- a/api/internal/config/config.go
+++ b/api/internal/config/config.go
@@ -2,6 +2,7 @@ package config

 import (
 	"fmt"
+	"log/slog"
 	"os"
 	"path/filepath"

@@ -144,3 +145,57 @@ func SaveCloudConfig(config *InstanceConfig, configPath string) error {

 	return os.WriteFile(configPath, data, 0644)
 }
+
+// DeepMerge recursively merges src into dst, with src values taking precedence.
+// Nested maps are merged recursively; all other types are overwritten by src.
+func DeepMerge(dst, src map[string]interface{}) map[string]interface{} {
+	result := make(map[string]interface{})
+	for k, v := range dst {
+		result[k] = v
+	}
+	for k, v := range src {
+		if srcMap, ok := v.(map[string]interface{}); ok {
+			if dstMap, ok := result[k].(map[string]interface{}); ok {
+				result[k] = DeepMerge(dstMap, srcMap)
+				continue
+			}
+		}
+		result[k] = v
+	}
+	return result
+}
+
+// LoadMergedInstanceConfig loads the global config as a base and merges the
+// instance config on top. Returns the merged result as an untyped map suitable
+// for passing to gomplate as template context.
+// If the global config is missing, returns the instance config alone.
+func LoadMergedInstanceConfig(dataDir, instanceName string) (map[string]interface{}, error) {
+	instanceConfigPath := filepath.Join(dataDir, "instances", instanceName, "config.yaml")
+	globalConfigPath := filepath.Join(dataDir, "config.yaml")
+
+	instanceData, err := os.ReadFile(instanceConfigPath)
+	if err != nil {
+		return nil, fmt.Errorf("reading instance config %s: %w", instanceConfigPath, err)
+	}
+
+	var instanceMap map[string]interface{}
+	if err := yaml.Unmarshal(instanceData, &instanceMap); err != nil {
+		return nil, fmt.Errorf("parsing instance config: %w", err)
+	}
+
+	globalData, err := os.ReadFile(globalConfigPath)
+	if err != nil {
+		if os.IsNotExist(err) {
+			slog.Warn("no global config found, using instance config only", "path", globalConfigPath)
+			return instanceMap, nil
+		}
+		return nil, fmt.Errorf("reading global config %s: %w", globalConfigPath, err)
+	}
+
+	var globalMap map[string]interface{}
+	if err := yaml.Unmarshal(globalData, &globalMap); err != nil {
+		return nil, fmt.Errorf("parsing global config: %w", err)
+	}
+
+	return DeepMerge(globalMap, instanceMap), nil
+}
--- a/api/internal/config/config_test.go
+++ b/api/internal/config/config_test.go
@@ -5,6 +5,8 @@ import (
 	"path/filepath"
 	"strings"
 	"testing"
+
+	"gopkg.in/yaml.v3"
 )

 // Test: LoadGlobalConfig loads valid configuration
@@ -541,3 +543,157 @@ func TestInstanceConfig_RoundTrip(t *testing.T) {
 		t.Errorf("cluster name mismatch: got %q, want %q", loaded.Cluster.Name, original.Cluster.Name)
 	}
 }
+
+func TestDeepMerge(t *testing.T) {
+	tests := []struct {
+		name     string
+		dst      map[string]interface{}
+		src      map[string]interface{}
+		expected map[string]interface{}
+	}{
+		{
+			name:     "src overrides dst flat keys",
+			dst:      map[string]interface{}{"a": "1", "b": "2"},
+			src:      map[string]interface{}{"b": "3", "c": "4"},
+			expected: map[string]interface{}{"a": "1", "b": "3", "c": "4"},
+		},
+		{
+			name: "nested maps merge recursively",
+			dst: map[string]interface{}{
+				"cloud": map[string]interface{}{
+					"router": map[string]interface{}{"ip": "192.168.1.1"},
+				},
+			},
+			src: map[string]interface{}{
+				"cloud": map[string]interface{}{
+					"domain": "example.com",
+				},
+			},
+			expected: map[string]interface{}{
+				"cloud": map[string]interface{}{
+					"router": map[string]interface{}{"ip": "192.168.1.1"},
+					"domain": "example.com",
+				},
+			},
+		},
+		{
+			name: "src nested key overrides dst nested key",
+			dst: map[string]interface{}{
+				"cloud": map[string]interface{}{"domain": "old.com"},
+			},
+			src: map[string]interface{}{
+				"cloud": map[string]interface{}{"domain": "new.com"},
+			},
+			expected: map[string]interface{}{
+				"cloud": map[string]interface{}{"domain": "new.com"},
+			},
+		},
+		{
+			name:     "empty src returns dst",
+			dst:      map[string]interface{}{"a": "1"},
+			src:      map[string]interface{}{},
+			expected: map[string]interface{}{"a": "1"},
+		},
+		{
+			name:     "empty dst returns src",
+			dst:      map[string]interface{}{},
+			src:      map[string]interface{}{"a": "1"},
+			expected: map[string]interface{}{"a": "1"},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := DeepMerge(tt.dst, tt.src)
+			resultYAML, _ := yaml.Marshal(result)
+			expectedYAML, _ := yaml.Marshal(tt.expected)
+			if string(resultYAML) != string(expectedYAML) {
+				t.Errorf("DeepMerge() =\n%s\nwant:\n%s", resultYAML, expectedYAML)
+			}
+		})
+	}
+}
+
+func TestLoadMergedInstanceConfig(t *testing.T) {
+	dataDir := t.TempDir()
+	instanceName := "test"
+	instancePath := filepath.Join(dataDir, "instances", instanceName)
+	os.MkdirAll(instancePath, 0755)
+
+	globalConfig := `operator:
+    email: test@example.com
+cloud:
+    router:
+        ip: 192.168.1.1
+`
+	instanceConfig := `cluster:
+    name: test
+    nodes:
+        control:
+            vip: 192.168.1.100
+cloud:
+    domain: cloud.example.com
+`
+
+	os.WriteFile(filepath.Join(dataDir, "config.yaml"), []byte(globalConfig), 0644)
+	os.WriteFile(filepath.Join(instancePath, "config.yaml"), []byte(instanceConfig), 0644)
+
+	merged, err := LoadMergedInstanceConfig(dataDir, instanceName)
+	if err != nil {
+		t.Fatalf("LoadMergedInstanceConfig() error: %v", err)
+	}
+
+	cloud, ok := merged["cloud"].(map[string]interface{})
+	if !ok {
+		t.Fatal("missing cloud key in merged config")
+	}
+
+	router, ok := cloud["router"].(map[string]interface{})
+	if !ok {
+		t.Fatal("missing cloud.router key — global config not merged")
+	}
+	if router["ip"] != "192.168.1.1" {
+		t.Errorf("cloud.router.ip = %v, want 192.168.1.1", router["ip"])
+	}
+
+	if cloud["domain"] != "cloud.example.com" {
+		t.Errorf("cloud.domain = %v, want cloud.example.com", cloud["domain"])
+	}
+
+	cluster, ok := merged["cluster"].(map[string]interface{})
+	if !ok {
+		t.Fatal("missing cluster key — instance config not merged")
+	}
+	if cluster["name"] != "test" {
+		t.Errorf("cluster.name = %v, want test", cluster["name"])
+	}
+
+	operator, ok := merged["operator"].(map[string]interface{})
+	if !ok {
+		t.Fatal("missing operator key — global config not merged")
+	}
+	if operator["email"] != "test@example.com" {
+		t.Errorf("operator.email = %v, want test@example.com", operator["email"])
+	}
+}
+
+func TestLoadMergedInstanceConfig_NoGlobalConfig(t *testing.T) {
+	dataDir := t.TempDir()
+	instancePath := filepath.Join(dataDir, "instances", "test")
+	os.MkdirAll(instancePath, 0755)
+
+	os.WriteFile(filepath.Join(instancePath, "config.yaml"), []byte("cluster:\n    name: test\n"), 0644)
+
+	merged, err := LoadMergedInstanceConfig(dataDir, "test")
+	if err != nil {
+		t.Fatalf("LoadMergedInstanceConfig() error: %v", err)
+	}
+
+	cluster, ok := merged["cluster"].(map[string]interface{})
+	if !ok {
+		t.Fatal("missing cluster key")
+	}
+	if cluster["name"] != "test" {
+		t.Errorf("cluster.name = %v, want test", cluster["name"])
+	}
+}
--- a/api/internal/config/manager.go
+++ b/api/internal/config/manager.go
@@ -2,7 +2,7 @@ package config

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"path/filepath"

 	"github.com/wild-cloud/wild-central/daemon/internal/network"
@@ -41,12 +41,11 @@ func (m *Manager) EnsureGlobalConfig(dataDir string) error {
 	// Detect network configuration
 	netInfo, err := network.DetectNetworkInfo()
 	if err != nil {
-		log.Printf("Warning: Could not detect network info, using empty defaults: %v", err)
+		slog.Info("network detection failed, using defaults", "component", "config", "error", err)
 	} else {
 		// Set detected values
 		initialConfig.Cloud.Router.IP = netInfo.Gateway
-		log.Printf("Detected network: Gateway=%s, Interface=%s",
-			netInfo.Gateway, netInfo.PrimaryInterface)
+		slog.Info("detected network", "component", "config", "gateway", netInfo.Gateway, "interface", netInfo.PrimaryInterface)
 	}

 	// Ensure data directory exists
--- a/api/internal/config/manager_test.go
+++ b/api/internal/config/manager_test.go
@@ -13,11 +13,8 @@ import (
 // Test: NewManager creates manager successfully
 func TestNewManager(t *testing.T) {
 	m := NewManager()
-	if m == nil {
-		t.Fatal("NewManager returned nil")
-	}
-	if m.yq == nil {
-		t.Error("Manager.yq is nil")
+	if m == nil || m.yq == nil {
+		t.Fatal("NewManager returned nil or Manager.yq is nil")
 	}
 }

--- a/api/internal/data/paths.go
+++ b/api/internal/data/paths.go
@@ -2,7 +2,7 @@ package data

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"os"
 	"path/filepath"
 )
@@ -42,10 +42,10 @@ func (m *Manager) Initialize() error {
 		} else {
 			dataDir = filepath.Join(cwd, "data")
 		}
-		log.Printf("Running in development mode, using data directory: %s", dataDir)
+		slog.Info("data directory configured", "component", "data", "mode", "development", "path", dataDir)
 	} else {
 		dataDir = "/var/lib/wild-central"
-		log.Printf("Running in production mode, using data directory: %s", dataDir)
+		slog.Info("data directory configured", "component", "data", "mode", "production", "path", dataDir)
 	}

 	m.dataDir = dataDir
@@ -60,7 +60,7 @@ func (m *Manager) Initialize() error {
 		}
 	}

-	log.Printf("Data directory structure initialized at: %s", dataDir)
+	slog.Info("data directory initialized", "component", "data", "path", dataDir)
 	return nil
 }

--- a/api/internal/discovery/discovery.go
+++ b/api/internal/discovery/discovery.go
@@ -3,6 +3,7 @@ package discovery
 import (
 	"encoding/json"
 	"fmt"
+	"log/slog"
 	"net"
 	"os"
 	"path/filepath"
@@ -10,6 +11,7 @@ import (
 	"time"

 	"github.com/wild-cloud/wild-central/daemon/internal/node"
+	"github.com/wild-cloud/wild-central/daemon/internal/sse"
 	"github.com/wild-cloud/wild-central/daemon/internal/storage"
 	"github.com/wild-cloud/wild-central/daemon/internal/tools"
 )
@@ -19,18 +21,20 @@ type Manager struct {
 	dataDir     string
 	nodeMgr     *node.Manager
 	talosctl    *tools.Talosctl
+	sseManager  *sse.Manager
 	discoveryMu sync.Mutex
 }

 // NewManager creates a new discovery manager
-func NewManager(dataDir string, instanceName string) *Manager {
+func NewManager(dataDir string, instanceName string, sseManager *sse.Manager) *Manager {
 	// Get talosconfig path for the instance
 	talosconfigPath := tools.GetTalosconfigPath(dataDir, instanceName)

 	return &Manager{
-		dataDir:  dataDir,
-		nodeMgr:  node.NewManager(dataDir, instanceName),
-		talosctl: tools.NewTalosconfigWithConfig(talosconfigPath),
+		dataDir:    dataDir,
+		nodeMgr:    node.NewManager(dataDir, instanceName),
+		talosctl:   tools.NewTalosconfigWithConfig(talosconfigPath),
+		sseManager: sseManager,
 	}
 }

@@ -111,6 +115,8 @@ func (m *Manager) StartDiscovery(instanceName string, ipList []string) error {
 		return err
 	}

+	slog.Info("starting node discovery", "component", "discovery", "instance", instanceName, "addresses", len(ipList))
+
 	// Start discovery in background
 	go m.runDiscovery(instanceName, ipList)

@@ -172,7 +178,15 @@ func (m *Manager) runDiscovery(instanceName string, ipList []string) {
 		status.NodesFound = discoveredNodes
 		_ = m.writeDiscoveryStatus(instanceName, status)
 		m.discoveryMu.Unlock()
+
+		// Notify frontend of progress
+		m.broadcastDiscoveryEvent(instanceName)
 	}
+
+	// Notify frontend that discovery is complete
+	m.broadcastDiscoveryEvent(instanceName)
+
+	slog.Info("node discovery completed", "component", "discovery", "instance", instanceName, "found", len(discoveredNodes))
 }

 // probeNode attempts to detect if a node is running Talos in maintenance mode
@@ -219,6 +233,30 @@ func (m *Manager) DiscoverNodes(instanceName string, ipList []string) ([]Discove
 	return nodes, nil
 }

+// RemoveDiscoveredNode removes a node by IP from the discovery results
+func (m *Manager) RemoveDiscoveredNode(instanceName, ip string) {
+	m.discoveryMu.Lock()
+	defer m.discoveryMu.Unlock()
+
+	status, err := m.GetDiscoveryStatus(instanceName)
+	if err != nil || len(status.NodesFound) == 0 {
+		return
+	}
+
+	filtered := make([]DiscoveredNode, 0, len(status.NodesFound))
+	for _, n := range status.NodesFound {
+		if n.IP != ip {
+			filtered = append(filtered, n)
+		}
+	}
+
+	if len(filtered) < len(status.NodesFound) {
+		status.NodesFound = filtered
+		_ = m.writeDiscoveryStatus(instanceName, status)
+		m.broadcastDiscoveryEvent(instanceName)
+	}
+}
+
 // ClearDiscoveryStatus removes discovery status file
 func (m *Manager) ClearDiscoveryStatus(instanceName string) error {
 	statusPath := m.GetDiscoveryStatusPath(instanceName)
@@ -230,6 +268,17 @@ func (m *Manager) ClearDiscoveryStatus(instanceName string) error {
 	return os.Remove(statusPath)
 }

+// broadcastDiscoveryEvent sends an SSE event to notify the frontend
+func (m *Manager) broadcastDiscoveryEvent(instanceName string) {
+	if m.sseManager == nil {
+		return
+	}
+	m.sseManager.Broadcast(&sse.Event{
+		Type:         "discovery:updated",
+		InstanceName: instanceName,
+	})
+}
+
 // writeDiscoveryStatus writes discovery status to disk
 func (m *Manager) writeDiscoveryStatus(instanceName string, status *DiscoveryStatus) error {
 	discoveryDir := m.GetDiscoveryDir(instanceName)
--- a/api/internal/dnsmasq/config.go
+++ b/api/internal/dnsmasq/config.go
@@ -2,7 +2,7 @@ package dnsmasq

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"os"
 	"os/exec"
 	"strconv"
@@ -39,7 +39,7 @@ func (g *ConfigGenerator) Generate(cfg *config.GlobalConfig, clouds []config.Ins
 	// Get the Wild Central IP address
 	dnsIP, err := network.GetWildCentralIP()
 	if err != nil {
-		log.Printf("Warning: Failed to detect Wild Central IP: %v", err)
+		slog.Error("failed to detect Wild Central IP", "component", "dnsmasq", "op", "generate", "error", err)
 		// Fall back to empty string if detection fails
 		dnsIP = ""
 	}
@@ -49,7 +49,7 @@ func (g *ConfigGenerator) Generate(cfg *config.GlobalConfig, clouds []config.Ins
 		// Point cloud domains to the cluster load balancer IP
 		loadBalancerIP := cloud.Cluster.LoadBalancerIp
 		if loadBalancerIP == "" {
-			log.Printf("Warning: No load balancer IP configured for instance %s, adding commented DNS config", cloud.Cluster.Name)
+			slog.Info("no load balancer IP configured, adding commented DNS config", "component", "dnsmasq", "instance", cloud.Cluster.Name)
 			// Add commented out entries for instances without load balancer
 			resolution_section += fmt.Sprintf("# No load balancer IP configured for instance %s\n", cloud.Cluster.Name)
 			resolution_section += fmt.Sprintf("# local=/%s/\n# address=/%s/<load-balancer-ip>\n", cloud.Cloud.InternalDomain, cloud.Cloud.InternalDomain)
@@ -92,7 +92,7 @@ log-dhcp
 func (g *ConfigGenerator) WriteConfig(cfg *config.GlobalConfig, clouds []config.InstanceConfig, configPath string) error {
 	configContent := g.Generate(cfg, clouds)

-	log.Printf("Writing dnsmasq config to: %s", configPath)
+	slog.Info("writing dnsmasq config", "component", "dnsmasq", "path", configPath)

 	if err := os.WriteFile(configPath, []byte(configContent), 0644); err != nil {
 		return fmt.Errorf("writing dnsmasq config: %w", err)
@@ -109,6 +109,7 @@ func (g *ConfigGenerator) RestartService() error {
 	if err != nil {
 		return fmt.Errorf("failed to restart dnsmasq: %w (output: %s)", err, string(output))
 	}
+	slog.Info("dnsmasq service restarted", "component", "dnsmasq")
 	return nil
 }

@@ -127,7 +128,7 @@ func (g *ConfigGenerator) GetStatus() (*ServiceStatus, error) {
 	// Get the Wild Central IP address
 	dnsIP, err := network.GetWildCentralIP()
 	if err != nil {
-		log.Printf("Warning: Failed to detect Wild Central IP: %v", err)
+		slog.Error("failed to detect Wild Central IP", "component", "dnsmasq", "op", "getStatus", "error", err)
 		dnsIP = ""
 	}

@@ -201,7 +202,7 @@ func (g *ConfigGenerator) UpdateConfig(cfg *config.GlobalConfig, instances []con
 	configContent := g.Generate(cfg, instances)

 	// Write config
-	log.Printf("Writing dnsmasq config to: %s", g.configPath)
+	slog.Info("writing dnsmasq config", "component", "dnsmasq", "path", g.configPath)
 	if err := os.WriteFile(g.configPath, []byte(configContent), 0644); err != nil {
 		return fmt.Errorf("writing dnsmasq config: %w", err)
 	}
@@ -234,12 +235,12 @@ func (g *ConfigGenerator) ConfigureSystemDNS() error {
 		return fmt.Errorf("failed to write resolved.conf: %w", err)
 	}

-	log.Printf("Configured systemd-resolved to use DNS at %s", dnsIP)
+	slog.Info("configured systemd-resolved", "component", "dnsmasq", "dnsIP", dnsIP)

 	// Restart systemd-resolved to apply changes (via polkit)
 	cmd := exec.Command("systemctl", "restart", "systemd-resolved")
 	if output, err := cmd.CombinedOutput(); err != nil {
-		log.Printf("Warning: Failed to restart systemd-resolved: %v (output: %s)", err, string(output))
+		slog.Error("failed to restart systemd-resolved", "component", "dnsmasq", "error", err, "output", string(output))
 		// Don't return error - the config was written successfully
 	}

--- a/api/internal/dnsmasq/config_modular.go
+++ b/api/internal/dnsmasq/config_modular.go
@@ -2,7 +2,7 @@ package dnsmasq

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -22,7 +22,7 @@ func (g *ConfigGenerator) GenerateMainConfig(cfg *config.GlobalConfig) string {
 	// Get the Wild Central IP address
 	dnsIP, err := network.GetWildCentralIP()
 	if err != nil {
-		log.Printf("Warning: Failed to detect Wild Central IP: %v", err)
+		slog.Error("failed to detect Wild Central IP", "component", "dnsmasq", "error", err)
 		// Fall back to empty string if detection fails
 		dnsIP = ""
 	}
@@ -60,25 +60,25 @@ log-dhcp
 func (g *ConfigGenerator) GenerateInstanceConfig(instance config.InstanceConfig) string {
 	var sb strings.Builder

-	sb.WriteString(fmt.Sprintf("# DNS configuration for instance: %s\n", instance.Cluster.Name))
-	sb.WriteString(fmt.Sprintf("# Generated by Wild Cloud\n\n"))
+	fmt.Fprintf(&sb, "# DNS configuration for instance: %s\n", instance.Cluster.Name)
+	sb.WriteString("# Generated by Wild Cloud\n\n")

 	loadBalancerIP := instance.Cluster.LoadBalancerIp
 	if loadBalancerIP == "" {
-		sb.WriteString(fmt.Sprintf("# WARNING: No load balancer IP configured for this instance\n"))
-		sb.WriteString(fmt.Sprintf("# DNS entries are commented out until load balancer IP is configured\n\n"))
-		sb.WriteString(fmt.Sprintf("# local=/%s/\n", instance.Cloud.InternalDomain))
-		sb.WriteString(fmt.Sprintf("# address=/%s/<load-balancer-ip>\n\n", instance.Cloud.InternalDomain))
-		sb.WriteString(fmt.Sprintf("# address=/%s/<load-balancer-ip>\n", instance.Cloud.Domain))
+		sb.WriteString("# WARNING: No load balancer IP configured for this instance\n")
+		sb.WriteString("# DNS entries are commented out until load balancer IP is configured\n\n")
+		fmt.Fprintf(&sb, "# local=/%s/\n", instance.Cloud.InternalDomain)
+		fmt.Fprintf(&sb, "# address=/%s/<load-balancer-ip>\n\n", instance.Cloud.InternalDomain)
+		fmt.Fprintf(&sb, "# address=/%s/<load-balancer-ip>\n", instance.Cloud.Domain)
 	} else {
 		// Internal domain (.internal.cloud.example.tld) - local only, no external DNS
-		sb.WriteString(fmt.Sprintf("# Internal domain (LAN-only)\n"))
-		sb.WriteString(fmt.Sprintf("local=/%s/\n", instance.Cloud.InternalDomain))
-		sb.WriteString(fmt.Sprintf("address=/%s/%s\n\n", instance.Cloud.InternalDomain, loadBalancerIP))
+		sb.WriteString("# Internal domain (LAN-only)\n")
+		fmt.Fprintf(&sb, "local=/%s/\n", instance.Cloud.InternalDomain)
+		fmt.Fprintf(&sb, "address=/%s/%s\n\n", instance.Cloud.InternalDomain, loadBalancerIP)

 		// External domain (cloud.example.tld) - resolve to load balancer IP
-		sb.WriteString(fmt.Sprintf("# Public domain (resolved locally to avoid external DNS)\n"))
-		sb.WriteString(fmt.Sprintf("address=/%s/%s\n", instance.Cloud.Domain, loadBalancerIP))
+		sb.WriteString("# Public domain (resolved locally to avoid external DNS)\n")
+		fmt.Fprintf(&sb, "address=/%s/%s\n", instance.Cloud.Domain, loadBalancerIP)
 	}

 	return sb.String()
@@ -129,7 +129,7 @@ func (g *ConfigGenerator) WriteInstanceConfig(instanceName string, instance conf
 		return fmt.Errorf("installing instance config: %w", err)
 	}

-	log.Printf("Successfully wrote instance DNS config: %s", instanceFile)
+	slog.Info("wrote instance DNS config", "component", "dnsmasq", "path", instanceFile)
 	return nil
 }

@@ -151,7 +151,9 @@ func (g *ConfigGenerator) ValidateWithInstance(instanceConfigPath string) error
 	tempMainConfig := filepath.Join(tempDir, "main.conf")
 	// Modify the conf-dir line to point to our temp instance dir
 	tempInstanceDir := filepath.Join(tempDir, "instances")
-	os.MkdirAll(tempInstanceDir, 0755)
+	if err := os.MkdirAll(tempInstanceDir, 0755); err != nil {
+		return fmt.Errorf("creating temp instance dir: %w", err)
+	}

 	modifiedContent := strings.ReplaceAll(
 		string(mainContent),
@@ -184,7 +186,7 @@ func (g *ConfigGenerator) RemoveInstanceConfig(instanceName string) error {

 	// Check if file exists
 	if _, err := os.Stat(instanceFile); os.IsNotExist(err) {
-		log.Printf("Instance DNS config does not exist: %s", instanceFile)
+		slog.Info("instance DNS config does not exist", "component", "dnsmasq", "path", instanceFile)
 		return nil // Not an error, already removed
 	}

@@ -193,7 +195,7 @@ func (g *ConfigGenerator) RemoveInstanceConfig(instanceName string) error {
 		return fmt.Errorf("removing instance config: %w", err)
 	}

-	log.Printf("Removed instance DNS config: %s", instanceFile)
+	slog.Info("removed instance DNS config", "component", "dnsmasq", "path", instanceFile)
 	return nil
 }

@@ -205,16 +207,16 @@ func (g *ConfigGenerator) ReloadService() error {
 	_, err := cmd.CombinedOutput()
 	if err != nil {
 		// If reload fails, try restart as fallback
-		log.Printf("Reload failed, attempting restart: %v", err)
+		slog.Error("reload failed, attempting restart", "component", "dnsmasq", "error", err)
 		return g.RestartService()
 	}
-	log.Printf("Successfully reloaded dnsmasq service")
+	slog.Info("dnsmasq service reloaded", "component", "dnsmasq")
 	return nil
 }

 // UpdateToModularConfig migrates from monolithic to modular configuration
 func (g *ConfigGenerator) UpdateToModularConfig(cfg *config.GlobalConfig, instanceNames []string, instances []config.InstanceConfig) error {
-	log.Printf("Migrating to modular dnsmasq configuration...")
+	slog.Info("migrating to modular configuration", "component", "dnsmasq")

 	// Ensure instance directory exists
 	if err := os.MkdirAll(instanceConfigDir, 0755); err != nil {
@@ -225,7 +227,7 @@ func (g *ConfigGenerator) UpdateToModularConfig(cfg *config.GlobalConfig, instan
 	for i, instance := range instances {
 		instanceName := instanceNames[i]
 		if err := g.WriteInstanceConfig(instanceName, instance); err != nil {
-			log.Printf("Warning: Failed to write instance config for %s: %v", instanceName, err)
+			slog.Error("failed to write instance config", "component", "dnsmasq", "instance", instanceName, "error", err)
 			// Continue with other instances
 		}
 	}
@@ -255,21 +257,21 @@ func (g *ConfigGenerator) UpdateToModularConfig(cfg *config.GlobalConfig, instan
 	// Install new config
 	if err := os.Rename(tempFile, g.configPath); err != nil {
 		// Try to restore backup
-		os.Rename(backupFile, g.configPath)
+		_ = os.Rename(backupFile, g.configPath)
 		return fmt.Errorf("installing new config: %w", err)
 	}

 	// Reload dnsmasq
 	if err := g.ReloadService(); err != nil {
 		// Try to restore backup and reload
-		log.Printf("Reload failed, attempting to restore backup...")
+		slog.Error("reload failed, restoring backup", "component", "dnsmasq", "error", err)
 		os.Remove(g.configPath)
-		os.Rename(backupFile, g.configPath)
-		g.ReloadService()
+		_ = os.Rename(backupFile, g.configPath)
+		_ = g.ReloadService()
 		return fmt.Errorf("reloading with new config: %w", err)
 	}

-	log.Printf("Successfully migrated to modular dnsmasq configuration")
+	slog.Info("migrated to modular configuration", "component", "dnsmasq")
 	return nil
 }

@@ -286,6 +288,6 @@ func (g *ConfigGenerator) UpdateInstanceDNS(instanceName string, instance config
 		return fmt.Errorf("reloading dnsmasq: %w", err)
 	}

-	log.Printf("Successfully updated DNS for instance: %s", instanceName)
+	slog.Info("DNS updated for instance", "component", "dnsmasq", "instance", instanceName)
 	return nil
-}
+}
--- a/api/internal/factory/factory.go
+++ b/api/internal/factory/factory.go
@@ -269,7 +269,7 @@ func ParseVersion(v string) [3]int {
 		v = v[:idx]
 	}
 	var parts [3]int
-	fmt.Sscanf(v, "%d.%d.%d", &parts[0], &parts[1], &parts[2])
+	_, _ = fmt.Sscanf(v, "%d.%d.%d", &parts[0], &parts[1], &parts[2])
 	return parts
 }

--- a/api/internal/instance/instance.go
+++ b/api/internal/instance/instance.go
@@ -2,6 +2,7 @@ package instance

 import (
 	"fmt"
+	"log/slog"
 	"os"
 	"path/filepath"

@@ -74,6 +75,8 @@ func (m *Manager) CreateInstance(name string) error {
 		return nil
 	}

+	slog.Info("creating instance", "component", "instance", "name", name)
+
 	// Acquire lock for instance creation
 	lockPath := tools.GetInstancesLockPath(m.dataDir)
 	return storage.WithLock(lockPath, func() error {
@@ -118,6 +121,8 @@ func (m *Manager) DeleteInstance(name string) error {
 		return fmt.Errorf("instance %s does not exist", name)
 	}

+	slog.Info("deleting instance", "component", "instance", "name", name)
+
 	// Clear context if this is the current instance
 	currentContext, err := m.contextMgr.GetCurrentContext()
 	if err == nil && currentContext == name {
--- a/api/internal/logging/console.go
+++ b/api/internal/logging/console.go
@@ -0,0 +1,138 @@
+package logging
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"log/slog"
+	"slices"
+	"sync"
+)
+
+// ANSI color codes
+const (
+	dim    = "\033[2m"
+	red    = "\033[31m"
+	yellow = "\033[33m"
+	cyan   = "\033[36m"
+	reset  = "\033[0m"
+)
+
+// ConsoleHandler formats log output for human readability on terminals.
+// It produces compact, color-coded lines:
+//
+//	20:15:54 INF daemon started addr=:5055
+//	20:15:54 ERR backup failed component=backup error="connection refused"
+type ConsoleHandler struct {
+	w     io.Writer
+	level slog.Leveler
+	attrs []slog.Attr
+	mu    *sync.Mutex
+}
+
+// NewConsoleHandler creates a handler that writes human-friendly colored logs.
+func NewConsoleHandler(w io.Writer, opts *slog.HandlerOptions) *ConsoleHandler {
+	level := slog.LevelInfo
+	if opts != nil && opts.Level != nil {
+		level = opts.Level.Level()
+	}
+	return &ConsoleHandler{
+		w:     w,
+		level: level,
+		mu:    &sync.Mutex{},
+	}
+}
+
+func (h *ConsoleHandler) Enabled(_ context.Context, level slog.Level) bool {
+	return level >= h.level.Level()
+}
+
+func (h *ConsoleHandler) Handle(_ context.Context, r slog.Record) error {
+	// Time
+	buf := []byte(dim + r.Time.Format("15:04:05") + reset + " ")
+
+	// Level badge
+	switch {
+	case r.Level >= slog.LevelError:
+		buf = append(buf, red+"ERR"+reset+" "...)
+	case r.Level >= slog.LevelWarn:
+		buf = append(buf, yellow+"WRN"+reset+" "...)
+	default:
+		buf = append(buf, cyan+"INF"+reset+" "...)
+	}
+
+	// Message
+	buf = append(buf, r.Message...)
+
+	// Pre-set attrs (from slog.With)
+	for _, a := range h.attrs {
+		buf = appendAttr(buf, a)
+	}
+
+	// Inline attrs
+	r.Attrs(func(a slog.Attr) bool {
+		buf = appendAttr(buf, a)
+		return true
+	})
+
+	buf = append(buf, '\n')
+
+	h.mu.Lock()
+	defer h.mu.Unlock()
+	_, err := h.w.Write(buf)
+	return err
+}
+
+func (h *ConsoleHandler) WithAttrs(attrs []slog.Attr) slog.Handler {
+	return &ConsoleHandler{
+		w:     h.w,
+		level: h.level,
+		attrs: append(slices.Clone(h.attrs), attrs...),
+		mu:    h.mu,
+	}
+}
+
+func (h *ConsoleHandler) WithGroup(name string) slog.Handler {
+	// Groups are rare in this codebase; treat as a prefixed attr set
+	return &ConsoleHandler{
+		w:     h.w,
+		level: h.level,
+		attrs: append(slices.Clone(h.attrs), slog.String("group", name)),
+		mu:    h.mu,
+	}
+}
+
+func appendAttr(buf []byte, a slog.Attr) []byte {
+	if a.Equal(slog.Attr{}) {
+		return buf
+	}
+	v := a.Value.Resolve()
+	buf = append(buf, ' ')
+	buf = append(buf, dim...)
+	buf = append(buf, a.Key...)
+	buf = append(buf, '=')
+	buf = append(buf, reset...)
+
+	s := v.String()
+	if needsQuote(s) {
+		buf = append(buf, fmt.Sprintf("%q", s)...)
+	} else {
+		buf = append(buf, s...)
+	}
+	return buf
+}
+
+func needsQuote(s string) bool {
+	if s == "" {
+		return true
+	}
+	for _, c := range s {
+		if c <= ' ' || c == '"' || c == '\\' {
+			return true
+		}
+	}
+	return false
+}
+
+// Verify interface compliance at compile time.
+var _ slog.Handler = (*ConsoleHandler)(nil)
--- a/api/internal/node/node.go
+++ b/api/internal/node/node.go
@@ -3,15 +3,18 @@ package node
 import (
 	"context"
 	"fmt"
+	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
+	"sync"
 	"time"

 	"github.com/wild-cloud/wild-central/daemon/internal/config"
 	"github.com/wild-cloud/wild-central/daemon/internal/setup"
 	"github.com/wild-cloud/wild-central/daemon/internal/tools"
+	"gopkg.in/yaml.v3"
 )

 // Manager handles node configuration and state management
@@ -172,6 +175,8 @@ func (m *Manager) Get(instanceName, hostname string) (*Node, error) {

 // Add registers a new node in config.yaml
 func (m *Manager) Add(instanceName string, node *Node) error {
+	slog.Info("adding node", "component", "node", "instance", instanceName, "hostname", node.Hostname, "role", node.Role)
+
 	instancePath := m.GetInstancePath(instanceName)

 	// Validate node data
@@ -263,6 +268,8 @@ func (m *Manager) Add(instanceName string, node *Node) error {
 // Delete removes a node from config.yaml
 // If skipReset is false, the node will be reset before deletion (with 30s timeout)
 func (m *Manager) Delete(instanceName, nodeIdentifier string, skipReset bool) error {
+	slog.Info("deleting node", "component", "node", "instance", instanceName, "node", nodeIdentifier, "skipReset", skipReset)
+
 	// Get node to find hostname
 	node, err := m.Get(instanceName, nodeIdentifier)
 	if err != nil {
@@ -290,6 +297,15 @@ func (m *Manager) Delete(instanceName, nodeIdentifier string, skipReset bool) er
 		}
 	}

+	// Remove the k8s node object (best-effort — node may already be gone)
+	kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
+	cmd := exec.Command("kubectl", "delete", "node", node.Hostname, "--ignore-not-found")
+	tools.WithKubeconfig(cmd, kubeconfigPath)
+	if out, err := cmd.CombinedOutput(); err != nil {
+		slog.Warn("failed to delete k8s node object (continuing)", "component", "node",
+			"hostname", node.Hostname, "error", err, "output", strings.TrimSpace(string(out)))
+	}
+
 	// Delete node from config.yaml
 	return m.deleteFromConfig(instanceName, node.Hostname)
 }
@@ -434,6 +450,8 @@ func (m *Manager) Apply(instanceName, nodeIdentifier string, opts ApplyOptions)
 		return fmt.Errorf("failed to update node status: %w", err)
 	}

+	slog.Info("applying node config", "component", "node", "instance", instanceName, "hostname", node.Hostname, "role", node.Role)
+
 	// Apply configuration to node
 	// Determine which IP to use and whether node is in maintenance mode
 	//
@@ -473,6 +491,7 @@ func (m *Manager) Apply(instanceName, nodeIdentifier string, opts ApplyOptions)
 		return fmt.Errorf("failed to update node status: %w", err)
 	}

+	slog.Info("node config applied", "component", "node", "instance", instanceName, "hostname", node.Hostname, "ip", node.TargetIP)
 	return nil
 }

@@ -499,20 +518,37 @@ func (m *Manager) generateNodePatch(instanceName string, node *Node, setupDir st
 	patchContent = strings.ReplaceAll(patchContent, "{{SCHEMATIC_ID}}", node.SchematicID)
 	patchContent = strings.ReplaceAll(patchContent, "{{VERSION}}", node.Version)

-	// Stage 2: Process through gomplate with config.yaml context (like v.PoC does with wild-compile-template)
-	instancePath := m.GetInstancePath(instanceName)
-	configPath := filepath.Join(instancePath, "config.yaml")
-
-	// Use gomplate to process template with config context
-	cmd := exec.Command("gomplate", "-c", ".="+configPath)
-	cmd.Stdin = strings.NewReader(patchContent)
-
-	output, err := cmd.CombinedOutput()
+	// Stage 2: Process through gomplate with merged global+instance config context
+	merged, err := config.LoadMergedInstanceConfig(m.dataDir, instanceName)
 	if err != nil {
-		return "", fmt.Errorf("failed to process template with gomplate: %w\nOutput: %s", err, string(output))
+		return "", fmt.Errorf("failed to load merged config: %w", err)
 	}

-	processedPatch := string(output)
+	mergedYAML, err := yaml.Marshal(merged)
+	if err != nil {
+		return "", fmt.Errorf("failed to marshal merged config: %w", err)
+	}
+
+	gomplate := tools.NewGomplate()
+	processedPatch, err := gomplate.RenderTemplate(patchContent, string(mergedYAML))
+	if err != nil {
+		return "", fmt.Errorf("failed to process template with gomplate: %w", err)
+	}
+
+	// Wipe disk when node needs a fresh install (maintenance or never applied)
+	if node.Maintenance || !node.Applied {
+		var patchMap map[string]interface{}
+		if err := yaml.Unmarshal([]byte(processedPatch), &patchMap); err == nil {
+			if machine, ok := patchMap["machine"].(map[string]interface{}); ok {
+				if install, ok := machine["install"].(map[string]interface{}); ok {
+					install["wipe"] = true
+					if patched, err := yaml.Marshal(patchMap); err == nil {
+						processedPatch = string(patched)
+					}
+				}
+			}
+		}
+	}

 	// Create patch directory
 	patchDir := filepath.Join(setupDir, "patch")
@@ -723,8 +759,111 @@ func (m *Manager) FetchTemplates(instanceName string) error {
 	return m.extractEmbeddedTemplates(destDir)
 }

-// Reset resets a node to maintenance mode
+// NodeHealth represents the health status of a node
+type NodeHealth struct {
+	Node        string               `json:"node"`
+	Services    []tools.ServiceStatus `json:"services"`
+	DmesgErrors []tools.DmesgError   `json:"dmesgErrors"`
+	Healthy     bool                 `json:"healthy"`
+}
+
+// Health checks node health by querying Talos service statuses and scanning dmesg for errors
+func (m *Manager) Health(instanceName, nodeIdentifier string) (*NodeHealth, error) {
+	node, err := m.Get(instanceName, nodeIdentifier)
+	if err != nil {
+		return nil, fmt.Errorf("node not found: %w", err)
+	}
+
+	if !node.Applied || node.Maintenance {
+		return nil, fmt.Errorf("health check requires an applied, non-maintenance node")
+	}
+
+	ip := node.TargetIP
+	if ip == "" {
+		return nil, fmt.Errorf("no IP address available for node %s", node.Hostname)
+	}
+
+	// Fetch services and dmesg concurrently
+	var services []tools.ServiceStatus
+	var dmesgRaw string
+	var svcErr, dmesgErr error
+
+	var wg sync.WaitGroup
+	wg.Add(2)
+
+	go func() {
+		defer wg.Done()
+		services, svcErr = m.talosctl.GetServices(ip)
+	}()
+
+	go func() {
+		defer wg.Done()
+		dmesgRaw, dmesgErr = m.talosctl.GetDmesg(ip)
+	}()
+
+	wg.Wait()
+
+	if svcErr != nil {
+		return nil, fmt.Errorf("failed to get services: %w", svcErr)
+	}
+
+	var dmesgErrors []tools.DmesgError
+	if dmesgErr == nil {
+		dmesgErrors = tools.ParseDmesgErrors(dmesgRaw)
+	}
+	if dmesgErrors == nil {
+		dmesgErrors = []tools.DmesgError{}
+	}
+
+	// Compute overall health
+	healthy := len(dmesgErrors) == 0
+	for _, svc := range services {
+		if !svc.Healthy && svc.HealthMessage != "" {
+			healthy = false
+			break
+		}
+	}
+
+	return &NodeHealth{
+		Node:        node.Hostname,
+		Services:    services,
+		DmesgErrors: dmesgErrors,
+		Healthy:     healthy,
+	}, nil
+}
+
+// Reboot restarts a node without wiping state
+func (m *Manager) Reboot(instanceName, nodeIdentifier string) error {
+	slog.Info("rebooting node", "component", "node", "instance", instanceName, "node", nodeIdentifier)
+
+	node, err := m.Get(instanceName, nodeIdentifier)
+	if err != nil {
+		return fmt.Errorf("node not found: %w", err)
+	}
+
+	rebootIP := node.TargetIP
+	if rebootIP == "" {
+		rebootIP = node.CurrentIP
+	}
+	if rebootIP == "" {
+		return fmt.Errorf("no IP address available for node %s", node.Hostname)
+	}
+
+	if err := m.talosctl.Reboot(rebootIP); err != nil {
+		return fmt.Errorf("failed to reboot node: %w", err)
+	}
+
+	slog.Info("node reboot initiated", "component", "node", "instance", instanceName, "hostname", node.Hostname, "ip", rebootIP)
+	return nil
+}
+
+// Reset resets a node to maintenance mode with resilient handling.
+// For control plane nodes, it first removes the etcd member from the cluster
+// via a healthy peer, then resets the node. The node stays in config.yaml
+// so it can be reconfigured and rejoined.
 func (m *Manager) Reset(instanceName, nodeIdentifier string) error {
+	slog.Info("resetting node", "component", "node", "instance", instanceName, "node", nodeIdentifier)
+
 	// Get node
 	node, err := m.Get(instanceName, nodeIdentifier)
 	if err != nil {
@@ -736,35 +875,75 @@ func (m *Manager) Reset(instanceName, nodeIdentifier string) error {
 	if resetIP == "" {
 		resetIP = node.TargetIP
 	}
+	if resetIP == "" {
+		return fmt.Errorf("no IP address available for node %s", node.Hostname)
+	}

-	// Execute reset command with graceful=false and reboot flags
-	talosconfigPath := tools.GetTalosconfigPath(m.dataDir, instanceName)
-	cmd := exec.Command("talosctl", "-n", resetIP, "--talosconfig", talosconfigPath, "reset", "--graceful=false", "--reboot")
-	output, err := cmd.CombinedOutput()
-	if err != nil {
-		// Check if error is due to node rebooting (expected after reset command)
-		outputStr := string(output)
-		if strings.Contains(outputStr, "connection refused") || strings.Contains(outputStr, "Unavailable") {
-			// This is expected - node is rebooting after successful reset
-			// Continue with config cleanup
-		} else {
-			// Real error - return it
-			return fmt.Errorf("failed to reset node: %w\nOutput: %s", err, outputStr)
+	// For control plane nodes, remove from etcd first via a healthy peer
+	if node.Role == "controlplane" {
+		if err := m.removeEtcdMember(instanceName, node); err != nil {
+			// Log but don't fail — the node may already be removed from etcd,
+			// or etcd may be unreachable on this node. The reset should still proceed.
+			slog.Warn("etcd member removal failed (continuing with reset)", "component", "node",
+				"instance", instanceName, "hostname", node.Hostname, "error", err)
 		}
 	}

-	// Update node status to maintenance mode, then remove from config
+	// Reset the node via talosctl
+	if err := m.talosctl.Reset(resetIP); err != nil {
+		return fmt.Errorf("failed to reset node: %w", err)
+	}
+
+	// Update node status to maintenance mode but keep in config for reconfiguration
 	node.Maintenance = true
 	node.Configured = false
 	node.Applied = false
+	node.CurrentIP = ""
+	node.Version = ""
 	if err := m.updateNodeStatus(instanceName, node); err != nil {
 		return fmt.Errorf("failed to update node status: %w", err)
 	}

-	// Remove node from config.yaml after successful reset
-	if err := m.deleteFromConfig(instanceName, node.Hostname); err != nil {
-		return fmt.Errorf("failed to remove node from config: %w", err)
-	}
-
+	slog.Info("node reset to maintenance mode", "component", "node",
+		"instance", instanceName, "hostname", node.Hostname)
 	return nil
 }
+
+// removeEtcdMember removes a control plane node's etcd member via a healthy peer.
+func (m *Manager) removeEtcdMember(instanceName string, targetNode *Node) error {
+	// Find a healthy control plane peer to run the etcd remove-member command
+	nodes, err := m.List(instanceName)
+	if err != nil {
+		return fmt.Errorf("failed to list nodes: %w", err)
+	}
+
+	for _, peer := range nodes {
+		// Skip the target node itself, non-control-plane nodes, and unapplied nodes
+		if peer.Hostname == targetNode.Hostname || peer.Role != "controlplane" || !peer.Applied || peer.Maintenance {
+			continue
+		}
+
+		peerIP := peer.CurrentIP
+		if peerIP == "" {
+			peerIP = peer.TargetIP
+		}
+		if peerIP == "" {
+			continue
+		}
+
+		slog.Info("removing etcd member via peer", "component", "node",
+			"instance", instanceName, "target", targetNode.Hostname, "peer", peer.Hostname)
+
+		if err := m.talosctl.EtcdRemoveMember(peerIP, targetNode.Hostname); err != nil {
+			slog.Warn("etcd remove-member failed via peer, trying next", "component", "node",
+				"peer", peer.Hostname, "error", err)
+			continue
+		}
+
+		slog.Info("etcd member removed successfully", "component", "node",
+			"instance", instanceName, "target", targetNode.Hostname, "via", peer.Hostname)
+		return nil
+	}
+
+	return fmt.Errorf("no healthy control plane peer available to remove etcd member for %s", targetNode.Hostname)
+}
--- a/api/internal/operations/operations.go
+++ b/api/internal/operations/operations.go
@@ -3,6 +3,7 @@ package operations
 import (
 	"encoding/json"
 	"fmt"
+	"log/slog"
 	"os"
 	"path/filepath"
 	"time"
@@ -72,8 +73,8 @@ type Operation struct {
 	Progress  int               `json:"progress"`          // 0-100
 	Details   *OperationDetails `json:"details,omitempty"` // Operation-specific details
 	LogFile   string            `json:"logFile,omitempty"` // Path to output log file
-	StartedAt time.Time  `json:"started_at"`
-	EndedAt   *time.Time `json:"ended_at,omitempty"`
+	StartedAt time.Time         `json:"started_at"`
+	EndedAt   *time.Time        `json:"ended_at,omitempty"`
 }

 // GetOperationsDir returns the operations directory for an instance
@@ -115,6 +116,8 @@ func (m *Manager) Start(instanceName, opType, target string) (string, error) {
 		return "", err
 	}

+	slog.Info("operation started", "component", "operations", "id", opID, "type", opType, "target", target, "instance", instanceName)
+
 	// Broadcast SSE event if manager is available
 	m.broadcastOperationEvent("operation:started", op)

@@ -164,6 +167,18 @@ func (m *Manager) Update(instanceName, opID, status, message string, progress in
 		return err
 	}

+	// Log terminal status transitions
+	if oldStatus != status {
+		switch status {
+		case "completed":
+			slog.Info("operation completed", "component", "operations", "id", op.ID, "type", op.Type, "target", op.Target, "instance", instanceName)
+		case "failed":
+			slog.Error("operation failed", "component", "operations", "id", op.ID, "type", op.Type, "target", op.Target, "instance", instanceName, "error", message)
+		case "cancelled":
+			slog.Info("operation cancelled", "component", "operations", "id", op.ID, "type", op.Type, "target", op.Target, "instance", instanceName)
+		}
+	}
+
 	// Broadcast appropriate SSE event based on status change
 	if oldStatus != status {
 		switch status {
@@ -302,6 +317,26 @@ func (m *Manager) Delete(instanceName, opID string) error {
 	return os.Remove(opPath)
 }

+// FailOrphaned marks all running/pending operations for an instance as failed.
+// Called on API startup to clean up operations that were interrupted by a restart.
+func (m *Manager) FailOrphaned(instanceName string) error {
+	ops, err := m.List(instanceName)
+	if err != nil {
+		return err
+	}
+
+	for _, op := range ops {
+		if op.Status == "running" || op.Status == "pending" {
+			slog.Info("failing orphaned operation", "component", "operations", "id", op.ID, "type", op.Type, "target", op.Target, "instance", instanceName)
+			if err := m.Update(instanceName, op.ID, "failed", "API restarted while operation was in progress", op.Progress); err != nil {
+				slog.Warn("failed to mark orphaned operation as failed", "component", "operations", "id", op.ID, "error", err)
+			}
+		}
+	}
+
+	return nil
+}
+
 // Cleanup removes old completed/failed operations
 func (m *Manager) Cleanup(instanceName string, olderThan time.Duration) error {
 	ops, err := m.List(instanceName)
--- a/api/internal/operations/operations_test.go
+++ b/api/internal/operations/operations_test.go
@@ -0,0 +1,90 @@
+package operations
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+func setupTestManager(t *testing.T) (*Manager, string) {
+	t.Helper()
+	tmpDir := t.TempDir()
+	instanceName := "test-cloud"
+
+	// Create the instances/test-cloud/operations directory
+	opsDir := filepath.Join(tmpDir, "instances", instanceName, "operations")
+	if err := os.MkdirAll(opsDir, 0755); err != nil {
+		t.Fatalf("failed to create ops dir: %v", err)
+	}
+
+	return NewManager(tmpDir), instanceName
+}
+
+func TestFailOrphaned(t *testing.T) {
+	m, instanceName := setupTestManager(t)
+
+	// Create operations in various states
+	runningID, err := m.Start(instanceName, "backup", "myapp")
+	if err != nil {
+		t.Fatalf("failed to start operation: %v", err)
+	}
+	_ = m.Update(instanceName, runningID, "running", "Backing up", 50)
+
+	pendingID, err := m.Start(instanceName, "restore", "myapp")
+	if err != nil {
+		t.Fatalf("failed to start operation: %v", err)
+	}
+
+	completedID, err := m.Start(instanceName, "backup", "otherapp")
+	if err != nil {
+		t.Fatalf("failed to start operation: %v", err)
+	}
+	_ = m.Update(instanceName, completedID, "completed", "Done", 100)
+
+	failedID, err := m.Start(instanceName, "deploy", "otherapp")
+	if err != nil {
+		t.Fatalf("failed to start operation: %v", err)
+	}
+	_ = m.Update(instanceName, failedID, "failed", "Something broke", 0)
+
+	// Run FailOrphaned
+	if err := m.FailOrphaned(instanceName); err != nil {
+		t.Fatalf("FailOrphaned failed: %v", err)
+	}
+
+	// Running operation should now be failed
+	op, _ := m.GetByInstance(instanceName, runningID)
+	if op.Status != "failed" {
+		t.Errorf("expected running op to be failed, got %s", op.Status)
+	}
+	if op.EndedAt == nil {
+		t.Error("expected running op to have EndedAt set")
+	}
+
+	// Pending operation should now be failed
+	op, _ = m.GetByInstance(instanceName, pendingID)
+	if op.Status != "failed" {
+		t.Errorf("expected pending op to be failed, got %s", op.Status)
+	}
+
+	// Completed operation should be unchanged
+	op, _ = m.GetByInstance(instanceName, completedID)
+	if op.Status != "completed" {
+		t.Errorf("expected completed op to stay completed, got %s", op.Status)
+	}
+
+	// Failed operation should be unchanged
+	op, _ = m.GetByInstance(instanceName, failedID)
+	if op.Status != "failed" {
+		t.Errorf("expected already-failed op to stay failed, got %s", op.Status)
+	}
+}
+
+func TestFailOrphaned_NoOperations(t *testing.T) {
+	m, instanceName := setupTestManager(t)
+
+	// Should not error on empty operations directory
+	if err := m.FailOrphaned(instanceName); err != nil {
+		t.Fatalf("FailOrphaned on empty dir failed: %v", err)
+	}
+}
--- a/api/internal/pxe/pxe.go
+++ b/api/internal/pxe/pxe.go
@@ -1,9 +1,9 @@
 package pxe

 import (
-	"crypto/sha256"
 	"fmt"
 	"io"
+	"log/slog"
 	"net/http"
 	"os"
 	"path/filepath"
@@ -72,10 +72,6 @@ func (m *Manager) ListAssets(instanceName string) ([]Asset, error) {

 		if err == nil {
 			asset.Size = info.Size()
-			// Calculate SHA256 if file exists
-			if hash, err := calculateSHA256(assetPath); err == nil {
-				asset.SHA256 = hash
-			}
 		}

 		assets = append(assets, asset)
@@ -145,6 +141,7 @@ func (m *Manager) DownloadAsset(instanceName, assetType, version, url string) er
 		return fmt.Errorf("failed to move file: %w", err)
 	}

+	slog.Info("PXE asset downloaded", "component", "pxe", "instance", instanceName, "type", assetType, "version", version)
 	return nil
 }

@@ -204,18 +201,3 @@ func (m *Manager) DeleteAsset(instanceName, assetType string) error {
 	return os.Remove(assetPath)
 }

-// calculateSHA256 computes the SHA256 hash of a file
-func calculateSHA256(filePath string) (string, error) {
-	file, err := os.Open(filePath)
-	if err != nil {
-		return "", err
-	}
-	defer file.Close()
-
-	hash := sha256.New()
-	if _, err := io.Copy(hash, file); err != nil {
-		return "", err
-	}
-
-	return fmt.Sprintf("%x", hash.Sum(nil)), nil
-}
--- a/api/internal/secrets/secrets_test.go
+++ b/api/internal/secrets/secrets_test.go
@@ -90,11 +90,8 @@ func TestGenerateSecret_Uniqueness(t *testing.T) {
 // Test: NewManager creates manager successfully
 func TestNewManager(t *testing.T) {
 	m := NewManager()
-	if m == nil {
-		t.Fatal("NewManager returned nil")
-	}
-	if m.yq == nil {
-		t.Error("Manager.yq is nil")
+	if m == nil || m.yq == nil {
+		t.Fatal("NewManager returned nil or Manager.yq is nil")
 	}
 }

--- a/api/internal/sse/manager.go
+++ b/api/internal/sse/manager.go
@@ -4,7 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
-	"log"
+	"log/slog"
 	"sync"
 	"time"

@@ -42,7 +42,6 @@ type EventFilters struct {
 // Manager manages all SSE connections
 type Manager struct {
 	clients      map[string]map[string]*Client // instanceName -> clientID -> Client
-	register     chan *Client
 	unregister   chan *Client
 	broadcast    chan *Event
 	mu           sync.RWMutex
@@ -53,7 +52,6 @@ type Manager struct {
 func NewManager() *Manager {
 	m := &Manager{
 		clients:      make(map[string]map[string]*Client),
-		register:     make(chan *Client, 100),
 		unregister:   make(chan *Client, 100),
 		broadcast:    make(chan *Event, 1000),
 		rateLimiters: make(map[string]*rate.Limiter),
@@ -62,19 +60,10 @@ func NewManager() *Manager {
 	return m
 }

-// run processes client registration and event broadcasting
+// run processes client unregistration and event broadcasting
 func (m *Manager) run() {
 	for {
 		select {
-		case client := <-m.register:
-			m.mu.Lock()
-			if m.clients[client.InstanceName] == nil {
-				m.clients[client.InstanceName] = make(map[string]*Client)
-			}
-			m.clients[client.InstanceName][client.ID] = client
-			m.mu.Unlock()
-			log.Printf("SSE: Client %s registered for instance %s", client.ID, client.InstanceName)
-
 		case client := <-m.unregister:
 			m.mu.Lock()
 			if clients, ok := m.clients[client.InstanceName]; ok {
@@ -85,7 +74,7 @@ func (m *Manager) run() {
 			}
 			close(client.Channel)
 			m.mu.Unlock()
-			log.Printf("SSE: Client %s unregistered", client.ID)
+			slog.Info("client unregistered", "component", "sse", "client", client.ID)

 		case event := <-m.broadcast:
 			m.mu.RLock()
@@ -102,7 +91,7 @@ func (m *Manager) run() {
 					case client.Channel <- event:
 					default:
 						// Client channel full, skip
-						log.Printf("SSE: Client %s channel full, skipping event", client.ID)
+						slog.Info("client channel full, skipping event", "component", "sse", "client", client.ID)
 					}
 				}
 			}
@@ -114,7 +103,7 @@ func (m *Manager) run() {
 					case client.Channel <- event:
 					default:
 						// Client channel full, skip
-						log.Printf("SSE: Client %s channel full, skipping event", client.ID)
+						slog.Info("client channel full, skipping event", "component", "sse", "client", client.ID)
 					}
 				}
 			}
@@ -207,7 +196,14 @@ func (m *Manager) RegisterClient(instanceName string, filters EventFilters) *Cli
 		Cancel:       cancel,
 	}

-	m.register <- client
+	m.mu.Lock()
+	if m.clients[instanceName] == nil {
+		m.clients[instanceName] = make(map[string]*Client)
+	}
+	m.clients[instanceName][client.ID] = client
+	m.mu.Unlock()
+
+	slog.Info("client registered", "component", "sse", "client", client.ID, "instance", instanceName)
 	return client
 }

@@ -230,7 +226,7 @@ func (m *Manager) Broadcast(event *Event) {
 	select {
 	case m.broadcast <- event:
 	default:
-		log.Printf("SSE: Broadcast channel full, dropping event %s", event.ID)
+		slog.Error("broadcast channel full, dropping event", "component", "sse", "event", event.ID, "type", event.Type, "instance", event.InstanceName)
 	}
 }

@@ -269,4 +265,4 @@ func generateEventID() string {
 // JSON marshals the event to JSON
 func (e *Event) JSON() ([]byte, error) {
 	return json.Marshal(e)
-}
+}
--- a/api/internal/sse/manager_test.go
+++ b/api/internal/sse/manager_test.go
@@ -349,4 +349,4 @@ func BenchmarkBroadcast(b *testing.B) {
 	for _, client := range clients {
 		manager.UnregisterClient(client)
 	}
-}
+}
--- a/api/internal/sse/watchers.go
+++ b/api/internal/sse/watchers.go
@@ -5,7 +5,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
-	"log"
+	"log/slog"
 	"os/exec"
 	"strings"
 	"sync"
@@ -120,7 +120,7 @@ func (w *KubectlWatcher) Start() error {
 	w.wg.Add(1)
 	go w.watchResource("services", w.parseServiceEvent)

-	log.Printf("SSE: Started kubectl watchers for instance %s", w.instanceName)
+	slog.Info("started kubectl watchers", "component", "sse", "instance", w.instanceName)
 	return nil
 }

@@ -148,13 +148,13 @@ func (w *KubectlWatcher) watchResource(resourceType string, parser func([]byte,

 		stdout, err := cmd.StdoutPipe()
 		if err != nil {
-			log.Printf("SSE: Failed to create stdout pipe for %s watch: %v", resourceType, err)
+			slog.Error("failed to create stdout pipe", "component", "sse", "resource", resourceType, "error", err)
 			w.handleWatchError(resourceType)
 			continue
 		}

 		if err := cmd.Start(); err != nil {
-			log.Printf("SSE: Failed to start %s watch: %v", resourceType, err)
+			slog.Error("failed to start watch", "component", "sse", "resource", resourceType, "error", err)
 			w.handleWatchError(resourceType)
 			continue
 		}
@@ -170,14 +170,14 @@ func (w *KubectlWatcher) watchResource(resourceType string, parser func([]byte,
 		}

 		if err := scanner.Err(); err != nil {
-			log.Printf("SSE: %s watch scanner error: %v", resourceType, err)
+			slog.Error("watch scanner error", "component", "sse", "resource", resourceType, "error", err)
 		}

-		cmd.Wait()
+		_ = cmd.Wait()

 		// If context not cancelled, restart after a delay
 		if w.ctx.Err() == nil {
-			log.Printf("SSE: Restarting %s watcher for instance %s", resourceType, w.instanceName)
+			slog.Info("restarting watcher", "component", "sse", "resource", resourceType, "instance", w.instanceName)
 			time.Sleep(5 * time.Second)
 		}
 	}
@@ -186,7 +186,7 @@ func (w *KubectlWatcher) watchResource(resourceType string, parser func([]byte,
 // parsePodEvent parses pod watch events
 func (w *KubectlWatcher) parsePodEvent(data []byte, resourceType string) {
 	var event struct {
-		Type   string `json:"type"`   // ADDED, MODIFIED, DELETED
+		Type   string `json:"type"` // ADDED, MODIFIED, DELETED
 		Object struct {
 			Metadata struct {
 				Name      string            `json:"name"`
@@ -503,7 +503,7 @@ func (w *KubectlWatcher) handleWatchError(resourceType string) {
 func (w *KubectlWatcher) Stop() {
 	w.cancel()
 	w.wg.Wait()
-	log.Printf("SSE: Stopped kubectl watchers for instance %s", w.instanceName)
+	slog.Info("stopped kubectl watchers", "component", "sse", "instance", w.instanceName)
 }

 // TalosWatcher watches Talos events using talosctl
@@ -532,7 +532,7 @@ func NewTalosWatcher(instanceName, talosconfig, nodeIP string, manager *Manager)
 // Start begins watching Talos events
 func (w *TalosWatcher) Start() error {
 	go w.watchEvents()
-	log.Printf("SSE: Started talos watcher for instance %s", w.instanceName)
+	slog.Info("started talos watcher", "component", "sse", "instance", w.instanceName)
 	return nil
 }

@@ -557,13 +557,13 @@ func (w *TalosWatcher) watchEvents() {

 		stdout, err := cmd.StdoutPipe()
 		if err != nil {
-			log.Printf("SSE: Failed to create stdout pipe for Talos events: %v", err)
+			slog.Error("failed to create stdout pipe for talos events", "component", "sse", "instance", w.instanceName, "nodeIP", w.nodeIP, "error", err)
 			time.Sleep(10 * time.Second)
 			continue
 		}

 		if err := cmd.Start(); err != nil {
-			log.Printf("SSE: Failed to start Talos event watch: %v", err)
+			slog.Error("failed to start talos event watch", "component", "sse", "instance", w.instanceName, "nodeIP", w.nodeIP, "error", err)
 			time.Sleep(10 * time.Second)
 			continue
 		}
@@ -599,11 +599,11 @@ func (w *TalosWatcher) watchEvents() {
 			}
 		}

-		cmd.Wait()
+		_ = cmd.Wait()

 		// If context not cancelled, restart after a delay
 		if w.ctx.Err() == nil {
-			log.Printf("SSE: Restarting talos watcher for instance %s", w.instanceName)
+			slog.Info("restarting talos watcher", "component", "sse", "instance", w.instanceName)
 			time.Sleep(10 * time.Second)
 		}
 	}
@@ -612,5 +612,5 @@ func (w *TalosWatcher) watchEvents() {
 // Stop stops the watcher
 func (w *TalosWatcher) Stop() {
 	w.cancel()
-	log.Printf("SSE: Stopped talos watcher for instance %s", w.instanceName)
-}
+	slog.Info("stopped talos watcher", "component", "sse", "instance", w.instanceName)
+}
--- a/api/internal/sse/watchers_test.go
+++ b/api/internal/sse/watchers_test.go
@@ -431,4 +431,4 @@ func BenchmarkJSONParsing(b *testing.B) {
 	for i := 0; i < b.N; i++ {
 		watcher.parsePodEvent([]byte(podJSON), "test-instance")
 	}
-}
+}
--- a/api/internal/tools/gomplate.go
+++ b/api/internal/tools/gomplate.go
@@ -95,6 +95,23 @@ func (g *Gomplate) RenderWithContext(templatePath, outputPath string, context ma
 	return nil
 }

+// RenderTemplate renders an inline template with context YAML piped via stdin.
+// This avoids temp files — context is passed as stdin, template as -i flag.
+func (g *Gomplate) RenderTemplate(template, contextYAML string) (string, error) {
+	cmd := exec.Command(g.gomplatePath, "-i", template, "-c", ".=stdin:///in.yaml")
+	cmd.Stdin = strings.NewReader(contextYAML)
+
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	if err := cmd.Run(); err != nil {
+		return "", fmt.Errorf("gomplate render template failed: %w, stderr: %s", err, stderr.String())
+	}
+
+	return stdout.String(), nil
+}
+
 // Exec executes gomplate with arbitrary arguments
 func (g *Gomplate) Exec(args ...string) (string, error) {
 	cmd := exec.Command(g.gomplatePath, args...)
--- a/api/internal/tools/gomplate_stdin_test.go
+++ b/api/internal/tools/gomplate_stdin_test.go
@@ -0,0 +1,372 @@
+package tools
+
+import (
+	"bytes"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+// This test suite validates that gomplate can receive context data via stdin
+// (stdin:///in.yaml) with template via -i flag, producing identical results
+// to the current temp-file approach. Each test mirrors a real usage pattern
+// in the codebase.
+
+// renderTemplateViaStdin is the NEW pattern: context via stdin, template via -i.
+func renderTemplateViaStdin(t *testing.T, gomplatePath, template, contextYAML string) (string, error) {
+	t.Helper()
+
+	cmd := exec.Command(gomplatePath, "-i", template, "-c", ".=stdin:///in.yaml")
+	cmd.Stdin = strings.NewReader(contextYAML)
+
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	if err := cmd.Run(); err != nil {
+		return "", &testError{err: err, stderr: stderr.String()}
+	}
+	return strings.TrimSpace(stdout.String()), nil
+}
+
+// renderTemplateViaFile is the CURRENT pattern: context via temp file, template via -i.
+func renderTemplateViaFile(t *testing.T, gomplatePath, template, contextYAML string) (string, error) {
+	t.Helper()
+
+	tmpFile := filepath.Join(t.TempDir(), "context.yaml")
+	if err := os.WriteFile(tmpFile, []byte(contextYAML), 0644); err != nil {
+		t.Fatalf("failed to write temp file: %v", err)
+	}
+
+	cmd := exec.Command(gomplatePath, "-i", template, "-c", ".="+tmpFile)
+
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	if err := cmd.Run(); err != nil {
+		return "", &testError{err: err, stderr: stderr.String()}
+	}
+	return strings.TrimSpace(stdout.String()), nil
+}
+
+type testError struct {
+	err    error
+	stderr string
+}
+
+func (e *testError) Error() string {
+	return e.err.Error() + ": " + e.stderr
+}
+
+func getGomplatePath(t *testing.T) string {
+	t.Helper()
+	path, err := exec.LookPath("gomplate")
+	if err != nil {
+		t.Skip("gomplate not installed, skipping test")
+	}
+	return path
+}
+
+// assertEquivalent runs both stdin and file approaches and verifies identical output.
+func assertEquivalent(t *testing.T, gomplatePath, name, template, context, expected string) {
+	t.Helper()
+	t.Run(name, func(t *testing.T) {
+		fileResult, err := renderTemplateViaFile(t, gomplatePath, template, context)
+		if err != nil {
+			t.Fatalf("file-based render failed: %v", err)
+		}
+
+		stdinResult, err := renderTemplateViaStdin(t, gomplatePath, template, context)
+		if err != nil {
+			t.Fatalf("stdin-based render failed: %v", err)
+		}
+
+		if fileResult != stdinResult {
+			t.Errorf("file vs stdin differ:\n  file:  %q\n  stdin: %q", fileResult, stdinResult)
+		}
+
+		if expected != "" && stdinResult != expected {
+			t.Errorf("unexpected result:\n  got:  %q\n  want: %q", stdinResult, expected)
+		}
+	})
+}
+
+// --- Pattern 1: processSecretTemplate (apps.go:527) ---
+// Inline template from manifest defaultSecrets, context is full config with app+secrets keys.
+
+func TestStdinContext_SecretTemplate(t *testing.T) {
+	gp := getGomplatePath(t)
+
+	template := "postgresql://{{ .app.db.user }}:{{ .secrets.dbPassword }}@{{ .app.db.host }}:5432/{{ .app.db.name }}"
+	context := `cloud:
+  domain: cloud.example.com
+cluster:
+  name: test
+apps:
+  myapp:
+    db:
+      host: postgres.postgres.svc.cluster.local
+      name: myapp
+      user: myapp
+app:
+  db:
+    host: postgres.postgres.svc.cluster.local
+    name: myapp
+    user: myapp
+secrets:
+  dbPassword: supersecret
+`
+	assertEquivalent(t, gp, "db-url", template, context,
+		"postgresql://myapp:supersecret@postgres.postgres.svc.cluster.local:5432/myapp")
+}
+
+// --- Pattern 2: processValueNode (apps.go:2412) ---
+// Single scalar values from manifest defaultConfig.
+
+func TestStdinContext_ValueTemplates(t *testing.T) {
+	gp := getGomplatePath(t)
+
+	context := `operator:
+  email: test@example.com
+cloud:
+  domain: cloud.example.com
+  router:
+    ip: 192.168.1.1
+cluster:
+  name: test
+apps:
+  postgres:
+    host: postgres.postgres.svc.cluster.local
+  myapp:
+    namespace: myapp
+app:
+  namespace: myapp
+`
+
+	assertEquivalent(t, gp, "simple field", "{{ .cloud.domain }}", context, "cloud.example.com")
+	assertEquivalent(t, gp, "string concat", "immich.{{ .cloud.domain }}", context, "immich.cloud.example.com")
+	assertEquivalent(t, gp, "cross-app ref", "{{ .apps.postgres.host }}", context, "postgres.postgres.svc.cluster.local")
+	assertEquivalent(t, gp, "app self-ref", "{{ .app.namespace }}", context, "myapp")
+	assertEquivalent(t, gp, "global router ip", "{{ .cloud.router.ip }}", context, "192.168.1.1")
+	assertEquivalent(t, gp, "global operator email", "{{ .operator.email }}", context, "test@example.com")
+}
+
+// --- Pattern 3: generateNodePatch (node.go:529) ---
+// Multi-line YAML template with index function and nested map access.
+
+func TestStdinContext_NodePatchTemplate(t *testing.T) {
+	gp := getGomplatePath(t)
+
+	template := `machine:
+  install:
+    disk: {{ index .cluster.nodes.active "test-node" "disk" }}
+    image: factory.talos.dev/metal-installer/{{ .cluster.nodes.talos.schematicId }}:{{ .cluster.nodes.talos.version }}
+  network:
+    hostname: "test-node"
+    interfaces:
+      - interface: {{ index .cluster.nodes.active "test-node" "interface" }}
+        addresses:
+          - "192.168.1.100/24"
+        routes:
+          - network: 0.0.0.0/0
+            gateway: {{ .cloud.router.ip }}`
+
+	context := `operator:
+  email: test@example.com
+cloud:
+  router:
+    ip: 192.168.1.1
+cluster:
+  name: test-cloud
+  nodes:
+    talos:
+      version: v1.13.0
+      schematicId: abc123def456
+    control:
+      vip: 192.168.1.100
+    active:
+      test-node:
+        role: controlplane
+        interface: eth0
+        disk: /dev/sda
+`
+
+	fileResult, err := renderTemplateViaFile(t, gp, template, context)
+	if err != nil {
+		t.Fatalf("file-based render failed: %v", err)
+	}
+
+	stdinResult, err := renderTemplateViaStdin(t, gp, template, context)
+	if err != nil {
+		t.Fatalf("stdin-based render failed: %v", err)
+	}
+
+	if fileResult != stdinResult {
+		t.Errorf("file vs stdin differ:\n  file:\n%s\n  stdin:\n%s", fileResult, stdinResult)
+	}
+
+	// Verify specific values in the output
+	checks := map[string]string{
+		"gateway":   "gateway: 192.168.1.1",
+		"disk":      "disk: /dev/sda",
+		"interface": "interface: eth0",
+		"image":     "abc123def456:v1.13.0",
+	}
+	for name, substr := range checks {
+		if !strings.Contains(stdinResult, substr) {
+			t.Errorf("missing %s (%q) in output:\n%s", name, substr, stdinResult)
+		}
+	}
+}
+
+// --- Pattern 4: gomplate-specific functions (random.AlphaNum) ---
+
+func TestStdinContext_GomplateRandomFunction(t *testing.T) {
+	gp := getGomplatePath(t)
+
+	template := "{{ random.AlphaNum 32 }}"
+	context := "{}"
+
+	result, err := renderTemplateViaStdin(t, gp, template, context)
+	if err != nil {
+		t.Fatalf("render failed: %v", err)
+	}
+
+	if len(result) != 32 {
+		t.Errorf("expected 32-char random string, got %d chars: %q", len(result), result)
+	}
+}
+
+// --- Pattern 5: if/eq conditionals ---
+
+func TestStdinContext_Conditional(t *testing.T) {
+	gp := getGomplatePath(t)
+
+	template := `{{ if eq .smtp.tls "true" }}tls{{ else }}none{{ end }}`
+
+	assertEquivalent(t, gp, "true case", template, `smtp: {tls: "true"}`, "tls")
+	assertEquivalent(t, gp, "false case", template, `smtp: {tls: "false"}`, "none")
+}
+
+// --- Pattern 6: Empty/minimal context ---
+
+func TestStdinContext_EmptyContext(t *testing.T) {
+	gp := getGomplatePath(t)
+
+	assertEquivalent(t, gp, "static text", "hello world", "{}", "hello world")
+}
+
+// --- Pattern 7: Merged global+instance config scenario ---
+// Tests the actual use case: global config (router, operator) merged with
+// instance config (cloud domain, cluster, apps).
+
+func TestStdinContext_MergedConfigScenario(t *testing.T) {
+	gp := getGomplatePath(t)
+
+	// This is what LoadMergedInstanceConfig would produce
+	context := `operator:
+  email: paul@example.com
+cloud:
+  router:
+    ip: 192.168.8.1
+    dynamicDns: example.ddns.net
+  domain: cloud.example.com
+  internalDomain: internal.cloud.example.com
+  baseDomain: example.com
+cluster:
+  name: test-cloud
+  loadBalancerIp: 192.168.8.80
+  nodes:
+    control:
+      vip: 192.168.8.30
+apps:
+  metallb:
+    loadBalancerIp: 192.168.8.80
+`
+
+	assertEquivalent(t, gp, "global: router ip", "{{ .cloud.router.ip }}", context, "192.168.8.1")
+	assertEquivalent(t, gp, "global: operator email", "{{ .operator.email }}", context, "paul@example.com")
+	assertEquivalent(t, gp, "global: dynamic dns", "{{ .cloud.router.dynamicDns }}", context, "example.ddns.net")
+	assertEquivalent(t, gp, "instance: cloud domain", "{{ .cloud.domain }}", context, "cloud.example.com")
+	assertEquivalent(t, gp, "instance: internal domain", "{{ .cloud.internalDomain }}", context, "internal.cloud.example.com")
+	assertEquivalent(t, gp, "instance: cluster name", "{{ .cluster.name }}", context, "test-cloud")
+	assertEquivalent(t, gp, "instance: app config", "{{ .apps.metallb.loadBalancerIp }}", context, "192.168.8.80")
+	assertEquivalent(t, gp, "mixed: compose both", "https://{{ .cloud.domain }} via {{ .cloud.router.ip }}", context, "https://cloud.example.com via 192.168.8.1")
+}
+
+// --- Pattern 8: Multi-line output preservation ---
+
+func TestStdinContext_MultilineOutput(t *testing.T) {
+	gp := getGomplatePath(t)
+
+	template := `line1: {{ .a }}
+line2: {{ .b }}
+nested:
+  line3: {{ .c }}`
+	context := `a: alpha
+b: beta
+c: gamma
+`
+
+	fileResult, err := renderTemplateViaFile(t, gp, template, context)
+	if err != nil {
+		t.Fatalf("file render failed: %v", err)
+	}
+
+	stdinResult, err := renderTemplateViaStdin(t, gp, template, context)
+	if err != nil {
+		t.Fatalf("stdin render failed: %v", err)
+	}
+
+	if fileResult != stdinResult {
+		t.Errorf("file vs stdin differ:\n  file:\n%s\n  stdin:\n%s", fileResult, stdinResult)
+	}
+
+	if !strings.Contains(stdinResult, "line1: alpha") {
+		t.Error("line1 not rendered correctly")
+	}
+	if !strings.Contains(stdinResult, "line2: beta") {
+		t.Error("line2 not rendered correctly")
+	}
+	if !strings.Contains(stdinResult, "line3: gamma") {
+		t.Error("line3 not rendered correctly")
+	}
+}
+
+// --- Pattern 9: Large context (realistic instance config size) ---
+// Ensures stdin handles larger YAML payloads without issues.
+
+func TestStdinContext_LargeContext(t *testing.T) {
+	gp := getGomplatePath(t)
+
+	// Build a realistic-sized context with many apps
+	var sb strings.Builder
+	sb.WriteString("cloud:\n  domain: cloud.example.com\n  router:\n    ip: 192.168.1.1\n")
+	sb.WriteString("cluster:\n  name: test\n")
+	sb.WriteString("apps:\n")
+	for i := 0; i < 20; i++ {
+		name := "app" + string(rune('a'+i))
+		sb.WriteString("  " + name + ":\n")
+		sb.WriteString("    namespace: " + name + "\n")
+		sb.WriteString("    domain: " + name + ".cloud.example.com\n")
+		sb.WriteString("    storage: 10Gi\n")
+		sb.WriteString("    db:\n")
+		sb.WriteString("      host: postgres.svc.cluster.local\n")
+		sb.WriteString("      port: \"5432\"\n")
+		sb.WriteString("      name: " + name + "db\n")
+		sb.WriteString("      user: " + name + "user\n")
+	}
+
+	template := "{{ .cloud.domain }} {{ .cloud.router.ip }}"
+	result, err := renderTemplateViaStdin(t, gp, template, sb.String())
+	if err != nil {
+		t.Fatalf("render failed: %v", err)
+	}
+
+	if result != "cloud.example.com 192.168.1.1" {
+		t.Errorf("unexpected result: %q", result)
+	}
+}
--- a/api/internal/tools/kubectl_test.go
+++ b/api/internal/tools/kubectl_test.go
@@ -25,8 +25,7 @@ func TestNewKubectl(t *testing.T) {
 			k := NewKubectl(tt.kubeconfigPath)
 			if k == nil {
 				t.Fatal("NewKubectl() returned nil")
-			}
-			if k.kubeconfigPath != tt.kubeconfigPath {
+			} else if k.kubeconfigPath != tt.kubeconfigPath {
 				t.Errorf("kubeconfigPath = %q, want %q", k.kubeconfigPath, tt.kubeconfigPath)
 			}
 		})
@@ -209,9 +208,8 @@ func TestKubectlGetDeployment(t *testing.T) {
 			if err == nil {
 				if depInfo == nil {
 					t.Fatal("GetDeployment() returned nil without error")
-				}
-				// Desired should be non-negative
-				if depInfo.Desired < 0 {
+				} else if depInfo.Desired < 0 {
+					// Desired should be non-negative
 					t.Errorf("Desired = %d, should be non-negative", depInfo.Desired)
 				}
 			}
@@ -244,19 +242,19 @@ func TestKubectlGetReplicas(t *testing.T) {
 			if err == nil {
 				if replicaInfo == nil {
 					t.Fatal("GetReplicas() returned nil without error")
-				}
-				// All values should be non-negative
-				if replicaInfo.Desired < 0 {
-					t.Error("Desired < 0")
-				}
-				if replicaInfo.Current < 0 {
-					t.Error("Current < 0")
-				}
-				if replicaInfo.Ready < 0 {
-					t.Error("Ready < 0")
-				}
-				if replicaInfo.Available < 0 {
-					t.Error("Available < 0")
+				} else {
+					if replicaInfo.Desired < 0 {
+						t.Error("Desired < 0")
+					}
+					if replicaInfo.Current < 0 {
+						t.Error("Current < 0")
+					}
+					if replicaInfo.Ready < 0 {
+						t.Error("Ready < 0")
+					}
+					if replicaInfo.Available < 0 {
+						t.Error("Available < 0")
+					}
 				}
 			}
 		})
@@ -775,4 +773,3 @@ func TestKubectlGetPodsByLabel(t *testing.T) {
 		})
 	}
 }
-
--- a/api/internal/tools/talosctl.go
+++ b/api/internal/tools/talosctl.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"os/exec"
+	"regexp"
 	"runtime"
 	"strings"
 	"time"
@@ -385,6 +386,30 @@ func (t *Talosctl) Upgrade(nodeIP, image string, preserve bool) error {
 	return nil
 }

+// Reboot reboots a node. The node restarts without wiping state.
+func (t *Talosctl) Reboot(nodeIP string) error {
+	args := t.buildArgs([]string{
+		"reboot",
+		"--nodes", nodeIP,
+	})
+
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, "talosctl", args...)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		outputStr := string(output)
+		// Connection errors are expected — the node is rebooting
+		if strings.Contains(outputStr, "connection refused") || strings.Contains(outputStr, "Unavailable") || strings.Contains(outputStr, "EOF") {
+			return nil
+		}
+		return fmt.Errorf("talosctl reboot failed: %w\nOutput: %s", err, outputStr)
+	}
+
+	return nil
+}
+
 // Rollback reverts a node to its previous Talos version.
 // Talos uses an A/B image scheme, so rollback restores the previous boot image.
 func (t *Talosctl) Rollback(nodeIP string) error {
@@ -405,6 +430,104 @@ func (t *Talosctl) Rollback(nodeIP string) error {
 	return nil
 }

+// Reset resets a node to maintenance mode, wiping state and rebooting.
+// The node will reboot into maintenance mode after the reset.
+func (t *Talosctl) Reset(nodeIP string) error {
+	args := t.buildArgs([]string{
+		"reset",
+		"--nodes", nodeIP,
+		"--graceful=false",
+		"--reboot",
+	})
+
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, "talosctl", args...)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		outputStr := string(output)
+		// Connection errors are expected — the node is rebooting after reset
+		if strings.Contains(outputStr, "connection refused") ||
+			strings.Contains(outputStr, "Unavailable") ||
+			strings.Contains(outputStr, "EOF") {
+			return nil
+		}
+		return fmt.Errorf("talosctl reset failed: %w\nOutput: %s", err, outputStr)
+	}
+
+	return nil
+}
+
+// EtcdRemoveMember removes an etcd member by hostname, executed from a healthy node.
+// It first looks up the member ID from the etcd member list, then removes by ID.
+func (t *Talosctl) EtcdRemoveMember(fromNodeIP, memberHostname string) error {
+	// Step 1: Get member list to find the member ID
+	memberID, err := t.etcdFindMemberID(fromNodeIP, memberHostname)
+	if err != nil {
+		return fmt.Errorf("failed to find etcd member ID for %s: %w", memberHostname, err)
+	}
+	if memberID == "" {
+		// Member not in the list — already removed
+		return nil
+	}
+
+	// Step 2: Remove by member ID
+	args := t.buildArgs([]string{
+		"etcd", "remove-member",
+		"--nodes", fromNodeIP,
+		memberID,
+	})
+
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, "talosctl", args...)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		outputStr := string(output)
+		if strings.Contains(outputStr, "not found") || strings.Contains(outputStr, "member not found") {
+			return nil
+		}
+		return fmt.Errorf("talosctl etcd remove-member failed: %w\nOutput: %s", err, outputStr)
+	}
+
+	return nil
+}
+
+// etcdFindMemberID looks up an etcd member's ID by hostname from the member list.
+// Returns empty string if the member is not found.
+func (t *Talosctl) etcdFindMemberID(fromNodeIP, hostname string) (string, error) {
+	args := t.buildArgs([]string{
+		"etcd", "members",
+		"--nodes", fromNodeIP,
+	})
+
+	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, "talosctl", args...)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		return "", fmt.Errorf("talosctl etcd members failed: %w\nOutput: %s", err, string(output))
+	}
+
+	// Parse tabular output:
+	// NODE           ID                 HOSTNAME         PEER URLS                   CLIENT URLS                 LEARNER
+	// 192.168.8.33   10996c6c8373517e   test-control-3   https://192.168.8.33:2380   https://192.168.8.33:2379   false
+	for _, line := range strings.Split(string(output), "\n") {
+		fields := strings.Fields(line)
+		if len(fields) < 3 || fields[0] == "NODE" {
+			continue
+		}
+		if fields[2] == hostname {
+			return fields[1], nil
+		}
+	}
+
+	return "", nil
+}
+
 // Validate checks if talosctl is available
 func (t *Talosctl) Validate() error {
 	cmd := exec.Command("talosctl", "version", "--client")
@@ -462,3 +585,172 @@ func GetClientInfo() (*ClientInfo, error) {
 		Arch:    arch,
 	}, nil
 }
+
+// ServiceStatus represents the health status of a Talos service
+type ServiceStatus struct {
+	ID            string `json:"id"`
+	State         string `json:"state"`
+	Healthy       bool   `json:"healthy"`
+	HealthMessage string `json:"healthMessage"`
+}
+
+// DmesgError represents a critical error found in kernel messages
+type DmesgError struct {
+	Severity  string `json:"severity"`
+	Message   string `json:"message"`
+	Timestamp string `json:"timestamp"`
+}
+
+// GetServices queries Talos service statuses from a node
+func (t *Talosctl) GetServices(nodeIP string) ([]ServiceStatus, error) {
+	args := t.buildArgs([]string{
+		"service",
+		"--nodes", nodeIP,
+	})
+
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, "talosctl", args...)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		return nil, fmt.Errorf("talosctl service failed: %w\nOutput: %s", err, string(output))
+	}
+
+	return ParseServiceOutput(string(output)), nil
+}
+
+// ParseServiceOutput parses the tabular output of `talosctl service`
+func ParseServiceOutput(output string) []ServiceStatus {
+	var services []ServiceStatus
+	lines := strings.Split(output, "\n")
+
+	for _, line := range lines {
+		// Skip header and empty lines
+		if line == "" || strings.HasPrefix(line, "NODE") {
+			continue
+		}
+
+		fields := strings.Fields(line)
+		// Format: NODE SERVICE STATE HEALTH LAST_CHANGE LAST_CHANGE_UNIT LAST_EVENT...
+		// Minimum: node + service + state + health + last_change + unit = 6 fields
+		if len(fields) < 6 {
+			continue
+		}
+
+		id := fields[1]
+		state := fields[2]
+		health := fields[3]
+
+		// Extract last event (everything after the time fields)
+		// Fields 4 and 5 are "Xm Ys ago", event starts at field 6+
+		var healthMessage string
+		if len(fields) > 6 {
+			healthMessage = strings.Join(fields[6:], " ")
+		}
+
+		services = append(services, ServiceStatus{
+			ID:            id,
+			State:         state,
+			Healthy:       health == "OK",
+			HealthMessage: healthMessage,
+		})
+	}
+
+	return services
+}
+
+// GetDmesg retrieves kernel messages from a node
+func (t *Talosctl) GetDmesg(nodeIP string) (string, error) {
+	args := t.buildArgs([]string{
+		"dmesg",
+		"--nodes", nodeIP,
+	})
+
+	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, "talosctl", args...)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		return "", fmt.Errorf("talosctl dmesg failed: %w\nOutput: %s", err, string(output))
+	}
+
+	return string(output), nil
+}
+
+// dmesg error patterns for hardware issues
+var dmesgErrorPatterns = []string{
+	"I/O error",
+	"Medium Error",
+	"failed command:",
+	"auto reallocate failed",
+	"memory error",
+	"machine check",
+	"ECC error",
+}
+
+var ataErrorRegex = regexp.MustCompile(`(?i)ata\d+.*error`)
+
+// ParseDmesgErrors scans dmesg output for critical hardware errors
+func ParseDmesgErrors(raw string) []DmesgError {
+	var errors []DmesgError
+	seen := make(map[string]bool)
+
+	for _, line := range strings.Split(raw, "\n") {
+		if line == "" {
+			continue
+		}
+
+		matched := false
+		lower := strings.ToLower(line)
+		for _, pattern := range dmesgErrorPatterns {
+			if strings.Contains(lower, strings.ToLower(pattern)) {
+				matched = true
+				break
+			}
+		}
+		if !matched && ataErrorRegex.MatchString(line) {
+			matched = true
+		}
+		if !matched {
+			continue
+		}
+
+		// Extract timestamp: format is "IP: facility: level: [TIMESTAMP]: message"
+		timestamp, message := parseDmesgLine(line)
+
+		// Deduplicate identical messages
+		if seen[message] {
+			continue
+		}
+		seen[message] = true
+
+		errors = append(errors, DmesgError{
+			Severity:  "error",
+			Message:   message,
+			Timestamp: timestamp,
+		})
+	}
+
+	return errors
+}
+
+// parseDmesgLine extracts timestamp and message from a talosctl dmesg line
+// Format: "192.168.8.32: kern:     err: [2026-05-25T07:12:06.034Z]: I/O error, dev sdb..."
+func parseDmesgLine(line string) (timestamp, message string) {
+	// Find timestamp in brackets
+	start := strings.Index(line, "[")
+	end := strings.Index(line, "]:")
+	if start >= 0 && end > start {
+		timestamp = line[start+1 : end]
+		// Message is everything after "]: "
+		if end+2 < len(line) {
+			message = strings.TrimSpace(line[end+2:])
+		}
+	}
+	if message == "" {
+		message = line
+	}
+	return
+}
--- a/api/internal/tools/talosctl_test.go
+++ b/api/internal/tools/talosctl_test.go
@@ -11,8 +11,7 @@ func TestNewTalosctl(t *testing.T) {
 		tc := NewTalosctl()
 		if tc == nil {
 			t.Fatal("NewTalosctl() returned nil")
-		}
-		if tc.talosconfigPath != "" {
+		} else if tc.talosconfigPath != "" {
 			t.Error("talosconfigPath should be empty for NewTalosctl()")
 		}
 	})
@@ -22,8 +21,7 @@ func TestNewTalosctl(t *testing.T) {
 		tc := NewTalosconfigWithConfig(configPath)
 		if tc == nil {
 			t.Fatal("NewTalosconfigWithConfig() returned nil")
-		}
-		if tc.talosconfigPath != configPath {
+		} else if tc.talosconfigPath != configPath {
 			t.Errorf("talosconfigPath = %q, want %q", tc.talosconfigPath, configPath)
 		}
 	})
@@ -433,9 +431,9 @@ Server:
 			want: "v1.11.5",
 		},
 		{
-			name: "fallback to Talos line when no Tag present",
+			name:   "fallback to Talos line when no Tag present",
 			output: `Talos v1.12.0`,
-			want: "v1.12.0",
+			want:   "v1.12.0",
 		},
 	}

@@ -449,6 +447,74 @@ Server:
 	}
 }

+func TestTalosconfigReset(t *testing.T) {
+	t.Run("builds correct args with talosconfig", func(t *testing.T) {
+		tc := &Talosctl{talosconfigPath: "/path/to/talosconfig"}
+		args := tc.buildArgs([]string{
+			"reset",
+			"--nodes", "192.168.1.100",
+			"--graceful=false",
+			"--reboot",
+		})
+
+		// Should have talosconfig prepended
+		if len(args) != 7 {
+			t.Fatalf("expected 7 args, got %d: %v", len(args), args)
+		}
+		if args[0] != "--talosconfig" || args[1] != "/path/to/talosconfig" {
+			t.Error("expected --talosconfig prefix")
+		}
+		if args[2] != "reset" {
+			t.Errorf("args[2] = %q, want 'reset'", args[2])
+		}
+		if args[5] != "--graceful=false" {
+			t.Errorf("args[5] = %q, want '--graceful=false'", args[5])
+		}
+		if args[6] != "--reboot" {
+			t.Errorf("args[6] = %q, want '--reboot'", args[6])
+		}
+	})
+}
+
+func TestEtcdFindMemberID(t *testing.T) {
+	t.Run("builds correct args for etcd members", func(t *testing.T) {
+		tc := &Talosctl{talosconfigPath: "/path/to/talosconfig"}
+		args := tc.buildArgs([]string{
+			"etcd", "members",
+			"--nodes", "192.168.1.101",
+		})
+
+		if len(args) != 6 {
+			t.Fatalf("expected 6 args, got %d: %v", len(args), args)
+		}
+		if args[0] != "--talosconfig" || args[1] != "/path/to/talosconfig" {
+			t.Error("expected --talosconfig prefix")
+		}
+		if args[2] != "etcd" || args[3] != "members" {
+			t.Errorf("expected 'etcd members', got %q %q", args[2], args[3])
+		}
+	})
+
+	t.Run("builds correct args for etcd remove-member", func(t *testing.T) {
+		tc := &Talosctl{talosconfigPath: "/path/to/talosconfig"}
+		args := tc.buildArgs([]string{
+			"etcd", "remove-member",
+			"--nodes", "192.168.1.101",
+			"f742041aecc26912",
+		})
+
+		if len(args) != 7 {
+			t.Fatalf("expected 7 args, got %d: %v", len(args), args)
+		}
+		if args[2] != "etcd" || args[3] != "remove-member" {
+			t.Errorf("expected 'etcd remove-member', got %q %q", args[2], args[3])
+		}
+		if args[6] != "f742041aecc26912" {
+			t.Errorf("args[6] = %q, want member ID 'f742041aecc26912'", args[6])
+		}
+	})
+}
+
 func TestTalosconfigValidate(t *testing.T) {
 	t.Run("validate checks for talosctl", func(t *testing.T) {
 		tc := NewTalosctl()
@@ -619,6 +685,183 @@ func TestGetClientInfo(t *testing.T) {
 	}
 }

+func TestParseServiceOutput(t *testing.T) {
+	tests := []struct {
+		name     string
+		output   string
+		wantLen  int
+		checkSvc func(t *testing.T, services []ServiceStatus)
+	}{
+		{
+			name: "healthy node",
+			output: `NODE           SERVICE      STATE     HEALTH   LAST CHANGE      LAST EVENT
+192.168.8.33   apid         Running   OK       172h15m25s ago   Health check successful
+192.168.8.33   etcd         Running   OK       172h14m56s ago   Health check successful
+192.168.8.33   kubelet      Running   OK       172h15m16s ago   Health check successful`,
+			wantLen: 3,
+			checkSvc: func(t *testing.T, services []ServiceStatus) {
+				for _, svc := range services {
+					if !svc.Healthy {
+						t.Errorf("service %s should be healthy", svc.ID)
+					}
+					if svc.State != "Running" {
+						t.Errorf("service %s state = %q, want Running", svc.ID, svc.State)
+					}
+				}
+			},
+		},
+		{
+			name: "unhealthy etcd",
+			output: `NODE           SERVICE      STATE     HEALTH   LAST CHANGE   LAST EVENT
+192.168.8.32   etcd         Running   Fail     42m14s ago    Health check failed: context deadline exceeded
+192.168.8.32   kubelet      Running   OK       37m42s ago    Health check successful`,
+			wantLen: 2,
+			checkSvc: func(t *testing.T, services []ServiceStatus) {
+				for _, svc := range services {
+					if svc.ID == "etcd" {
+						if svc.Healthy {
+							t.Error("etcd should be unhealthy")
+						}
+						if svc.HealthMessage != "Health check failed: context deadline exceeded" {
+							t.Errorf("etcd health message = %q", svc.HealthMessage)
+						}
+					}
+					if svc.ID == "kubelet" && !svc.Healthy {
+						t.Error("kubelet should be healthy")
+					}
+				}
+			},
+		},
+		{
+			name: "services with unknown health",
+			output: `NODE           SERVICE      STATE     HEALTH   LAST CHANGE      LAST EVENT
+192.168.8.32   dashboard    Running   ?        42m47s ago       Process Process(["/sbin/dashboard"]) started with PID 2237`,
+			wantLen: 1,
+			checkSvc: func(t *testing.T, services []ServiceStatus) {
+				if services[0].Healthy {
+					t.Error("service with ? health should not be marked healthy")
+				}
+			},
+		},
+		{
+			name:    "empty output",
+			output:  "",
+			wantLen: 0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			services := ParseServiceOutput(tt.output)
+			if len(services) != tt.wantLen {
+				t.Errorf("ParseServiceOutput() returned %d services, want %d", len(services), tt.wantLen)
+				return
+			}
+			if tt.checkSvc != nil {
+				tt.checkSvc(t, services)
+			}
+		})
+	}
+}
+
+func TestParseDmesgErrors(t *testing.T) {
+	tests := []struct {
+		name    string
+		input   string
+		wantLen int
+		check   func(t *testing.T, errors []DmesgError)
+	}{
+		{
+			name: "disk I/O errors",
+			input: `192.168.8.32: kern:     err: [2026-05-25T07:12:06.034Z]: I/O error, dev sdb, sector 4873848
+192.168.8.32: kern:    info: [2026-05-25T07:12:06.040Z]: sd 1:0:0:0: [sdb] Sense Key : Medium Error [current]
+192.168.8.32: kern:     err: [2026-05-25T07:12:10.886Z]: ata1.00: failed command: READ FPDMA QUEUED
+192.168.8.32: kern:    info: [2026-05-25T07:12:14.072Z]: sd 1:0:0:0: Add. Sense: Unrecovered read error - auto reallocate failed`,
+			wantLen: 4,
+			check: func(t *testing.T, errors []DmesgError) {
+				if errors[0].Timestamp != "2026-05-25T07:12:06.034Z" {
+					t.Errorf("timestamp = %q", errors[0].Timestamp)
+				}
+				if errors[0].Severity != "error" {
+					t.Errorf("severity = %q, want error", errors[0].Severity)
+				}
+			},
+		},
+		{
+			name: "ata error pattern",
+			input: `192.168.8.32: kern:     err: [2026-05-25T07:12:06.034Z]: ata1.00: error: { UNC }`,
+			wantLen: 1,
+		},
+		{
+			name: "no errors in normal output",
+			input: `192.168.8.32: kern:    info: [2026-05-25T07:11:00.000Z]: Linux version 6.18.24-talos
+192.168.8.32: kern:    info: [2026-05-25T07:11:00.100Z]: Command line: init_on_alloc=1
+192.168.8.32: kern:    info: [2026-05-25T07:11:01.000Z]: sdb: sdb1 sdb2 sdb3 sdb4`,
+			wantLen: 0,
+		},
+		{
+			name:    "empty input",
+			input:   "",
+			wantLen: 0,
+		},
+		{
+			name: "deduplicates identical messages",
+			input: `192.168.8.32: kern:     err: [2026-05-25T07:12:06.034Z]: I/O error, dev sdb, sector 4873848
+192.168.8.32: kern:     err: [2026-05-25T07:12:10.034Z]: I/O error, dev sdb, sector 4873848`,
+			wantLen: 1,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			errors := ParseDmesgErrors(tt.input)
+			if len(errors) != tt.wantLen {
+				t.Errorf("ParseDmesgErrors() returned %d errors, want %d", len(errors), tt.wantLen)
+				for _, e := range errors {
+					t.Logf("  error: %s", e.Message)
+				}
+				return
+			}
+			if tt.check != nil {
+				tt.check(t, errors)
+			}
+		})
+	}
+}
+
+func TestParseDmesgLine(t *testing.T) {
+	tests := []struct {
+		name          string
+		line          string
+		wantTimestamp string
+		wantMessage   string
+	}{
+		{
+			name:          "standard talos dmesg format",
+			line:          "192.168.8.32: kern:     err: [2026-05-25T07:12:06.034Z]: I/O error, dev sdb, sector 4873848",
+			wantTimestamp: "2026-05-25T07:12:06.034Z",
+			wantMessage:   "I/O error, dev sdb, sector 4873848",
+		},
+		{
+			name:        "line without brackets",
+			line:        "some plain log line",
+			wantMessage: "some plain log line",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ts, msg := parseDmesgLine(tt.line)
+			if ts != tt.wantTimestamp {
+				t.Errorf("timestamp = %q, want %q", ts, tt.wantTimestamp)
+			}
+			if msg != tt.wantMessage {
+				t.Errorf("message = %q, want %q", msg, tt.wantMessage)
+			}
+		})
+	}
+}
+
 // Helper function for interface filtering test
 func containsAny(s string, substrs []string) bool {
 	for _, substr := range substrs {
--- a/api/internal/tools/yq_test.go
+++ b/api/internal/tools/yq_test.go
@@ -12,8 +12,7 @@ func TestNewYQ(t *testing.T) {
 		yq := NewYQ()
 		if yq == nil {
 			t.Fatal("NewYQ() returned nil")
-		}
-		if yq.yqPath == "" {
+		} else if yq.yqPath == "" {
 			t.Error("yqPath should not be empty")
 		}
 	})
--- a/api/main.go
+++ b/api/main.go
@@ -2,7 +2,7 @@ package main

 import (
 	"fmt"
-	"log"
+	"log/slog"
 	"net/http"
 	"os"
 	"os/signal"
@@ -16,6 +16,8 @@ import (
 	v1 "github.com/wild-cloud/wild-central/daemon/internal/api/v1"
 	"github.com/wild-cloud/wild-central/daemon/internal/backup"
 	"github.com/wild-cloud/wild-central/daemon/internal/instance"
+	"github.com/wild-cloud/wild-central/daemon/internal/logging"
+	"github.com/wild-cloud/wild-central/daemon/internal/operations"
 )

 var startTime time.Time
@@ -33,6 +35,11 @@ func splitAndTrim(s string, sep string) []string {
 }

 func main() {
+	// Initialize structured logging
+	slog.SetDefault(slog.New(logging.NewConsoleHandler(os.Stderr, &slog.HandlerOptions{
+		Level: slog.LevelInfo,
+	})))
+
 	// Record start time
 	startTime = time.Now()

@@ -45,27 +52,31 @@ func main() {
 	// Get apps directory from environment or use default
 	appsDir := os.Getenv("WILD_DIRECTORY")
 	if appsDir == "" {
-		// Default apps directory
 		appsDir = "/opt/wild-cloud/apps"
-		log.Printf("WILD_DIRECTORY not set, using default apps directory: %s", appsDir)
-	} else {
-		// If WILD_DIRECTORY is set, use it as-is for backward compatibility
-		// (it might point to the old directory structure with apps/ subdirectory)
-		log.Printf("Using WILD_DIRECTORY for apps: %s", appsDir)
 	}
+	slog.Info("configured directories", "dataDir", dataDir, "appsDir", appsDir)

 	// Create API handler with all dependencies
 	api, err := v1.NewAPI(dataDir, appsDir)
 	if err != nil {
-		log.Fatalf("Failed to initialize API: %v", err)
+		slog.Error("failed to initialize API", "error", err)
+		os.Exit(1)
+	}
+
+	// Fail any operations left running from a previous API process
+	instanceMgr := instance.NewManager(dataDir)
+	opsMgr := operations.NewManager(dataDir)
+	if instances, err := instanceMgr.ListInstances(); err == nil {
+		for _, name := range instances {
+			if err := opsMgr.FailOrphaned(name); err != nil {
+				slog.Warn("failed to clean orphaned operations", "instance", name, "error", err)
+			}
+		}
 	}

 	// Start central status SSE broadcaster
 	api.StartCentralStatusBroadcaster(startTime)
-	log.Println("Central status broadcaster started")
-
-	// Start backup scheduler
-	instanceMgr := instance.NewManager(dataDir)
+	slog.Info("central status broadcaster started")
 	scheduler := backup.NewScheduler(dataDir, instanceMgr)
 	scheduler.Start()

@@ -89,9 +100,8 @@ func main() {
 	var allowedOrigins []string

 	if corsOrigins := os.Getenv("WILD_CORS_ORIGINS"); corsOrigins != "" {
-		// Use explicitly configured origins
 		allowedOrigins = splitAndTrim(corsOrigins, ",")
-		log.Printf("CORS configured with explicit origins: %v", allowedOrigins)
+		slog.Info("CORS configured with explicit origins", "origins", allowedOrigins)
 	} else {
 		// Auto-detect origins based on hostname
 		allowedOrigins = []string{
@@ -116,7 +126,7 @@ func main() {
 				fmt.Sprintf("http://%s:5173", hostname),
 				fmt.Sprintf("http://%s:5174", hostname),
 			)
-			log.Printf("Added hostname-based CORS origins for: %s", hostname)
+			slog.Info("added hostname-based CORS origins", "hostname", hostname)
 		}

 		// Add development server ports
@@ -129,7 +139,7 @@ func main() {
 			"http://127.0.0.1:3000",
 		)

-		log.Printf("CORS configured with auto-detected origins: %v", allowedOrigins)
+		slog.Info("CORS configured with auto-detected origins", "count", len(allowedOrigins))
 	}

 	corsHandler := cors.New(cors.Options{
@@ -163,9 +173,7 @@ func main() {
 	port := 5055

 	addr := fmt.Sprintf("%s:%d", host, port)
-	log.Printf("Starting wild-central daemon on %s", addr)
-	log.Printf("Data directory: %s", dataDir)
-	log.Printf("Apps directory: %s", appsDir)
+	slog.Info("daemon started", "addr", addr)

 	// Set up signal handling for graceful shutdown
 	sigChan := make(chan os.Signal, 1)
@@ -174,13 +182,14 @@ func main() {
 	// Start HTTP server in goroutine
 	go func() {
 		if err := http.ListenAndServe(addr, handler); err != nil {
-			log.Fatal("Server failed to start:", err)
+			slog.Error("server failed to start", "error", err)
+			os.Exit(1)
 		}
 	}()

 	// Wait for shutdown signal
 	<-sigChan
-	log.Println("Shutting down gracefully...")
+	slog.Info("shutting down")
 	scheduler.Stop()
-	log.Println("Shutdown complete")
+	slog.Info("shutdown complete")
 }
--- a/api/test/e2e/lib.sh
+++ b/api/test/e2e/lib.sh
@@ -160,6 +160,19 @@ api_put() {
  rm -f "$tmpfile"
 }

+# Makes a PATCH request. Sets HTTP_CODE and RESP globals.
+api_patch() {
+  local path="$1"
+  local body="$2"
+  local tmpfile
+  tmpfile=$(mktemp)
+  HTTP_CODE=$(curl -s -w '%{http_code}' -o "$tmpfile" \
+    -X PATCH -H "Content-Type: application/json" \
+    -d "$body" "${API_URL}${path}")
+  RESP=$(cat "$tmpfile")
+  rm -f "$tmpfile"
+}
+
 # Makes a DELETE request. Sets HTTP_CODE and RESP globals.
 api_delete() {
  local path="$1"
--- a/api/test/e2e/tests/05-config-and-drift.sh
+++ b/api/test/e2e/tests/05-config-and-drift.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+# Test: Config changes and drift detection
+# Verifies: PATCH config, compilation drift detected, compile clears drift, deploy succeeds
+# Idempotent: restores original config at end
+# Note: Uses db.name (not storage) because PVC storage can only expand, never shrink
+
+APP_CONFIG_PATH="/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/config"
+APP_ENHANCED_PATH="/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/enhanced"
+DB_INIT_FILE="${DATA_DIR}/instances/${INSTANCE}/apps/${APP_NAME}/db-init-job.yaml"
+
+# --- Read current config and capture original db.name ---
+
+test_start "Config: Read current config"
+api_get "$APP_CONFIG_PATH"
+assert_http "200" "GET app config should return 200"
+
+ORIGINAL_DB_NAME=$(echo "$RESP" | jq -r '.db.name // empty' 2>/dev/null)
+if [[ -z "$ORIGINAL_DB_NAME" ]]; then
+  ORIGINAL_DB_NAME="e2e_test_app"
+fi
+
+# --- PATCH config: change db.name ---
+
+test_start "Config: PATCH db.name to e2e_drift_test"
+api_patch "$APP_CONFIG_PATH" '{"config":{"db":{"name":"e2e_drift_test"}}}'
+assert_http "200" "PATCH config should return 200"
+
+test_start "Config: Verify config changed"
+api_get "$APP_CONFIG_PATH"
+NEW_DB_NAME=$(echo "$RESP" | jq -r '.db.name // empty' 2>/dev/null)
+assert_eq "$NEW_DB_NAME" "e2e_drift_test" "db.name should be e2e_drift_test after PATCH"
+
+# --- Check drift: config changed but not recompiled ---
+
+test_start "Config: Drift detected after config change"
+api_get "$APP_ENHANCED_PATH"
+COMP_DRIFTED=$(echo "$RESP" | jq -r '.drift.compilation.drifted // false' 2>/dev/null)
+assert_eq "$COMP_DRIFTED" "true" "Compilation drift should be detected"
+
+# --- Compile to clear compilation drift ---
+
+test_start "Config: Compile clears compilation drift"
+api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
+assert_http "200" "Compile should return 200"
+
+test_start "Config: Verify no compilation drift after compile"
+api_get "$APP_ENHANCED_PATH"
+COMP_DRIFTED=$(echo "$RESP" | jq -r '.drift.compilation.drifted // false' 2>/dev/null)
+assert_eq "$COMP_DRIFTED" "false" "Compilation drift should be cleared after compile"
+
+# --- Verify compiled db-init-job.yaml has new db name ---
+
+test_start "Config: Compiled db-init-job.yaml has e2e_drift_test"
+if grep -q "e2e_drift_test" "$DB_INIT_FILE" 2>/dev/null; then
+  test_pass
+else
+  test_fail "db-init-job.yaml should contain e2e_drift_test after compile"
+fi
+
+# --- Cleanup: restore original db.name, recompile, deploy ---
+
+echo "  Restoring original db.name (${ORIGINAL_DB_NAME})..."
+api_patch "$APP_CONFIG_PATH" "{\"config\":{\"db\":{\"name\":\"${ORIGINAL_DB_NAME}\"}}}"
+api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
+
+test_start "Config: Deploy with restored config"
+if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Deploy failed when restoring original config"
+fi
+
+test_start "Config: Pods ready after restore"
+if wait_for_pods "$APP_NAME" 120; then
+  test_pass
+else
+  test_fail "Pods not ready after config restore deploy"
+fi
--- a/api/test/e2e/tests/06-fetch-and-update.sh
+++ b/api/test/e2e/tests/06-fetch-and-update.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Test: Fetch from Wild Directory and redeploy
+# Verifies: fetch re-copies package from source, deploy succeeds after fetch
+# Idempotent: leaves app in same state as before
+
+MANIFEST_PATH="${DATA_DIR}/instances/${INSTANCE}/apps/${APP_NAME}/manifest.yaml"
+PACKAGE_DIR="${DATA_DIR}/instances/${INSTANCE}/apps/${APP_NAME}/.package"
+
+# --- Record current version ---
+
+test_start "Fetch: Record current version"
+CURRENT_VERSION=$(grep '^version:' "$MANIFEST_PATH" 2>/dev/null | head -1 | awk '{print $2}')
+assert_not_empty "$CURRENT_VERSION" "Should have a current version in manifest"
+
+# --- Fetch from Wild Directory ---
+
+test_start "Fetch: Re-fetch from source"
+api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/fetch"
+assert_http "200" "Fetch should return 200"
+
+# --- Verify .package directory exists (source backup) ---
+
+test_start "Fetch: .package directory exists after fetch"
+if [[ -d "$PACKAGE_DIR" ]]; then
+  test_pass
+else
+  test_fail ".package directory should exist after fetch"
+fi
+
+# --- Verify manifest version still present ---
+
+test_start "Fetch: Version preserved after fetch"
+AFTER_VERSION=$(grep '^version:' "$MANIFEST_PATH" 2>/dev/null | head -1 | awk '{print $2}')
+assert_not_empty "$AFTER_VERSION" "Version should still be present after fetch"
+
+# --- Check source drift is cleared ---
+
+test_start "Fetch: No source drift after fetch"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/enhanced"
+SRC_DRIFTED=$(echo "$RESP" | jq -r '.drift.source.drifted // false' 2>/dev/null)
+assert_eq "$SRC_DRIFTED" "false" "Source drift should be false after fresh fetch"
+
+# --- Deploy after fetch ---
+
+test_start "Fetch: Deploy after fetch"
+if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Deploy failed after fetch"
+fi
+
+test_start "Fetch: Pods ready after deploy"
+if wait_for_pods "$APP_NAME" 120; then
+  test_pass
+else
+  test_fail "Pods not ready after fetch+deploy"
+fi
+
+# --- Verify status OK ---
+
+test_start "Fetch: Status OK after fetch+deploy"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/status"
+assert_http "200" "Status should return 200 after fetch+deploy"
--- a/api/test/e2e/tests/07-app-dependencies.sh
+++ b/api/test/e2e/tests/07-app-dependencies.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+# Test: App dependencies — add with explicit requiredAppMappings, verify resolution
+# Verifies: dependency mappings resolve correctly, secrets from deps are present
+# Idempotent: deletes and re-adds app, leaves it deployed
+
+MANIFEST_PATH="${DATA_DIR}/instances/${INSTANCE}/apps/${APP_NAME}/manifest.yaml"
+
+# --- Delete existing app to test fresh add with mappings ---
+
+echo "  Deleting ${APP_NAME} to test dependency add..."
+start_async_delete_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}" "$DELETE_TIMEOUT" || true
+
+# Wait for namespace to fully terminate
+WAIT=0
+while (( WAIT < 60 )); do
+  NS_STATUS=$($KC get namespace "$APP_NAME" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
+  if [[ "$NS_STATUS" == "NotFound" ]]; then
+    break
+  fi
+  sleep 5
+  WAIT=$((WAIT + 5))
+done
+
+# --- Add with explicit dependency mapping ---
+
+test_start "Deps: Add app with requiredAppMappings"
+api_post "/api/v1/instances/${INSTANCE}/apps" \
+  "{\"name\":\"${APP_NAME}\",\"requiredAppMappings\":{\"postgres\":\"postgres\"}}"
+assert_http_one_of "200 201" "Add with mappings should succeed"
+
+# --- Verify manifest has installedAs ---
+
+test_start "Deps: Manifest has installedAs for postgres"
+if grep -q "installedAs: postgres" "$MANIFEST_PATH" 2>/dev/null; then
+  test_pass
+else
+  test_fail "manifest.yaml should have installedAs: postgres"
+fi
+
+# --- Verify config has db.host referencing postgres ---
+
+test_start "Deps: Config has db.host"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/config"
+DB_HOST=$(echo "$RESP" | jq -r '.db.host // empty' 2>/dev/null)
+assert_not_empty "$DB_HOST" "db.host should be set from postgres dependency"
+
+# --- Deploy ---
+
+test_start "Deps: Deploy app with dependencies"
+if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Deploy failed with dependency mappings"
+fi
+
+test_start "Deps: Pods ready"
+if wait_for_pods "$APP_NAME" 120; then
+  test_pass
+else
+  test_fail "Pods not ready after dep deploy"
+fi
+
+# --- Verify K8s secret has postgres.password from dependency ---
+
+test_start "Deps: K8s secret has postgres.password key"
+SECRET_KEYS=$($KC get secret "${APP_NAME}-secrets" -n "$APP_NAME" -o jsonpath='{.data}' 2>/dev/null)
+if echo "$SECRET_KEYS" | grep -q "postgres.password" 2>/dev/null; then
+  test_pass
+else
+  test_fail "K8s secret should contain postgres.password from dependency"
+fi
--- a/api/test/e2e/tests/08-secrets-rotation.sh
+++ b/api/test/e2e/tests/08-secrets-rotation.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+# Test: Secrets rotation — change a secret, redeploy, verify in cluster
+# Verifies: PUT secrets, redeploy applies new secret to K8s
+# Idempotent: restores original secret at end
+
+SECRETS_PATH="/api/v1/instances/${INSTANCE}/secrets"
+
+# --- Read current secrets ---
+
+test_start "Secrets: Read raw secrets"
+api_get "${SECRETS_PATH}?raw=true"
+assert_http "200" "GET raw secrets should return 200"
+
+ORIGINAL_SECRETS="$RESP"
+ORIGINAL_PASSWORD=$(echo "$RESP" | jq -r '.apps."e2e-test-app".dbPassword // empty' 2>/dev/null)
+
+test_start "Secrets: Has dbPassword for e2e-test-app"
+assert_not_empty "$ORIGINAL_PASSWORD" "Should have a dbPassword for e2e-test-app"
+
+# --- Generate and set new password ---
+
+NEW_PASSWORD="e2e-rotated-$(date +%s)"
+
+test_start "Secrets: Rotate dbPassword"
+# Build modified secrets document with jq
+MODIFIED_SECRETS=$(echo "$ORIGINAL_SECRETS" | jq --arg pw "$NEW_PASSWORD" \
+  '.apps."e2e-test-app".dbPassword = $pw')
+api_put "${SECRETS_PATH}" "$MODIFIED_SECRETS"
+assert_http "200" "PUT secrets should return 200"
+
+# --- Verify secret stored ---
+
+test_start "Secrets: Verify new password stored"
+api_get "${SECRETS_PATH}?raw=true"
+STORED_PASSWORD=$(echo "$RESP" | jq -r '.apps."e2e-test-app".dbPassword // empty' 2>/dev/null)
+assert_eq "$STORED_PASSWORD" "$NEW_PASSWORD" "Stored password should match rotated value"
+
+# --- Compile and deploy to push new secret to cluster ---
+
+api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
+
+test_start "Secrets: Deploy after rotation"
+if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Deploy failed after secret rotation"
+fi
+
+wait_for_pods "$APP_NAME" 120 || true
+
+# --- Verify K8s secret updated ---
+
+test_start "Secrets: K8s secret has rotated password"
+K8S_PASSWORD=$($KC get secret "${APP_NAME}-secrets" -n "$APP_NAME" \
+  -o jsonpath='{.data.dbPassword}' 2>/dev/null | base64 -d 2>/dev/null)
+assert_eq "$K8S_PASSWORD" "$NEW_PASSWORD" "K8s secret should have the rotated password"
+
+# --- Cleanup: restore original secrets ---
+
+echo "  Restoring original secrets..."
+api_put "${SECRETS_PATH}" "$ORIGINAL_SECRETS"
+api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
+
+test_start "Secrets: Deploy with restored secrets"
+if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Deploy failed when restoring original secrets"
+fi
+
+wait_for_pods "$APP_NAME" 120 || true
--- a/api/test/e2e/tests/09-dep-config-propagation.sh
+++ b/api/test/e2e/tests/09-dep-config-propagation.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+# Test: Dependency config propagation — change db.name, recompile, verify in manifests
+# Verifies: config change propagates to compiled templates after compile
+# Idempotent: restores original config at end
+
+APP_CONFIG_PATH="/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/config"
+
+# --- Read current db.name ---
+
+test_start "DepConfig: Read current db.name"
+api_get "$APP_CONFIG_PATH"
+assert_http "200" "GET app config should return 200"
+
+ORIGINAL_DB_NAME=$(echo "$RESP" | jq -r '.db.name // empty' 2>/dev/null)
+if [[ -z "$ORIGINAL_DB_NAME" ]]; then
+  ORIGINAL_DB_NAME="e2e_test_app"
+fi
+
+# --- PATCH db.name to a new value ---
+
+test_start "DepConfig: PATCH db.name"
+api_patch "$APP_CONFIG_PATH" '{"config":{"db":{"name":"e2e_test_app_v2"}}}'
+assert_http "200" "PATCH db.name should return 200"
+
+test_start "DepConfig: Verify db.name changed"
+api_get "$APP_CONFIG_PATH"
+NEW_DB_NAME=$(echo "$RESP" | jq -r '.db.name // empty' 2>/dev/null)
+assert_eq "$NEW_DB_NAME" "e2e_test_app_v2" "db.name should be e2e_test_app_v2"
+
+# --- Compile ---
+
+test_start "DepConfig: Compile after config change"
+api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
+assert_http "200" "Compile should return 200"
+
+# --- Verify compiled db-init-job.yaml has new db name ---
+
+test_start "DepConfig: Compiled db-init-job.yaml has e2e_test_app_v2"
+DB_INIT_FILE="${DATA_DIR}/instances/${INSTANCE}/apps/${APP_NAME}/db-init-job.yaml"
+if grep -q "e2e_test_app_v2" "$DB_INIT_FILE" 2>/dev/null; then
+  test_pass
+else
+  test_fail "db-init-job.yaml should contain e2e_test_app_v2 after compile"
+fi
+
+# --- Check no compilation drift (we just compiled) ---
+
+test_start "DepConfig: No compilation drift after compile"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/enhanced"
+COMP_DRIFTED=$(echo "$RESP" | jq -r '.drift.compilation.drifted // false' 2>/dev/null)
+assert_eq "$COMP_DRIFTED" "false" "No compilation drift expected right after compile"
+
+# --- Cleanup: restore original db.name, recompile, deploy ---
+
+echo "  Restoring original db.name (${ORIGINAL_DB_NAME})..."
+api_patch "$APP_CONFIG_PATH" "{\"config\":{\"db\":{\"name\":\"${ORIGINAL_DB_NAME}\"}}}"
+api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
+
+test_start "DepConfig: Deploy with restored config"
+if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Deploy failed when restoring original config"
+fi
+
+wait_for_pods "$APP_NAME" 120 || true
--- a/api/test/e2e/tests/10-delete-and-readd.sh
+++ b/api/test/e2e/tests/10-delete-and-readd.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# Test: Delete and re-add round trip
+# Verifies: full lifecycle — delete, verify gone, re-add, deploy, verify working
+# Idempotent: leaves app deployed for subsequent tests
+
+# --- Verify app is currently deployed ---
+
+test_start "DeleteReadd: App is deployed"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/status"
+assert_http "200" "App should be deployed before delete test"
+
+# --- Delete ---
+
+test_start "DeleteReadd: Delete app"
+if start_async_delete_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}" "$DELETE_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Delete failed"
+fi
+
+# --- Wait for namespace gone ---
+
+echo "  Waiting for namespace cleanup..."
+WAIT=0
+while (( WAIT < 60 )); do
+  NS_STATUS=$($KC get namespace "$APP_NAME" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
+  if [[ "$NS_STATUS" == "NotFound" ]]; then
+    break
+  fi
+  sleep 5
+  WAIT=$((WAIT + 5))
+done
+
+# --- Verify app is gone ---
+
+test_start "DeleteReadd: App gone after delete"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/status"
+if [[ "$HTTP_CODE" == "404" || "$HTTP_CODE" == "500" ]]; then
+  test_pass
+elif [[ "$HTTP_CODE" == "200" ]]; then
+  APP_STATUS=$(echo "$RESP" | jq -r '.status // empty' 2>/dev/null)
+  if [[ "$APP_STATUS" == "not-added" || "$APP_STATUS" == "not-deployed" ]]; then
+    test_pass
+  else
+    test_fail "App still appears as deployed after delete (status: ${APP_STATUS})"
+  fi
+else
+  test_fail "Unexpected HTTP ${HTTP_CODE}"
+fi
+
+# --- Re-add ---
+
+test_start "DeleteReadd: Re-add app"
+api_post "/api/v1/instances/${INSTANCE}/apps" "{\"name\":\"${APP_NAME}\"}"
+assert_http_one_of "200 201" "Re-add should succeed"
+
+# --- Verify config written ---
+
+test_start "DeleteReadd: Config exists after re-add"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/config"
+APP_NAMESPACE=$(echo "$RESP" | jq -r '.namespace // empty' 2>/dev/null)
+assert_eq "$APP_NAMESPACE" "e2e-test-app" "Config namespace should be set after re-add"
+
+# --- Deploy ---
+
+test_start "DeleteReadd: Deploy after re-add"
+if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
+  test_pass
+else
+  test_fail "Deploy failed after re-add"
+fi
+
+test_start "DeleteReadd: Pods ready after re-add deploy"
+if wait_for_pods "$APP_NAME" 120; then
+  test_pass
+else
+  test_fail "Pods not ready after re-add deploy"
+fi
+
+# --- Verify status OK ---
+
+test_start "DeleteReadd: Status OK after re-add"
+api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/status"
+assert_http "200" "Status should return 200 after re-add"
--- a/api/test/e2e/tests/11-edge-cases.sh
+++ b/api/test/e2e/tests/11-edge-cases.sh
--- a/api/test/e2e/tests/12-cli.sh
+++ b/api/test/e2e/tests/12-cli.sh
--- a/api/test/e2e/tests/13-cleanup.sh
+++ b/api/test/e2e/tests/13-cleanup.sh
--- a/api/test/e2e/tests/14-upgrade.sh
+++ b/api/test/e2e/tests/14-upgrade.sh
--- a/cli/cmd/cluster.go
+++ b/cli/cmd/cluster.go
@@ -3,6 +3,7 @@ package cmd
 import (
 	"fmt"
 	"os"
+	"path/filepath"

 	"github.com/spf13/cobra"

@@ -143,14 +144,13 @@ Examples:
 		// If --persist flag is set, save to instance directory
 		if persist {
 			dataDir := config.GetWildCLIDataDir()
-			instanceDir := fmt.Sprintf("%s/instances/%s", dataDir, inst)
+			kubeconfigPath := config.GetKubeconfigPath(dataDir, inst)

 			// Create instance directory if it doesn't exist
-			if err := os.MkdirAll(instanceDir, 0755); err != nil {
+			if err := os.MkdirAll(filepath.Dir(kubeconfigPath), 0755); err != nil {
 				return fmt.Errorf("failed to create instance directory: %w", err)
 			}

-			kubeconfigPath := fmt.Sprintf("%s/kubeconfig", instanceDir)
 			if err := os.WriteFile(kubeconfigPath, []byte(kubeconfigContent), 0600); err != nil {
 				return fmt.Errorf("failed to write kubeconfig: %w", err)
 			}
@@ -208,14 +208,13 @@ var clusterTalosconfigCmd = &cobra.Command{
 		// If --persist flag is set, save to instance directory
 		if persist {
 			dataDir := config.GetWildCLIDataDir()
-			instanceDir := fmt.Sprintf("%s/instances/%s", dataDir, inst)
+			talosconfigPath := config.GetTalosconfigPath(dataDir, inst)

-			// Create instance directory if it doesn't exist
-			if err := os.MkdirAll(instanceDir, 0755); err != nil {
-				return fmt.Errorf("failed to create instance directory: %w", err)
+			// Create talos/generated directory if it doesn't exist
+			if err := os.MkdirAll(filepath.Dir(talosconfigPath), 0755); err != nil {
+				return fmt.Errorf("failed to create talosconfig directory: %w", err)
 			}

-			talosconfigPath := fmt.Sprintf("%s/talosconfig", instanceDir)
 			if err := os.WriteFile(talosconfigPath, []byte(talosconfigContent), 0600); err != nil {
 				return fmt.Errorf("failed to write talosconfig: %w", err)
 			}
--- a/cli/cmd/instance.go
+++ b/cli/cmd/instance.go
@@ -3,7 +3,6 @@ package cmd
 import (
 	"fmt"
 	"os"
-	"path/filepath"

 	"github.com/spf13/cobra"

@@ -210,13 +209,12 @@ The instance can still be overridden with the --instance flag.`,

 		// Check for config files and provide hint
 		dataDir := config.GetWildCLIDataDir()
-		instanceDir := filepath.Join(dataDir, "instances", instanceToSet)

 		var hasConfigs bool
-		if _, err := os.Stat(filepath.Join(instanceDir, "talosconfig")); err == nil {
+		if _, err := os.Stat(config.GetTalosconfigPath(dataDir, instanceToSet)); err == nil {
 			hasConfigs = true
 		}
-		if _, err := os.Stat(filepath.Join(instanceDir, "kubeconfig")); err == nil {
+		if _, err := os.Stat(config.GetKubeconfigPath(dataDir, instanceToSet)); err == nil {
 			hasConfigs = true
 		}

@@ -247,16 +245,15 @@ This will set environment variables for the current instance's talosconfig and k

 		// Check for talosconfig and kubeconfig files
 		dataDir := config.GetWildCLIDataDir()
-		instanceDir := filepath.Join(dataDir, "instances", inst)

 		// Check for talosconfig
-		talosconfigPath := filepath.Join(instanceDir, "talosconfig")
+		talosconfigPath := config.GetTalosconfigPath(dataDir, inst)
 		if _, err := os.Stat(talosconfigPath); err == nil {
 			fmt.Printf("export TALOSCONFIG=%s\n", talosconfigPath)
 		}

 		// Check for kubeconfig
-		kubeconfigPath := filepath.Join(instanceDir, "kubeconfig")
+		kubeconfigPath := config.GetKubeconfigPath(dataDir, inst)
 		if _, err := os.Stat(kubeconfigPath); err == nil {
 			fmt.Printf("export KUBECONFIG=%s\n", kubeconfigPath)
 		}
--- a/cli/cmd/node.go
+++ b/cli/cmd/node.go
@@ -544,6 +544,110 @@ var nodeDeleteCmd = &cobra.Command{
 	},
 }

+var nodeHealthCmd = &cobra.Command{
+	Use:   "health <hostname>",
+	Short: "Check node health",
+	Long: `Check the health of a node by querying Talos service statuses
+and scanning kernel messages for hardware errors.
+
+Examples:
+  wild node health control-1
+  wild node health worker-2`,
+	Args: cobra.ExactArgs(1),
+	RunE: func(cmd *cobra.Command, args []string) error {
+		inst, err := getInstanceName()
+		if err != nil {
+			return err
+		}
+
+		resp, err := apiClient.Get(fmt.Sprintf("/api/v1/instances/%s/nodes/%s/health", inst, args[0]))
+		if err != nil {
+			return err
+		}
+
+		// Print human-readable output
+		healthy, _ := resp.Data["healthy"].(bool)
+		if healthy {
+			fmt.Printf("Node: %s — HEALTHY\n", args[0])
+		} else {
+			fmt.Printf("Node: %s — UNHEALTHY\n", args[0])
+		}
+
+		// Print services
+		if services, ok := resp.Data["services"].([]interface{}); ok && len(services) > 0 {
+			fmt.Println("\nServices:")
+			for _, s := range services {
+				svc, ok := s.(map[string]interface{})
+				if !ok {
+					continue
+				}
+				id, _ := svc["id"].(string)
+				state, _ := svc["state"].(string)
+				svcHealthy, _ := svc["healthy"].(bool)
+				msg, _ := svc["healthMessage"].(string)
+
+				status := "OK"
+				if !svcHealthy && msg != "" {
+					status = "FAIL"
+				} else if !svcHealthy {
+					status = "?"
+				}
+				if msg != "" {
+					fmt.Printf("  %-14s %-10s %-6s %s\n", id, state, status, msg)
+				} else {
+					fmt.Printf("  %-14s %-10s %s\n", id, state, status)
+				}
+			}
+		}
+
+		// Print dmesg errors
+		if errors, ok := resp.Data["dmesgErrors"].([]interface{}); ok && len(errors) > 0 {
+			fmt.Printf("\nDmesg Errors (%d):\n", len(errors))
+			for _, e := range errors {
+				entry, ok := e.(map[string]interface{})
+				if !ok {
+					continue
+				}
+				ts, _ := entry["timestamp"].(string)
+				msg, _ := entry["message"].(string)
+				if ts != "" {
+					fmt.Printf("  [%s] %s\n", ts, msg)
+				} else {
+					fmt.Printf("  %s\n", msg)
+				}
+			}
+		}
+
+		return nil
+	},
+}
+
+var nodeRebootCmd = &cobra.Command{
+	Use:   "reboot <hostname>",
+	Short: "Reboot a node",
+	Long: `Reboot a node without wiping state. The node will restart and
+rejoin the cluster automatically. Running workloads on this node will be interrupted.
+
+Examples:
+  wild node reboot control-1
+  wild node reboot worker-2`,
+	Args: cobra.ExactArgs(1),
+	RunE: func(cmd *cobra.Command, args []string) error {
+		inst, err := getInstanceName()
+		if err != nil {
+			return err
+		}
+
+		_, err = apiClient.Post(fmt.Sprintf("/api/v1/instances/%s/nodes/%s/reboot", inst, args[0]), nil)
+		if err != nil {
+			return err
+		}
+
+		fmt.Printf("Reboot initiated for %s\n", args[0])
+		return nil
+	},
+}
+
 var nodeUpgradeCmd = &cobra.Command{
 	Use:   "upgrade <hostname> <version>",
 	Short: "Upgrade a node to a new Talos version",
@@ -615,6 +719,35 @@ Examples:
 	},
 }

+var nodeResetCmd = &cobra.Command{
+	Use:   "reset <hostname>",
+	Short: "Reset a node to maintenance mode",
+	Long: `Reset a node to maintenance mode. This wipes the node's state and reboots
+it into maintenance mode so it can be reconfigured and rejoined to the cluster.
+
+For control plane nodes, the etcd member is removed from the cluster first
+via a healthy peer. The node remains in the configuration for reconfiguration.
+
+Examples:
+  wild node reset control-2
+  wild node reset worker-1`,
+	Args: cobra.ExactArgs(1),
+	RunE: func(cmd *cobra.Command, args []string) error {
+		inst, err := getInstanceName()
+		if err != nil {
+			return err
+		}
+
+		_, err = apiClient.Post(fmt.Sprintf("/api/v1/instances/%s/nodes/%s/reset", inst, args[0]), nil)
+		if err != nil {
+			return err
+		}
+
+		fmt.Printf("Reset initiated for %s — node will reboot to maintenance mode\n", args[0])
+		return nil
+	},
+}
+
 func init() {
 	nodeCmd.AddCommand(nodeDiscoverCmd)
 	nodeCmd.AddCommand(nodeCancelDiscoveryCmd)
@@ -626,6 +759,9 @@ func init() {
 	nodeCmd.AddCommand(nodeUpdateCmd)
 	nodeCmd.AddCommand(nodeFetchTemplatesCmd)
 	nodeCmd.AddCommand(nodeDeleteCmd)
+	nodeCmd.AddCommand(nodeHealthCmd)
+	nodeCmd.AddCommand(nodeRebootCmd)
+	nodeCmd.AddCommand(nodeResetCmd)
 	nodeCmd.AddCommand(nodeUpgradeCmd)
 	nodeCmd.AddCommand(nodeRollbackCmd)

--- a/cli/internal/config/instance.go
+++ b/cli/internal/config/instance.go
@@ -77,6 +77,21 @@ func SetCurrentInstance(instance string) error {
 	return nil
 }

+// GetInstanceDir returns the path to an instance's data directory
+func GetInstanceDir(dataDir, instanceName string) string {
+	return filepath.Join(dataDir, "instances", instanceName)
+}
+
+// GetTalosconfigPath returns the path to an instance's talosconfig
+func GetTalosconfigPath(dataDir, instanceName string) string {
+	return filepath.Join(dataDir, "instances", instanceName, "talos", "generated", "talosconfig")
+}
+
+// GetKubeconfigPath returns the path to an instance's kubeconfig
+func GetKubeconfigPath(dataDir, instanceName string) string {
+	return filepath.Join(dataDir, "instances", instanceName, "kubeconfig")
+}
+
 // InstanceLister is an interface for listing instances (allows for testing and dependency injection)
 type InstanceLister interface {
 	ListInstances() ([]string, error)
--- a/docs/backup-implementation.md
+++ b/docs/backup-implementation.md
@@ -1,444 +0,0 @@
-# Blue-Green Backup-Restore: Implementation Plan
-
-This document describes all changes needed to move from the current backup/restore
-system to the new blue-green design with RecoveryPlan coordination.
-
-See `api/internal/backup/README.md` for the algorithm and design.
-
-## Current State Summary
-
-**What exists today:**
- Strategy pattern with four strategies: config, postgres, mysql, longhorn-native
- Strategy interface: `Backup()`, `Restore()`, `Verify()` — no Switch or Cleanup
- Blue-green restore via metadata injection (`component.Metadata["blueGreen"] = true`)
- Restore deploys to `{app}-restore` namespace via string manipulation in `deployToRestoreNamespace()`
- Database names get `_restore` suffix via regex replacement
- No RecoveryPlan — state is flat `BackupInfo` + `ComponentBackup` records
- No switch or cleanup phases — restore is fire-and-forget
- No `activeDeployment` tracking — app status determined by checking if namespace exists
- Namespace is always `== appName` (hardcoded 1:1 relationship)
-
-**What needs to change:**
- Strategy interface gains `Switch()` and `Cleanup()` methods
- New `RecoveryPlan` type replaces metadata injection for phase coordination
- Color-based naming (`-blue`/`-green`) replaces `_restore`/`_old` suffixes
- `activeDeployment` field added to config.yaml per app
- New API endpoints for switch, cleanup, and plan inspection
- New CLI commands: `wild app switch`, `wild app cleanup`, `wild app recovery-plan`
- Webapp gains recovery plan UI with phase progression
-
-## Change Inventory
-
-### 1. Types — `api/internal/backup/types/types.go`
-
-**Add RecoveryPlan type:**
-```go
-type RecoveryPlan struct {
-    App          string               `yaml:"app"          json:"app"`
-    Instance     string               `yaml:"instance"     json:"instance"`
-    Status       string               `yaml:"status"       json:"status"`
-    Error        string               `yaml:"error"        json:"error,omitempty"`
-    Source       RecoverySource       `yaml:"source"       json:"source"`
-    StandbyColor string               `yaml:"standbyColor" json:"standbyColor"`
-    Standby      RecoveryStandby      `yaml:"standby"      json:"standby,omitempty"`
-    Strategies   []StrategyEntry      `yaml:"strategies"   json:"strategies"`
-    Phases       map[string]PhaseTime `yaml:"phases"       json:"phases"`
-}
-
-type RecoverySource struct {
-    ActiveColor string `yaml:"activeColor" json:"activeColor"`
-    Namespace   string `yaml:"namespace"   json:"namespace"`
-    AppDir      string `yaml:"appDir"      json:"appDir"`
-    ConfigPath  string `yaml:"configPath"  json:"configPath"`
-    SecretsPath string `yaml:"secretsPath" json:"secretsPath"`
-}
-
-type RecoveryStandby struct {
-    Namespace string `yaml:"namespace" json:"namespace"`
-    AppDir    string `yaml:"appDir"    json:"appDir"`
-}
-
-type StrategyEntry struct {
-    Name    string                 `yaml:"name"              json:"name"`
-    Status  string                 `yaml:"status"            json:"status"`
-    Params  map[string]interface{} `yaml:"params,omitempty"  json:"params,omitempty"`
-    Backup  map[string]interface{} `yaml:"backup,omitempty"  json:"backup,omitempty"`
-    Restore map[string]interface{} `yaml:"restore,omitempty" json:"restore,omitempty"`
-    Switch  map[string]interface{} `yaml:"switch,omitempty"  json:"switch,omitempty"`
-}
-
-type PhaseTime struct {
-    StartedAt   *time.Time `yaml:"startedAt,omitempty"   json:"startedAt,omitempty"`
-    CompletedAt *time.Time `yaml:"completedAt,omitempty" json:"completedAt,omitempty"`
-}
-```
-
-**Update Strategy interface:**
-```go
-type Strategy interface {
-    Name() string
-    Backup(plan *RecoveryPlan, manifest *apps.AppManifest, dest BackupDestination) error
-    Restore(plan *RecoveryPlan, dest BackupDestination) error
-    Switch(plan *RecoveryPlan) error
-    Cleanup(plan *RecoveryPlan) error
-    Verify(plan *RecoveryPlan, dest BackupDestination) error
-}
-```
-
-The old `Backup()` returned `*ComponentBackup` — now strategies write directly to
-their `StrategyEntry` in the plan. The old `Restore()` accepted `*ComponentBackup` —
-now strategies read their entry from the plan.
-
-**Remove or deprecate:**
- `RestoreOptions.BlueGreen` field — blue-green is now always the approach
- Metadata injection of `blueGreen` flag — replaced by plan-driven coordination
-
-### 2. Backup Manager — `api/internal/backup/backup.go`
-
-**Replace `BackupApp()` method:**
-
-Current: detects strategies, calls `strategy.Backup()`, collects `ComponentBackup` list.
-New: creates `RecoveryPlan`, reads `activeDeployment` from config.yaml, calls
-`strategy.Backup(plan, ...)` where each strategy writes to its own plan entry.
-
-Key changes:
- Read `config.yaml` → `apps.{app}.activeDeployment` (default: `"blue"`)
- Compute `standbyColor` = opposite of active
- Create plan with `source.activeColor`, `source.namespace`, `source.appDir`
- Pass plan to each strategy instead of collecting ComponentBackup returns
- Persist plan as YAML to destination and locally
-
-**Replace `RestoreApp()` method:**
-
-Current: loads BackupInfo, injects `blueGreen` metadata, calls `strategy.Restore()`,
-then calls `deployToRestoreNamespace()`.
-New: loads RecoveryPlan, computes standby targets, calls `strategy.Restore(plan, ...)`,
-then deploys to standby namespace using plan values.
-
-Key changes:
- Compute `plan.standby.namespace = {app}-{standbyColor}`
- Compute `plan.standby.appDir = apps/{app}-{standbyColor}`
- Each strategy computes its standby targets (colored DB names, volume names)
- Deploy step reads all rewrite values from the plan, not ad-hoc
-
-**Add `SwitchApp()` method (new):**
- Load plan, validate `plan.status == restored`
- Verify standby namespace has healthy pods
- Call `strategy.Switch(plan)` for each strategy
- Update `config.yaml`: set `apps.{app}.activeDeployment = {standbyColor}`
- Update ingress to route to standby namespace
- Set `plan.status = switched`
-
-**Add `CleanupApp()` method (new):**
- Load plan, validate `plan.status == switched`
- Call `strategy.Cleanup(plan)` for each strategy
- Delete previous active namespace
- Remove previous active app directory
- Set `plan.status = cleaned_up`
-
-**Remove `deployToRestoreNamespace()`:**
-Replaced by plan-driven deployment in the restore phase. The new version reads
-namespace, DB names, and volume names from the plan instead of doing regex replacement.
-
-**Remove `updateDatabaseReferences()`:**
-The regex-based string replacement (lines 750-832) is replaced by plan-driven
-manifest rewriting. During restore, the standby app directory gets manifests
-compiled with standby-colored values from the plan.
-
-### 3. Strategy Implementations
-
-Each strategy gains `Switch()` and `Cleanup()` methods and changes its `Backup()`
-and `Restore()` signatures to work with the RecoveryPlan.
-
-#### `api/internal/backup/strategies/config.go`
-
-**Backup:** Same logic (tar app files + config/secrets sections), but writes
-location and size to `plan.strategies["config"].backup` instead of returning
-a ComponentBackup.
-
-**Restore:** Same logic (extract, merge config/secrets), but reads from plan
-entry and writes to `plan.standby.appDir`.
-
-**Switch:** No-op. Config was already merged during restore.
-
-**Cleanup:** No-op. Config is shared, not per-color.
-
-#### `api/internal/backup/strategies/postgres.go`
-
-**Backup:** Same dump logic, but discovers and records params in plan:
-```yaml
-params:
-  podNamespace: postgres
-  podLabel: app=postgres
-  dbName: gitea
-  dbUser: gitea
-```
-
-**Restore:** Replace `_restore` suffix with `_{standbyColor}`. Read `dbName` from
-`plan.strategies["postgres"].params.dbName`, create `{dbName}_{standbyColor}`.
-Remove the `isBlueGreen` conditional — it's always blue-green now.
-
-**Switch:** Record `previousDbName = {dbName}_{activeColor}` in switch section.
-No rename needed — standby app already uses its own colored DB.
-
-**Cleanup:** Drop `{dbName}_{previousColor}` from the switch section.
-
-#### `api/internal/backup/strategies/mysql.go`
-
-Same pattern as postgres. Currently missing blue-green restore logic — this is
-a gap that gets fixed as part of this work.
-
-#### `api/internal/backup/strategies/longhorn_native.go`
-
-**Backup:** Same snapshot/backup logic, records volume params and backup URLs in plan.
-
-**Restore:** Replace `{pvcName}-restore-{timestamp}` naming with `{pvcName}-{standbyColor}`.
-Remove `isBlueGreen` conditional. Remove namespace creation (that's now the manager's job).
-
-**Switch:** Record previous active volume names in switch section.
-
-**Cleanup:** Delete previous active-colored Longhorn volumes listed in switch section.
-
-### 4. API Handlers — `api/internal/api/v1/handlers_backup.go`
-
-**Update existing handlers:**
- `BackupAppStart` → create RecoveryPlan instead of BackupInfo
- `BackupAppRestore` → remove `BlueGreen` option, always use plan-driven restore
- `BackupAppList` → return RecoveryPlans instead of BackupInfo list
- `BackupAppLatest` → return latest RecoveryPlan
-
-**Add new handlers:**
- `AppSwitch` — POST, calls `mgr.SwitchApp()`
- `AppCleanup` — POST, calls `mgr.CleanupApp()`
- `AppRecoveryPlan` — GET, returns current RecoveryPlan for an app
-
-### 5. API Routes — `api/internal/api/v1/handlers.go`
-
-**Add new routes:**
-```go
-r.HandleFunc("/api/v1/instances/{name}/apps/{app}/switch", api.AppSwitch).Methods("POST")
-r.HandleFunc("/api/v1/instances/{name}/apps/{app}/cleanup", api.AppCleanup).Methods("POST")
-r.HandleFunc("/api/v1/instances/{name}/apps/{app}/recovery-plan", api.AppRecoveryPlan).Methods("GET")
-```
-
-### 6. App Management — `api/internal/apps/apps.go`
-
-**`ListDeployed()` changes:**
- Currently: scans `apps/` directory, each subdirectory = one app, checks if namespace exists
- New: must handle colored directories (`apps/gitea-blue/`, `apps/gitea-green/`)
- Read `activeDeployment` from config.yaml to know which color is active
- Group colored directories under the same app name for display
- Return the active color's status as the app's status
- Indicate if a standby color exists (for UI display)
-
-**`Deploy()` changes:**
- Currently: namespace = appName, always
- New: namespace = `{appName}-{color}` when deploying via restore
- Normal `wild app deploy` continues to use bare namespace (= blue)
- The restore flow creates the standby-colored namespace via the plan
-
-**`GetStatus()` changes:**
- Read `activeDeployment` to know which namespace to check
- Return both active and standby status if both exist
-
-**`Delete()` changes:**
- Must handle deleting a colored namespace
- Should refuse to delete if a RecoveryPlan is in progress (restoring/switching)
-
-### 7. Config.yaml — `activeDeployment` field
-
-No schema change needed — config.yaml is a free-form YAML map. The backup system
-reads/writes `apps.{app}.activeDeployment` as a string field.
-
-**Backwards compatibility:**
- If `activeDeployment` is absent, treat as `"blue"`
- If app namespace has no color suffix, treat as blue
- First backup of an existing app records `activeColor: blue`
- First restore creates the green variant
-
-**Where it's read:**
- `backup.go` → `BackupApp()` to determine active color
- `apps.go` → `ListDeployed()` and `GetStatus()` to find active namespace
-
-**Where it's written:**
- `backup.go` → `SwitchApp()` sets it to the standby color
-
-### 8. CLI — `cli/cmd/backup.go`
-
-**Update existing commands:**
- `wild backup start <app>` → show RecoveryPlan summary after backup
- `wild restore <app>` → show standby color and plan status after restore
-
-**Add new commands:**
- `wild app switch <app>` → calls POST `/api/v1/instances/{}/apps/{}/switch`
- `wild app cleanup <app>` → calls POST `/api/v1/instances/{}/apps/{}/cleanup`
- `wild app recovery-plan <app>` → calls GET, displays plan YAML
-
-### 9. Web App
-
-#### API Client — `web/src/services/api/backups.ts`
-
-**Add RecoveryPlan types:**
-```typescript
-interface RecoveryPlan {
-  app: string;
-  instance: string;
-  status: string;
-  error?: string;
-  source: {
-    activeColor: string;
-    namespace: string;
-    appDir: string;
-  };
-  standbyColor: string;
-  standby?: {
-    namespace: string;
-    appDir: string;
-  };
-  strategies: StrategyEntry[];
-  phases: Record<string, { startedAt?: string; completedAt?: string }>;
-}
-
-interface StrategyEntry {
-  name: string;
-  status: string;
-  params?: Record<string, any>;
-  backup?: Record<string, any>;
-  restore?: Record<string, any>;
-  switch?: Record<string, any>;
-}
-```
-
-**Add API methods:**
-```typescript
-switchApp(instanceName: string, appName: string): Promise<void>
-cleanupApp(instanceName: string, appName: string): Promise<void>
-getRecoveryPlan(instanceName: string, appName: string): Promise<RecoveryPlan>
-```
-
-#### Backup Components — `web/src/components/backup/`
-
-**Update `BackupCard.tsx`:**
- Show RecoveryPlan status instead of flat BackupInfo
- Show active/standby color indicators
- Show phase progression (backed_up → restored → switched → cleaned_up)
- Add Switch and Cleanup action buttons (enabled based on plan status)
-
-**Update `BackupDetailsModal.tsx`:**
- Show per-strategy status from the plan
- Show params, backup locations, restore targets, switch records
- Render the plan as a readable phase timeline
-
-**New component: Recovery plan phase indicator**
- Visual progression through the four phases
- Per-strategy status within each phase
- Error display when a phase fails
-
-#### App Status Display
-
-**Wherever app status is shown:**
- If `activeDeployment` exists and a standby namespace exists, show both colors
- Indicate which color is active (serving traffic)
- Show if a recovery is in progress (plan status = restoring/switching)
-
-## Implementation Order
-
-The changes should be implemented in this order to keep the system working
-at each step:
-
-### Phase 1: Types and RecoveryPlan (foundation)
-1. Add `RecoveryPlan` and related types to `types/types.go`
-2. Add `Switch()` and `Cleanup()` to Strategy interface
-3. Add no-op `Switch()` and `Cleanup()` to all four strategy implementations
-4. Add RecoveryPlan read/write helpers to backup manager
-
-### Phase 2: Backup phase (create plans)
-5. Update `BackupApp()` to create a RecoveryPlan
-6. Update each strategy's `Backup()` to write to plan instead of returning ComponentBackup
-7. Persist plan alongside existing BackupInfo (dual-write for compatibility)
-8. Update `BackupAppStart` handler and route
-9. Tests: verify backup creates valid RecoveryPlan
-
-### Phase 3: Restore phase (use plans)
-10. Update `RestoreApp()` to load plan and compute standby targets
-11. Update each strategy's `Restore()` to read from plan
-12. Replace `deployToRestoreNamespace()` with plan-driven deployment
-13. Replace regex-based DB reference updates with plan-driven rewrites
-14. Update `BackupAppRestore` handler
-15. Tests: verify restore creates correct standby-colored resources
-
-### Phase 4: Switch and cleanup (new phases)
-16. Implement `SwitchApp()` in backup manager
-17. Implement `Switch()` in postgres and longhorn strategies
-18. Implement `CleanupApp()` in backup manager
-19. Implement `Cleanup()` in postgres and longhorn strategies
-20. Add `activeDeployment` read/write to config.yaml
-21. Add switch and cleanup API handlers and routes
-22. Tests: verify full lifecycle backup → restore → switch → cleanup
-
-### Phase 5: App management integration
-23. Update `ListDeployed()` to handle colored namespaces
-24. Update `GetStatus()` to read `activeDeployment`
-25. Update `Deploy()` to support colored namespace parameter
-26. Handle backwards compatibility (bare namespace = blue)
-
-### Phase 6: CLI
-27. Add `wild app switch` command
-28. Add `wild app cleanup` command
-29. Add `wild app recovery-plan` command
-30. Update `wild restore` output to show plan status
-
-### Phase 7: Web App
-31. Add RecoveryPlan types and API client methods
-32. Update BackupCard to show plan status and phase progression
-33. Update BackupDetailsModal with per-strategy plan view
-34. Add Switch and Cleanup action buttons
-35. Update app status display for dual-color awareness
-
-### Phase 8: Cleanup old code
-36. Remove `deployToRestoreNamespace()` and `updateDatabaseReferences()`
-37. Remove `RestoreOptions.BlueGreen` field
-38. Remove metadata injection of `blueGreen` flag
-39. Remove dual-write of BackupInfo (if plan fully replaces it)
-40. Update tests to remove old patterns
-
-## Files Changed
-
-### API
-| File | Change |
-|------|--------|
-| `api/internal/backup/types/types.go` | Add RecoveryPlan types, update Strategy interface |
-| `api/internal/backup/backup.go` | Rewrite BackupApp/RestoreApp, add SwitchApp/CleanupApp |
-| `api/internal/backup/strategies/config.go` | Update Backup/Restore signatures, add Switch/Cleanup |
-| `api/internal/backup/strategies/postgres.go` | Update Backup/Restore, implement Switch/Cleanup |
-| `api/internal/backup/strategies/mysql.go` | Update Backup/Restore, implement Switch/Cleanup |
-| `api/internal/backup/strategies/longhorn_native.go` | Update Backup/Restore, implement Switch/Cleanup |
-| `api/internal/backup/strategies/postgres_test.go` | Update tests for new signatures |
-| `api/internal/api/v1/handlers_backup.go` | Update handlers, add switch/cleanup/plan handlers |
-| `api/internal/api/v1/handlers.go` | Add new routes |
-| `api/internal/apps/apps.go` | Handle colored namespaces, activeDeployment |
-
-### CLI
-| File | Change |
-|------|--------|
-| `cli/cmd/backup.go` | Add switch, cleanup, recovery-plan commands |
-
-### Web App
-| File | Change |
-|------|--------|
-| `web/src/services/api/backups.ts` | Add RecoveryPlan types and API methods |
-| `web/src/components/backup/BackupCard.tsx` | Show plan status, add switch/cleanup buttons |
-| `web/src/components/backup/BackupDetailsModal.tsx` | Show per-strategy plan details |
-
-## Migration
-
-Existing backups (BackupInfo records without RecoveryPlan) continue to work:
- `BackupAppList` returns both old BackupInfo and new RecoveryPlan formats
- Old backups can still be restored via the old flow until fully migrated
- New backups create RecoveryPlans alongside BackupInfo during transition (Phase 2, step 7)
- After Phase 8, old format is dropped
-
-Existing deployed apps (bare namespace, no `activeDeployment`):
- Treated as blue by default
- No migration needed — first backup records `activeColor: blue`
- First restore creates the green variant naturally
--- a/docs/design/app-states.md
+++ b/docs/design/app-states.md
--- a/docs/design/backup-system.md
+++ b/docs/design/backup-system.md
--- a/docs/guides/cluster-networking-health.md
+++ b/docs/guides/cluster-networking-health.md
@@ -4,33 +4,101 @@ Verifying every item on this list confirms the full networking stack is function

 ## Node Layer

-1. **All nodes Ready** — no cordons, no taints (e.g. `maintenance:NoExecute`)
+1. **All nodes Ready** — no cordons, no taints (e.g., `maintenance:NoExecute`)
+   ```bash
+   kubectl get nodes
+   wild node list
+   ```
+
 2. **Flannel pods running on every node** — stale VXLAN tunnels break cross-node pod traffic
+   ```bash
+   kubectl get pods -n kube-system -l app=flannel -o wide
+   ```
+
 3. **Cross-node pod connectivity** — pods on each worker can reach pods on every other node

 ## Service Routing

 4. **kube-proxy pods running on every node** — nftables rules route ClusterIP traffic to pod endpoints
+   ```bash
+   kubectl get pods -n kube-system -l k8s-app=kube-proxy -o wide
+   ```
+
 5. **CoreDNS pods running and resolving** — both cluster-internal names (`*.svc.cluster.local`) and external names
+   ```bash
+   kubectl get pods -n kube-system -l k8s-app=kube-dns
+   ```
+
 6. **CoreDNS upstream reachability** — Talos DNS proxy at `169.254.116.108` responding from all nodes

 ## Load Balancing

 7. **MetalLB speakers running on all nodes** — L2 ARP announcements for LoadBalancer IPs
+   ```bash
+   kubectl get pods -n metallb-system -l component=speaker -o wide
+   ```
+
 8. **MetalLB ServiceL2Status resources valid** — `status.node` matches actual pod placement (stale entries block announcements)
+   ```bash
+   kubectl get servicel2statuses.metallb.io -n metallb-system
+   ```
+
 9. **LoadBalancer IPs reachable** — Traefik LB IP responds from LAN
+   ```bash
+   kubectl get svc -n traefik
+   curl -k https://<traefik-lb-ip>
+   ```

 ## Ingress & Security

 10. **Traefik ingress routing** — forwards to backend services, TLS termination working
+    ```bash
+    kubectl get pods -n traefik
+    kubectl logs -n traefik -l app=traefik | tail -20
+    ```
+
 11. **CrowdSec LAPI running** — can reach `api.crowdsec.net` (depends on CoreDNS external resolution)
+    ```bash
+    kubectl get pods -n crowdsec
+    ```
+
 12. **CrowdSec bouncer registered with LAPI** — unregistered bouncer blocks all forwardAuth requests
+    ```bash
+    wild service logs crowdsec | grep bouncer
+    ```

 ## Storage

 13. **Longhorn managers running on all workers** — enables volume replica scheduling and rebuilds
+    ```bash
+    kubectl get pods -n longhorn-system -l app=longhorn-manager -o wide
+    ```
+
 14. **Longhorn volume replicas healthy** — all volumes at target replica count across nodes
+    ```bash
+    kubectl get volumes.longhorn.io -n longhorn-system
+    ```
+
+## External DNS & Certificates
+
+15. **ExternalDNS pod running** — creating and updating DNS records at Cloudflare
+    ```bash
+    kubectl get pods -n externaldns
+    ```
+
+16. **cert-manager pods running** — issuing and renewing TLS certificates
+    ```bash
+    kubectl get pods -n cert-manager
+    kubectl get certificates -n cert-manager
+    ```

 ## LAN DNS

-15. **dnsmasq on Wild Central** — resolves LAN-local domains to correct LoadBalancer IPs (hairpin NAT)
+17. **dnsmasq on Wild Central** — resolves LAN-local domains to correct LoadBalancer IPs (hairpin NAT)
+    ```bash
+    wild dns status
+    ```
+
+## Quick Full Check
+
+Run `wild cluster health` for an automated check of the most critical items. For a comprehensive check, walk through each item above.
--- a/docs/guides/disaster-recovery.md
+++ b/docs/guides/disaster-recovery.md
@@ -0,0 +1,246 @@
+# Disaster Recovery
+
+This guide covers recovering a Wild Cloud cluster after catastrophic failure — hardware death, corrupted storage, or any scenario where you need to rebuild from scratch.
+
+## What You Need
+
+To rebuild a cluster you need two things:
+
+1. **Cluster config backup** — The tar.gz archive from Wild Cloud's cluster config backup feature, containing kubeconfig, talosconfig, config.yaml, secrets.yaml, and Talos node configs.
+2. **App backups** — The per-app backup archives (database dumps, PVC snapshots, config files) stored at your backup destination (S3, NFS, or local).
+
+If your instance data directory was a git repository (recommended), you also have the full history of compiled manifests and config.yaml in git. The git repo alone is enough to redeploy apps — but without secrets.yaml and kubeconfig, you can't authenticate to the cluster or decrypt app secrets.
+
+## Recovery Scenarios
+
+### Scenario 1: Wild Central Device Failure (Cluster Intact)
+
+The Raspberry Pi or server running Wild Central died, but the Kubernetes cluster nodes are still running.
+
+**Steps:**
+
+1. **Set up a new Wild Central device**:
+   ```bash
+   sudo dpkg -i wild-cloud-central_*.deb
+   sudo systemctl enable wild-cloud-central
+   ```
+
+2. **Restore your data directory** from git (for manifests and config) plus your cluster config backup (for secrets and credentials):
+   ```bash
+   # Clone instance data from git
+   git clone https://your-git-server/wild-cloud-data.git /var/lib/wild-central
+
+   # Extract cluster config backup over the top
+   # This restores kubeconfig, secrets.yaml, talosconfig, etc.
+   tar -xzf cluster-config-backup.tar.gz -C /var/lib/wild-central/instances/your-instance/
+   ```
+
+3. **Start Wild Central**:
+   ```bash
+   sudo systemctl start wild-cloud-central
+   ```
+
+4. **Verify connectivity**:
+   ```bash
+   wild instance use your-instance
+   wild cluster status
+   ```
+
+The cluster is still running — your apps are live. Wild Central is just the management plane.
+
+### Scenario 2: Single Node Failure (Cluster Degraded)
+
+One or more nodes died but the cluster still has quorum (at least 2 of 3 control plane nodes, or workers are replaceable).
+
+**Steps:**
+
+1. **Check cluster health** from Wild Central:
+   ```bash
+   talosctl --talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
+     health --nodes <surviving-node-ip>
+   ```
+
+2. **Remove the dead node** from the cluster:
+   ```bash
+   # Remove from Kubernetes
+   kubectl --kubeconfig /var/lib/wild-central/instances/your-instance/kubeconfig \
+     delete node <dead-node-name>
+
+   # Remove from etcd (if control plane node)
+   talosctl --talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
+     etcd remove-member <dead-node-name> --nodes <surviving-node-ip>
+   ```
+
+3. **PXE boot a replacement node** using Wild Central's PXE service, or manually install Talos Linux on the new hardware.
+
+4. **Add the new node** through the Wild Cloud web UI or CLI:
+   ```bash
+   wild node add --role worker --ip <new-node-ip>
+   ```
+
+5. **Verify workloads reschedule** to the new node:
+   ```bash
+   kubectl get pods --all-namespaces -o wide
+   ```
+
+### Scenario 3: Total Cluster Loss (Rebuild from Scratch)
+
+All nodes are gone. You need to rebuild everything.
+
+**Prerequisites:**
+- New hardware (or repaired existing hardware) with network boot capability or Talos Linux installed
+- Your cluster config backup (tar.gz with kubeconfig, talosconfig, secrets.yaml, Talos configs)
+- Access to your backup destination (S3 bucket, NFS share, etc.)
+- Your instance data git repo (if available — contains compiled manifests)
+
+**Steps:**
+
+1. **Set up Wild Central** on a fresh device:
+   ```bash
+   sudo dpkg -i wild-cloud-central_*.deb
+   ```
+
+2. **Restore your data directory**:
+   ```bash
+   # If you have a git repo:
+   git clone https://your-git-server/wild-cloud-data.git /var/lib/wild-central
+
+   # Extract cluster config over the top:
+   tar -xzf cluster-config-backup.tar.gz -C /var/lib/wild-central/instances/your-instance/
+   ```
+
+   If you don't have a git repo, just extract the cluster config backup into a fresh instance directory. You'll re-add apps from the Wild Directory.
+
+3. **Bootstrap new Talos nodes** using the restored Talos configs:
+   ```bash
+   # Apply control plane config to the first node
+   talosctl apply-config \
+     --talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
+     --nodes <node-ip> \
+     --file /var/lib/wild-central/instances/your-instance/talos/generated/controlplane.yaml \
+     --insecure
+   ```
+
+   The restored `controlplane.yaml` and `worker.yaml` contain your cluster's identity (cluster name, secrets, certificates). Using them ensures the new cluster has the same identity as the old one.
+
+4. **Bootstrap the cluster**:
+   ```bash
+   talosctl bootstrap \
+     --talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
+     --nodes <first-control-plane-ip>
+   ```
+
+5. **Wait for the cluster to be healthy**:
+   ```bash
+   talosctl health \
+     --talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
+     --nodes <first-control-plane-ip>
+   ```
+
+6. **Update kubeconfig** (the new cluster may issue a fresh kubeconfig):
+   ```bash
+   talosctl kubeconfig \
+     --talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
+     --nodes <first-control-plane-ip> \
+     /var/lib/wild-central/instances/your-instance/kubeconfig
+   ```
+
+7. **Deploy infrastructure services first** (order matters):
+   ```bash
+   wild instance use your-instance
+   wild service install metallb
+   wild service install traefik
+   wild service install cert-manager
+   wild service install external-dns
+   wild service install longhorn    # If using Longhorn for PVCs
+   ```
+
+8. **Deploy apps** (dependencies first, then apps):
+   ```bash
+   # Deploy database services first
+   wild app deploy pg
+   wild app deploy redis
+
+   # Then deploy apps
+   wild app deploy gitea
+   wild app deploy immich
+   # ... etc
+   ```
+
+   If your git repo has compiled manifests, these deploys apply the exact same manifests that were running before. If not, you'll need to re-add apps from the Wild Directory first:
+   ```bash
+   wild app add gitea
+   wild app deploy gitea
+   ```
+
+9. **Restore app data from backups**:
+   ```bash
+   # Restore each app's data (database + PVC) from the backup destination
+   # Use the Web UI: navigate to Backups > [app] > Restore
+   # Or via CLI:
+   wild restore gitea --auto
+   wild restore immich --auto
+   ```
+
+   The `--auto` flag runs the full blue-green restore cycle: restore to standby, switch traffic, then clean up the old namespace. For more control, run each phase separately — see [Restoring Backups](restoring-backups.md).
+
+10. **Verify everything is working**:
+    ```bash
+    wild app status gitea
+    wild app status immich
+    kubectl get pods --all-namespaces
+    ```
+
+## Cluster Config Backup
+
+The cluster config backup feature archives the files that are NOT tracked in git — the credentials and secrets needed to access the cluster.
+
+### What Gets Backed Up
+
+| File | Purpose |
+|------|---------|
+| `kubeconfig` | Kubernetes API credentials |
+| `config.yaml` | Full instance configuration |
+| `secrets.yaml` | App secrets (database passwords, API keys) |
+| `talos/generated/talosconfig` | Talos API credentials |
+| `talos/generated/controlplane.yaml` | Control plane node config |
+| `talos/generated/worker.yaml` | Worker node config |
+| `talos/generated/secrets.yaml` | Talos bootstrap secrets (cluster identity) |
+
+### Creating Cluster Config Backups
+
+**Web UI:** Navigate to Backups, click "Backup" on the "Cluster Config" row.
+
+**CLI:**
+```bash
+# Via API
+curl -X POST http://localhost:5055/api/v1/instances/your-instance/backup/cluster
+```
+
+**Scheduled:** Create a backup schedule with target type "cluster" to automatically back up cluster config on a recurring basis. See [Making Backups](making-backups.md) for scheduling details.
+
+### Downloading a Cluster Config Backup
+
+Cluster config backups are stored at your configured backup destination under the key `cluster-config/{instance}/{timestamp}.tar.gz`. To retrieve one:
+
+- **S3/Azure:** Download from the bucket/container using your cloud provider's CLI
+- **NFS:** Navigate to the NFS mount point and find the archive
+- **Local:** Find it at `{data-dir}/instances/{instance}/backups/cluster-config/...`
+
+Store a copy of the latest cluster config backup in a secure offsite location (encrypted USB drive, password manager, separate cloud storage). If your primary backup destination is on the cluster itself, a total cluster loss takes the backups with it.
+
+## Prevention Checklist
+
+- [ ] **Cluster config backups** are scheduled and running
+- [ ] **App backups** are scheduled for all critical apps
+- [ ] **Backup destination** is offsite or on separate infrastructure from the cluster
+- [ ] **Instance data directory** is pushed to a git remote (excludes secrets.yaml)
+- [ ] **Cluster config backup archive** is stored in a second location (not just on the cluster)
+- [ ] **Test a restore** periodically — backups are worthless if restore doesn't work
+
+## Related Guides
+
+- [Making Backups](making-backups.md) — Setting up backup destinations and schedules
+- [Restoring Backups](restoring-backups.md) — Blue-green restore process in detail
+- [Upgrade Talos](upgrade-talos.md) — Talos node upgrade and rollback
+- [Troubleshoot Cluster](troubleshoot-cluster.md) — Diagnosing cluster issues after recovery
--- a/docs/guides/making-backups.md
+++ b/docs/guides/making-backups.md
@@ -1,265 +1,250 @@
 # Making Backups

-This guide covers how to create backups of your wild-cloud infrastructure using the integrated backup system.
+This guide covers how to create backups of your Wild Cloud applications and cluster configuration.

 ## Overview

-The wild-cloud backup system creates encrypted, deduplicated snapshots using restic. It backs up three main components:
+Wild Cloud's backup system creates backups using native tools for each data type:

- **Applications**: Database dumps and persistent volume data
- **Cluster**: Kubernetes resources and etcd state
- **Configuration**: Wild-cloud repository and settings
+- **PostgreSQL databases**: `pg_dump` in custom compressed format
+- **MySQL databases**: `mysqldump` with gzip compression
+- **Persistent volumes**: Longhorn native backup API
+- **Configuration**: tar.gz archives of manifests, config, and secrets
+
+Backups are stored at a configured destination (S3, Azure Blob, NFS, or local filesystem) and tracked via recovery plans that coordinate the full backup-restore lifecycle.

 ## Prerequisites

 Before making backups, ensure you have:

-1. **Environment configured**: Run `source env.sh` to load backup configuration
-2. **Restic repository**: Backup repository configured in `config.yaml`
-3. **Backup password**: Set in wild-cloud secrets
-4. **Staging directory**: Configured path for temporary backup files
+1. **A backup destination configured** — S3 bucket, Azure container, NFS share, or local path
+2. **Longhorn backup target** configured if backing up persistent volumes
+3. **kubectl access** to your cluster

-## Backup Components
+## Configuring Backup Destination

-### Applications (`wild-app-backup`)
+### Web UI

-Backs up individual applications including:
- **Database dumps**: PostgreSQL/MySQL databases in compressed custom format
- **PVC data**: Application files streamed directly for restic deduplication
- **Auto-discovery**: Finds databases and PVCs based on app manifest.yaml
+Navigate to **Backups** and click **Settings** to configure your backup destination and retention policy.

-### Cluster Resources (`wild-backup --cluster-only`)
+### CLI

-Backs up cluster-wide resources:
- **Kubernetes resources**: All pods, services, deployments, secrets, configmaps
- **Storage definitions**: PersistentVolumes, PVCs, StorageClasses  
- **etcd snapshot**: Complete cluster state for disaster recovery
+Backup configuration is stored in your instance's `config.yaml` under the `backup:` section. Credentials are stored in `secrets.yaml`.

-### Configuration (`wild-backup --home-only`)
+Example configuration:

-Backs up wild-cloud configuration:
- **Repository contents**: All app definitions, manifests, configurations
- **Settings**: Wild-cloud configuration files and customizations
+```yaml
+# config.yaml
+backup:
+  destination:
+    type: "s3"  # "s3", "azure", "nfs", or "local"
+    s3:
+      bucket: "my-backups"
+      region: "us-east-1"
+      endpoint: "minio.example.com"  # Optional, for S3-compatible services
+  retention:
+    daily: 7
+    weekly: 4
+    monthly: 6
+    yearly: 1
+```
+
+```yaml
+# secrets.yaml
+backup:
+  s3:
+    accessKeyId: "..."
+    secretAccessKey: "..."
+```
+
+### Supported Destinations
+
+| Destination | Config Fields | Notes |
+|-------------|--------------|-------|
+| **local** | `path` | Default: `instances/{instance}/backups` |
+| **s3** | `bucket`, `region`, `endpoint`, `accessKeyId`, `secretAccessKey` | Supports S3-compatible services like MinIO |
+| **azure** | `container`, `storageAccount`, `accessKey` | Azure Blob Storage |
+| **nfs** | `server`, `path`, `mountPoint`, `mountOptions` | Auto-recovers stale mounts |

 ## Making Backups

-### Full System Backup (Recommended)
+### Single App Backup

-Create a complete backup of everything:
+**Web UI:** Navigate to **Backups > [app]** and click **Backup Now**.
+
+**CLI:**
+```bash
+# Backup a single app
+wild backup start gitea
+
+# Shorthand
+wild backup gitea
+```
+
+### All Apps Backup
+
+**CLI:**
+```bash
+# Backup all deployed apps
+wild backup all
+```
+
+### Cluster Config Backup
+
+Cluster config backups archive the credentials and secrets not tracked in git — kubeconfig, talosconfig, config.yaml, secrets.yaml, and Talos generated configs.
+
+**Web UI:** Navigate to **Backups** and click **Backup** on the Cluster Config row.
+
+**CLI / API:**
+```bash
+curl -X POST http://localhost:5055/api/v1/instances/{instance}/backup/cluster
+```
+
+## What Gets Backed Up
+
+### Application Backups
+
+The backup system auto-discovers what to back up based on each app's manifest:
+
+| Component | Tool | Format | Storage Key |
+|-----------|------|--------|-------------|
+| PostgreSQL database | `pg_dump` | Custom binary (compression level 9) + globals SQL | `postgres/{instance}/{app}/{timestamp}.dump` |
+| MySQL database | `mysqldump` | Gzip-compressed SQL | `mysql/{instance}/{app}/{timestamp}.sql.gz` |
+| Persistent volumes | Longhorn native API | Longhorn backup format | Stored in Longhorn backup target |
+| App config & manifests | tar + gzip | tar.gz archive | `config/{instance}/{app}/{timestamp}.tar.gz` |
+
+Cache volumes (names containing `-cache` or `-tmp`) and cache databases (Redis, Memcached) are automatically excluded.
+
+### Cluster Config Backups
+
+| File | Purpose |
+|------|---------|
+| `kubeconfig` | Kubernetes API credentials |
+| `config.yaml` | Full instance configuration |
+| `secrets.yaml` | App secrets (database passwords, API keys) |
+| `talos/generated/talosconfig` | Talos API credentials |
+| `talos/generated/controlplane.yaml` | Control plane node config |
+| `talos/generated/worker.yaml` | Worker node config |
+| `talos/generated/secrets.yaml` | Talos bootstrap secrets (cluster identity) |
+
+**Storage key:** `cluster-config/{instance}/{timestamp}.tar.gz`
+
+## Discovering Backup Resources
+
+Before backing up for the first time, you can discover what persistent data an app has:

 ```bash
-# Backup all components (apps + cluster + config)
-wild-backup
+wild backup discover gitea
 ```

-This is equivalent to:
+This analyzes the app's manifest and kustomize resources to find databases and PVCs, showing what will be backed up and what will be skipped.
+
+## Scheduled Backups
+
+### Creating a Schedule
+
+**Web UI:** Navigate to **Backups > [app]** and click **Schedule**.
+
+**CLI:**
 ```bash
-wild-backup --home --apps --cluster
+# Daily backup at 2 AM
+wild backup schedule create gitea --frequency daily --time 02:00
+
+# Weekly backup on Sunday at 3 AM
+wild backup schedule create gitea --frequency weekly --time 03:00 --day-of-week 0
+
+# Monthly backup on the 1st at midnight
+wild backup schedule create gitea --frequency monthly --time 00:00 --day-of-month 1
 ```

-### Selective Backups
-
-#### Applications Only
-```bash
-# All applications
-wild-backup --apps-only
-
-# Single application  
-wild-app-backup discourse
-
-# Multiple applications
-wild-app-backup discourse gitea immich
-```
-
-#### Cluster Only
-```bash
-# Kubernetes resources + etcd
-wild-backup --cluster-only
-```
-
-#### Configuration Only
-```bash  
-# Wild-cloud repository
-wild-backup --home-only
-```
-
-### Excluding Components
-
-Skip specific components:
+### Managing Schedules

 ```bash
-# Skip config, backup apps + cluster
-wild-backup --no-home
+# List all schedules
+wild backup schedule list

-# Skip applications, backup config + cluster  
-wild-backup --no-apps
+# Enable/disable a schedule
+wild backup schedule enable <schedule-id>
+wild backup schedule disable <schedule-id>

-# Skip cluster resources, backup config + apps
-wild-backup --no-cluster
+# Manually trigger a schedule
+wild backup schedule run <schedule-id>
+
+# Delete a schedule
+wild backup schedule delete <schedule-id>
 ```

-## Backup Process Details
+Retention is enforced automatically after each scheduled backup completes.

-### Application Backup Process
-
-1. **Discovery**: Parses `manifest.yaml` to find database and PVC dependencies
-2. **Database backup**: Creates compressed custom-format dumps
-3. **PVC backup**: Streams files directly to staging for restic deduplication  
-4. **Staging**: Organizes files in clean directory structure
-5. **Upload**: Creates individual restic snapshots per application
-
-### Cluster Backup Process
-
-1. **Resource export**: Exports all Kubernetes resources to YAML
-2. **etcd snapshot**: Creates point-in-time etcd backup via talosctl
-3. **Upload**: Creates single restic snapshot for cluster state
-
-### Restic Snapshots
-
-Each backup creates tagged restic snapshots:
+## Listing and Verifying Backups

 ```bash
-# View all snapshots
-restic snapshots
+# List backups for an app
+wild backup list gitea

-# Filter by component
-restic snapshots --tag discourse    # Specific app
-restic snapshots --tag cluster      # Cluster resources
-restic snapshots --tag wc-home      # Wild-cloud config
+# Verify a backup can be restored
+wild backup verify gitea
+
+# Verify a specific backup
+wild backup verify gitea 20250314T021530Z
 ```

-## Where Backup Files Are Staged
-
-Before uploading to your restic repository, backup files are organized in a staging directory. This temporary area lets you see exactly what's being backed up and helps with deduplication.
-
-Here's what the staging area looks like:
-
-```
-backup-staging/
-├── apps/
-│   ├── discourse/
-│   │   ├── database_20250816T120000Z.dump
-│   │   ├── globals_20250816T120000Z.sql  
-│   │   └── discourse/
-│   │       └── data/         # All the actual files
-│   ├── gitea/
-│   │   ├── database_20250816T120000Z.dump
-│   │   └── gitea-data/
-│   │       └── data/         # Git repositories, etc.
-│   └── immich/
-│       ├── database_20250816T120000Z.dump
-│       └── immich-data/
-│           └── upload/       # Photos and videos
-└── cluster/
-    ├── all-resources.yaml    # All running services
-    ├── secrets.yaml          # Passwords and certificates
-    ├── configmaps.yaml       # Configuration data
-    └── etcd-snapshot.db      # Complete cluster state
-```
-
-This staging approach means you can examine backup contents before they're uploaded, and restic can efficiently deduplicate files that haven't changed.
-
-## Advanced Usage
-
-### Custom Backup Scripts
-
-Applications can provide custom backup logic:
+## Deleting Backups

 ```bash
-# Create apps/myapp/backup.sh for custom behavior
-chmod +x apps/myapp/backup.sh
+# Delete a specific backup
+wild backup delete gitea 20250314T021530Z

-# wild-app-backup will use custom script if present
-wild-app-backup myapp
+# Skip confirmation
+wild backup delete gitea 20250314T021530Z --yes
 ```

-### Monitoring Backup Status
+## Backup Health

+Check the overall health of your backup system:
+
+**Web UI:** The **Backups** page shows a health summary across all apps — backup count, last backup time, scheduled status, and total size.
+
+**API:**
 ```bash
-# Check recent snapshots
-restic snapshots | head -20
-
-# Check specific app backups
-restic snapshots --tag discourse
-
-# Verify backup integrity
-restic check
+curl http://localhost:5055/api/v1/instances/{instance}/backup/health
 ```

-### Backup Automation
+## Recovery Plans

-Set up automated backups with cron:
+Each backup creates a recovery plan (`recovery-plan.yaml`) that tracks the backup's contents and coordinates restore operations. The plan records what strategies were used, where data is stored, and the current lifecycle status.

-```bash
-# Daily full backup at 2 AM
-0 2 * * * cd /data/repos/payne-cloud && source env.sh && wild-backup
-
-# Hourly app backups during business hours  
-0 9-17 * * * cd /data/repos/payne-cloud && source env.sh && wild-backup --apps-only
-```
-
-## Performance Considerations
-
-### Large PVCs (like Immich photos)
-
-The streaming backup approach provides:
- **First backup**: Full transfer time (all files processed)
- **Subsequent backups**: Only changed files processed (dramatically faster)
- **Storage efficiency**: Restic deduplication reduces storage usage
-
-### Network Usage
-
- **Database dumps**: Compressed at source, efficient transfer
- **PVC data**: Uncompressed transfer, but restic handles deduplication
- **etcd snapshots**: Small files, minimal impact
+Plan statuses progress through: `backing_up` -> `backed_up` -> (restore phases when used).

 ## Troubleshooting

-### Common Issues
+### "No databases or PVCs found"
+- The app has no database dependencies in its `manifest.yaml`
+- No PVCs with matching labels exist in the app namespace
+- Run `wild backup discover <app>` to see what's detected

-**"No databases or PVCs found"**
- App has no `manifest.yaml` with database dependencies
- No PVCs with matching labels in app namespace
- Create custom `backup.sh` script for special cases
+### Longhorn backup fails
+- Verify Longhorn backup target is configured (`kubectl get settings -n longhorn-system backup-target`)
+- Check Longhorn manager pods are running on all worker nodes
+- Ensure sufficient storage at the backup target

-**"kubectl not found"** 
- Ensure kubectl is installed and configured
- Check cluster connectivity with `kubectl get nodes`
+### Database dump fails
+- Verify the database pod is running: `kubectl get pods -n postgres`
+- Check that the database name in `config.yaml` matches the actual database

-**"Staging directory not set"**
- Configure `cloud.backup.staging` in `config.yaml`
- Ensure directory exists and is writable
-
-**"Could not create etcd backup"**
- Ensure `talosctl` is installed for Talos clusters
- Check control plane node connectivity
- Verify etcd pods are accessible in kube-system namespace
-
-### Backup Verification
-
-Always verify backups periodically:
-
-```bash
-# Check restic repository integrity
-restic check
-
-# List recent snapshots
-restic snapshots --compact
-
-# Test restore to different directory
-restic restore latest --target /tmp/restore-test
-```
+### Scheduled backups not running
+- Verify the schedule is enabled: `wild backup schedule list`
+- Check the Wild Central API is running: `wild daemon status`

 ## Security Notes

- **Encryption**: All backups are encrypted with your backup password
- **Secrets**: Kubernetes secrets are included in cluster backups
- **Access control**: Secure your backup repository and passwords
- **Network**: Consider bandwidth usage for large initial backups
+- **Encryption**: S3 and Azure destinations support server-side encryption. Configure bucket/container encryption policies at your cloud provider.
+- **Secrets**: Database credentials and API keys are included in cluster config backups. Store these backups securely.
+- **Access control**: Restrict access to your backup destination. Cluster config backups contain everything needed to access your cluster.

 ## Next Steps

- [Restoring Backups](restoring-backups.md) - Learn how to restore from backups
- Configure automated backup schedules
- Set up backup monitoring and alerting
- Test disaster recovery procedures
+- [Restoring Backups](restoring-backups.md) — Learn how to restore from backups using blue-green deployment
+- [Disaster Recovery](disaster-recovery.md) — Full cluster rebuild procedures
+- Set up scheduled backups for all critical apps
+- Store cluster config backups in a second location (not on the cluster itself)
--- a/docs/guides/monitoring.md
+++ b/docs/guides/monitoring.md
@@ -1,50 +1,209 @@
 # System Health Monitoring

-## Basic Monitoring
+This guide covers how to monitor the health of your Wild Cloud cluster, nodes, and applications.

-Check system health with:
+## Dashboard Overview
+
+The Wild Cloud web app dashboard provides an at-a-glance view of your cluster:
+
+- Cluster health status with individual health checks
+- Node count and status (control plane and worker)
+- Kubernetes and Talos versions
+- Running operations summary
+- Active app count
+
+Navigate to your instance's **Dashboard** page for this overview.
+
+## Cluster Health
+
+### Web UI
+
+The **Dashboard** page runs automated health checks covering:
+
+- Control plane readiness
+- Worker node readiness
+- etcd health
+- Networking health
+- Storage health
+
+Each check shows pass/fail status with detailed messages.
+
+### CLI
+
+```bash
+# Quick cluster health check
+wild cluster health
+
+# Cluster status overview
+wild cluster status
+
+# Check overall system health
+wild health
+```
+
+### API
+
+```bash
+# Detailed health checks
+curl http://localhost:5055/api/v1/instances/{instance}/cluster/health
+
+# Cluster status
+curl http://localhost:5055/api/v1/instances/{instance}/cluster/status
+```
+
+## Node Monitoring
+
+### Web UI
+
+The **Cluster** page shows all nodes with:
+
+- Status indicators (Ready, NotReady, maintenance)
+- Role (control plane / worker)
+- Hardware info (CPU, memory, storage)
+- Talos version
+- Current and target IP addresses
+
+### CLI
+
+```bash
+# List all nodes with status
+wild node list
+
+# Detailed node info
+wild node show <hostname>
+```
+
+### kubectl

 ```bash
 # Node resource usage
 kubectl top nodes

-# Pod resource usage
-kubectl top pods -A
-
-# Persistent volume claims
-kubectl get pvc -A
+# Node status and conditions
+kubectl get nodes -o wide
+kubectl describe node <node-name>
 ```

-## Advanced Monitoring (Future Implementation)
+## Application Monitoring

-Consider implementing:
+### Web UI

-1. **Prometheus + Grafana** for comprehensive monitoring:
-   ```bash
-   # Placeholder for future implementation
-   helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
-   helm install prometheus prometheus-community/kube-prometheus-stack --namespace monitoring --create-namespace
-   ```
+The **Apps > Installed** page shows all deployed apps with real-time status (running, unhealthy, no-pods, error). Click an app for detailed information including pod status, resource usage, and logs.

-2. **Loki** for log aggregation:
-   ```bash
-   # Placeholder for future implementation
-   helm repo add grafana https://grafana.github.io/helm-charts
-   helm install loki grafana/loki-stack --namespace logging --create-namespace
-   ```
+### CLI

-## Additional Resources
+```bash
+# List deployed apps with status
+wild app list-deployed

-This document will be expanded in the future with:
+# Detailed app status
+wild app status <app>

- Detailed backup and restore procedures
- Monitoring setup instructions
- Comprehensive security hardening guide
- Automated maintenance scripts
+# View app logs
+wild service logs <app> --follow
+```

-For now, refer to the following external resources:
+### kubectl

- [K3s Documentation](https://docs.k3s.io/)
- [Kubernetes Troubleshooting Guide](https://kubernetes.io/docs/tasks/debug/)
- [Velero Backup Documentation](https://velero.io/docs/latest/)
- [Kubernetes Security Best Practices](https://kubernetes.io/docs/concepts/security/)
+```bash
+# Pod resource usage across all namespaces
+kubectl top pods -A
+
+# Pods not in Running/Completed state
+kubectl get pods -A | grep -v "Running\|Completed"
+
+# Events for a specific app
+kubectl get events -n <app-namespace> --sort-by='.lastTimestamp'
+```
+
+## Backup Health
+
+### Web UI
+
+The **Backups** page shows a health summary across all apps:
+
+- Total backup count and size
+- Last backup time for each app
+- Whether scheduled backups are configured
+- Failed backup indicators
+
+### CLI / API
+
+```bash
+# Backup health overview
+curl http://localhost:5055/api/v1/instances/{instance}/backup/health
+```
+
+## Operations Monitoring
+
+Long-running operations (deployments, backups, restores, node upgrades) are tracked by the operations system.
+
+### Web UI
+
+The **Operations** page shows all operations with filtering by status (running, completed, failed) and real-time progress updates.
+
+### CLI
+
+```bash
+# List recent operations
+wild operation list
+
+# Check a specific operation
+wild operation get <operation-id>
+```
+
+## Storage Monitoring
+
+### Persistent Volumes
+
+```bash
+# Check PVC status and usage across all namespaces
+kubectl get pvc -A
+
+# Longhorn volume health
+kubectl get volumes.longhorn.io -n longhorn-system
+```
+
+### Longhorn Dashboard
+
+If Headlamp is installed, you can access the Kubernetes dashboard for detailed Longhorn volume information. Alternatively, access the Longhorn UI directly if its ingress is configured.
+
+## DNS Health
+
+```bash
+# Check dnsmasq status on Wild Central
+wild dns status
+
+# View current DNS configuration
+wild dns config
+
+# Test internal DNS resolution
+kubectl run -i --tty --rm debug --image=busybox --restart=Never -- \
+  nslookup kubernetes.default.svc.cluster.local
+```
+
+## Key Health Indicators
+
+| Component | Healthy Sign | Warning Sign |
+|-----------|-------------|--------------|
+| Nodes | All Ready, no taints | NotReady, cordoned, or tainted |
+| Pods | Running/Completed | CrashLoopBackOff, Pending, Evicted |
+| PVCs | Bound | Pending, Lost |
+| Longhorn volumes | Healthy, target replica count | Degraded, faulted, rebuilding |
+| Backups | Recent, scheduled | No recent backup, failed |
+| etcd | Healthy cluster members | Member unreachable, high latency |
+| MetalLB | All speakers running | Missing speakers, stale L2 status |
+
+## Setting Up Alerts
+
+Wild Cloud does not currently include a built-in alerting system. For production environments, consider:
+
+1. **Backup scheduling** with verification to catch backup failures early
+2. **Periodic health checks** via `wild cluster health` in a cron job
+3. **External monitoring** pointing at your app URLs for uptime checks
+
+## Next Steps
+
+- [Cluster Networking Health](cluster-networking-health.md) — Detailed networking health checklist
+- [Troubleshoot Cluster](troubleshoot-cluster.md) — When health checks fail
+- [Making Backups](making-backups.md) — Set up backup schedules
--- a/docs/guides/restoring-backups.md
+++ b/docs/guides/restoring-backups.md
@@ -1,294 +1,277 @@
 # Restoring Backups

-This guide will walk you through restoring your applications and cluster from wild-cloud backups. Hopefully you'll never need this, but when you do, it's critical that the process works smoothly.
+This guide covers how to restore applications from Wild Cloud backups. The restore system uses a blue-green deployment model — data is restored to a standby namespace so you can verify it before switching traffic.

-## Understanding Restore Types
+## Understanding the Blue-Green Restore

-Your wild-cloud backup system can restore different types of data depending on what you need to recover:
+Wild Cloud restores follow a three-phase process:

-**Application restores** bring back individual applications by restoring their database contents and file storage. This is what you'll use most often - maybe you accidentally deleted something in Discourse, or Gitea got corrupted, or you want to roll back Immich to before a bad update.
+```
+Restore → Switch → Cleanup
+```

-**Cluster restores** are for disaster recovery scenarios where you need to rebuild your entire Kubernetes cluster from scratch. This includes restoring all the cluster's configuration and even its internal state.
+1. **Restore**: Creates a standby namespace with restored data. Your active app keeps running untouched.
+2. **Switch**: Redirects traffic from the active deployment to the standby. The standby becomes the new active.
+3. **Cleanup**: Removes the previous active deployment and resources.

-**Configuration restores** bring back your wild-cloud repository and settings, which contain all the "recipes" for how your infrastructure should be set up.
+This means restores are safe — if something goes wrong, your active app is still running.

-## Before You Start Restoring
+## Before You Start

-Make sure you have everything needed to perform restores. You need to be in your wild-cloud directory with the environment loaded (`source env.sh`). Your backup repository and password should be configured and working - you can test this by running `restic snapshots` to see your available backups.
+Make sure you have:
+- kubectl access to your cluster
+- Backup destination accessible (same destination where backups were stored)
+- The app deployed (or at least added) to your instance

-Most importantly, make sure you have kubectl access to your cluster, since restores involve creating temporary pods and manipulating storage.
+List available backups first:
+
+```bash
+wild backup list gitea
+```

 ## Restoring Applications

-### Basic Application Restore
+### Full Restore (Automatic)

-The most common restore scenario is bringing back a single application. To restore the latest backup of an app:
+The simplest approach runs all three phases automatically:

 ```bash
-wild-app-restore discourse
+wild restore gitea --auto
 ```

-This restores both the database and all file storage for the discourse app. The restore system automatically figures out what the app needs based on its manifest file and what was backed up.
+This restores the latest backup, switches traffic, and cleans up the old deployment in one operation.

-If you want to restore from a specific backup instead of the latest:
+### Full Restore from Specific Backup

 ```bash
-wild-app-restore discourse abc123
+wild restore gitea 20250314T021530Z --auto
 ```

-Where `abc123` is the snapshot ID from `restic snapshots --tag discourse`.
+### Step-by-Step Restore (Recommended for Critical Apps)

-### Partial Restores
+For production apps, run each phase separately so you can verify between steps:

-Sometimes you only need to restore part of an application. Maybe the database is fine but the files got corrupted, or vice versa.
-
-To restore only the database:
-```bash
-wild-app-restore discourse --db-only
-```
-
-To restore only the file storage:
-```bash
-wild-app-restore discourse --pvc-only
-```
-
-To restore without database roles and permissions (if they're causing conflicts):
-```bash
-wild-app-restore discourse --skip-globals
-```
-
-### Finding Available Backups
-
-To see what backups are available for an app:
-```bash
-wild-app-restore discourse --list
-```
-
-This shows recent snapshots with their IDs, timestamps, and what was included.
-
-## How Application Restores Work
-
-Understanding what happens during a restore can help when things don't go as expected.
-
-### Database Restoration
-
-When restoring a database, the system first downloads the backup files from your restic repository. It then prepares the database by creating any needed roles, disconnecting existing users, and dropping/recreating the database to ensure a clean restore.
-
-For PostgreSQL databases, it uses `pg_restore` with parallel processing to speed up large database imports. For MySQL, it uses standard mysql import commands. The system also handles database ownership and permissions automatically.
-
-### File Storage Restoration
-
-File storage (PVC) restoration is more complex because it involves safely replacing files that might be actively used by running applications.
-
-First, the system creates a safety snapshot using Longhorn. This means if something goes wrong during the restore, you can get back to where you started. Then it scales your application down to zero replicas so no pods are using the storage.
-
-Next, it creates a temporary utility pod with the PVC mounted and copies all the backup files into place, preserving file permissions and structure. Once the data is restored and verified, it removes the utility pod and scales your application back up.
-
-If everything worked correctly, the safety snapshot is automatically deleted. If something went wrong, the safety snapshot is preserved so you can recover manually.
-
-## Cluster Disaster Recovery
-
-Cluster restoration is much less common but critical when you need to rebuild your entire infrastructure.
-
-### Restoring Kubernetes Resources
-
-To restore all cluster resources from a backup:
+**Step 1: Restore to standby**

 ```bash
-# Download cluster backup
-restic restore --tag cluster latest --target ./restore/
-
-# Apply all resources
-kubectl apply -f restore/cluster/all-resources.yaml
+wild restore gitea
 ```

-You can also restore specific types of resources:
-```bash
-kubectl apply -f restore/cluster/secrets.yaml
-kubectl apply -f restore/cluster/configmaps.yaml
-```
+This creates a standby namespace (e.g., `gitea-green`) with the restored database and files. Your active app continues running in its current namespace.

-### Restoring etcd State
+**Step 2: Verify the standby**

-**Warning: This is extremely dangerous and will affect your entire cluster.**
-
-etcd restoration should only be done when rebuilding a cluster from scratch. For Talos clusters:
+Check that the restored app is working:

 ```bash
-talosctl --nodes <control-plane-ip> etcd restore --from ./restore/cluster/etcd-snapshot.db
+# Check pods in the standby namespace
+kubectl get pods -n gitea-green
+
+# Check logs
+kubectl logs -n gitea-green deploy/gitea
+
+# View the recovery plan
+wild restore plan gitea
 ```

-This command stops etcd, replaces its data with the backup, and restarts the cluster. Expect significant downtime while the cluster rebuilds itself.
+**Step 3: Switch traffic**

-## Common Disaster Recovery Scenarios
+```bash
+wild restore switch gitea
+```
+
+This updates the active deployment color in config.yaml and redirects traffic to the standby namespace.
+
+**Step 4: Clean up**
+
+```bash
+wild restore cleanup gitea
+```
+
+This removes the previous active namespace and resources.
+
+### Web UI
+
+Navigate to **Backups > [app]**, select a backup, and click **Restore**. The UI tracks recovery plan progress through each phase.
+
+## Partial Restores
+
+Restore only specific components:
+
+```bash
+# Database only
+wild restore gitea --components postgres
+
+# Persistent volumes only
+wild restore gitea --components pvc
+
+# Config/manifests only (skip data)
+wild restore gitea --skip-data
+
+# Multiple specific components
+wild restore gitea --components postgres,pvc
+```
+
+## How Each Component Is Restored
+
+### PostgreSQL Databases
+
+The restore creates a standby database named `{dbName}_{standbyColor}` (e.g., `gitea_green`):
+
+1. Downloads the `.dump` file from the backup destination
+2. Creates the standby database and user
+3. Runs `pg_restore` with the dump file
+4. Deploys the app to the standby namespace with kustomize patches that rewrite database connection strings to point to the standby database
+
+### MySQL Databases
+
+Similar to PostgreSQL — creates a standby database, imports the gzip-compressed SQL dump, and patches connection strings.
+
+### Persistent Volumes (Longhorn)
+
+1. Triggers a Longhorn restore from the native backup, creating new volumes with standby naming
+2. Generates kustomize patches that bind standby PVCs to the restored volumes via `spec.volumeName`
+3. Cache/temp volumes (names containing `-cache` or `-tmp`) are skipped
+
+### Configuration
+
+Extracts the tar.gz archive containing manifests, kustomization, and app-specific config/secrets to the standby app directory.
+
+## Viewing Recovery Plans
+
+Each restore operation creates a recovery plan that tracks progress across all phases:
+
+```bash
+wild restore plan gitea
+```
+
+The plan shows:
+- Current status (restoring, restored, switching, switched, cleaning_up, cleaned_up, or failed)
+- Which strategies ran (postgres, longhorn-native, config)
+- Per-strategy status and details
+- Timestamps for each phase
+
+## Common Restore Scenarios
+
+### Rolling Back After a Bad Update
+
+```bash
+# List available backups
+wild backup list gitea
+
+# Restore from before the problematic update
+wild restore gitea 20250310T020000Z --auto
+```

 ### Complete Application Loss

-When an entire application is gone (namespace deleted, pods corrupted, etc.):
+If an app's namespace was deleted or corrupted:

 ```bash
-# Make sure the namespace exists
-kubectl create namespace discourse --dry-run=client -o yaml | kubectl apply -f -
+# Make sure the app is added to the instance
+wild app add gitea

-# Apply the application manifests if needed
-kubectl apply -f apps/discourse/
+# Deploy the app (creates namespace and base resources)
+wild app deploy gitea

-# Restore the application data
-wild-app-restore discourse
+# Restore data from backup
+wild restore gitea --auto
 ```

-### Complete Cluster Rebuild
+### Database-Only Recovery

-When rebuilding a cluster from scratch:
+If the app is running but the database is corrupted:

-First, build your new cluster infrastructure and install wild-cloud components. Then configure backup access so you can reach your backup repository.
-
-Restore cluster state:
 ```bash
-restic restore --tag cluster latest --target ./restore/
-# Apply etcd snapshot using appropriate method for your cluster type
+# Restore only the database to standby
+wild restore gitea --components postgres
+
+# Verify the restored database
+kubectl exec -n postgres deploy/postgres -- \
+  psql -U postgres -d gitea_green -c "SELECT count(*) FROM repository;"
+
+# Switch to the restored database
+wild restore switch gitea
+
+# Clean up
+wild restore cleanup gitea
 ```

-Finally, restore all applications:
-```bash
-# See what applications are backed up
-wild-app-restore --list
+### Cross-Cluster Migration

-# Restore each application individually
-wild-app-restore discourse
-wild-app-restore gitea
-wild-app-restore immich
+On the source cluster:
+```bash
+wild backup gitea
 ```

-### Rolling Back After Bad Changes
-
-Sometimes you need to undo recent changes to an application:
-
+On the target cluster:
 ```bash
-# See available snapshots
-wild-app-restore discourse --list
-
-# Restore from before the problematic changes
-wild-app-restore discourse abc123
-```
-
-## Cross-Cluster Migration
-
-You can use backups to move applications between clusters:
-
-On the source cluster, create a fresh backup:
-```bash
-wild-app-backup discourse
-```
-
-On the target cluster, deploy the application manifests:
-```bash
-kubectl apply -f apps/discourse/
-```
-
-Then restore the data:
-```bash
-wild-app-restore discourse
-```
-
-## Verifying Successful Restores
-
-After any restore, verify that everything is working correctly.
-
-For databases, check that you can connect and see expected data:
-```bash
-kubectl exec -n postgres deploy/postgres-deployment -- \
-  psql -U postgres -d discourse -c "SELECT count(*) FROM posts;"
-```
-
-For file storage, check that files exist and applications can start:
-```bash
-kubectl get pods -n discourse
-kubectl logs -n discourse deployment/discourse
-```
-
-For web applications, test that you can access them:
-```bash
-curl -f https://discourse.example.com/latest.json
+wild app add gitea
+wild app deploy gitea
+wild restore gitea --auto
 ```

 ## When Things Go Wrong

-### No Snapshots Found
+### Restore Fails Mid-Way

-If the restore system can't find backups for an application, check that snapshots exist:
-```bash
-restic snapshots --tag discourse
-```
+If the restore phase fails, your active app is untouched. The standby namespace may contain partial data. You can:
+- Fix the issue and retry: `wild restore gitea`
+- Check what went wrong: `wild restore plan gitea`
+- Clean up the failed standby manually: `kubectl delete namespace gitea-green`

-Make sure you're using the correct app name and that backups were actually created successfully.
+### Switch Fails

-### Database Restore Failures
+If the switch phase fails, the standby is fully populated and ready. You can:
+- Retry the switch: `wild restore switch gitea`
+- Inspect both namespaces and manually update config if needed

-Database restores can fail if the target database isn't accessible or if there are permission issues. Check that your postgres or mysql pods are running and that you can connect to them manually.
+### App Won't Start After Restore

-Review the restore error messages carefully - they usually indicate whether the problem is with the backup file, database connectivity, or permissions.
-
-### PVC Restore Failures
-
-If PVC restoration fails, check that you have sufficient disk space and that the PVC isn't being used by other pods. The error messages will usually indicate what went wrong.
-
-Most importantly, remember that safety snapshots are preserved when PVC restores fail. You can see them with:
-```bash
-kubectl get snapshot.longhorn.io -n longhorn-system -l app=wild-app-restore
-```
-
-These snapshots let you recover to the pre-restore state if needed.
-
-### Application Won't Start After Restore
-
-If pods fail to start after restoration, check file permissions and ownership. Sometimes the restoration process doesn't perfectly preserve the exact permissions that the application expects.
-
-You can also try scaling the application to zero and back to one, which sometimes resolves transient issues:
-```bash
-kubectl scale deployment/discourse -n discourse --replicas=0
-kubectl scale deployment/discourse -n discourse --replicas=1
-```
-
-## Manual Recovery
-
-When automated restore fails, you can always fall back to manual extraction and restoration:
+Check file permissions and ownership in the restored PVCs. Try scaling to zero and back:

 ```bash
-# Extract backup files to local directory
-restic restore --tag discourse latest --target ./manual-restore/
-
-# Manually copy database dump to postgres pod
-kubectl cp ./manual-restore/discourse/database_*.dump \
-  postgres/postgres-deployment-xxx:/tmp/
-
-# Manually restore database
-kubectl exec -n postgres deploy/postgres-deployment -- \
-  pg_restore -U postgres -d discourse /tmp/database_*.dump
+kubectl scale deployment/gitea -n gitea-green --replicas=0
+kubectl scale deployment/gitea -n gitea-green --replicas=1
 ```

-For file restoration, you'd need to create a utility pod and manually copy files into the PVC.
+### No Backups Found
+
+```bash
+# List all backups for the app
+wild backup list gitea
+
+# Check backup destination is configured
+wild config get backup.destination
+```
+
+## Verifying a Successful Restore
+
+After any restore, verify:
+
+```bash
+# Check pods are running
+kubectl get pods -n gitea
+
+# Check logs for errors
+kubectl logs -n gitea deploy/gitea
+
+# Test database connectivity
+kubectl exec -n postgres deploy/postgres -- \
+  psql -U postgres -d gitea -c "SELECT 1;"
+
+# Test web access
+curl -f https://gitea.example.com/
+```

 ## Best Practices

-Test your restore procedures regularly in a non-production environment. It's much better to discover issues with your backup system during a planned test than during an actual emergency.
+- **Test restores regularly** in a test environment. Backups are worthless if restore doesn't work.
+- **Use step-by-step restore** for production apps so you can verify before switching traffic.
+- **Monitor after restore** — watch the app more closely than usual for a few days.
+- **Communicate with users** before performing restores that involve downtime.

-Always communicate with users before performing restores, especially if they involve downtime. Document any manual steps you had to take so you can improve the automated process.
+## Next Steps

-After any significant restore, monitor your applications more closely than usual for a few days. Sometimes problems don't surface immediately.
-
-## Security and Access Control
-
-Restore operations are powerful and can be destructive. Make sure only trusted administrators can perform restores, and consider requiring approval or coordination before major restoration operations.
-
-Be aware that cluster restores include all secrets, so they potentially expose passwords, API keys, and certificates. Ensure your backup repository is properly secured.
-
-Remember that Longhorn safety snapshots are preserved when things go wrong. These snapshots may contain sensitive data, so clean them up appropriately once you've resolved any issues.
-
-## What's Next
-
-The best way to get comfortable with restore operations is to practice them in a safe environment. Set up a test cluster and practice restoring applications and data.
-
-Consider creating runbooks for your most likely disaster scenarios, including the specific commands and verification steps for your infrastructure.
-
-Read the [Making Backups](making-backups.md) guide to ensure you're creating the backups you'll need for successful recovery.
+- [Making Backups](making-backups.md) — Set up backup schedules and destinations
+- [Disaster Recovery](disaster-recovery.md) — Full cluster rebuild from backups
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Paul Payne	c8c60fb11b	Nav state.	2026-05-27 07:03:04 +00:00
Paul Payne	c1ddf46f44	Restore strategies.	2026-05-25 23:09:39 +00:00
Paul Payne	a533082388	Improve Directory pages.	2026-05-25 22:29:47 +00:00
Paul Payne	658c7ab24c	Re-org nav.	2026-05-25 22:01:55 +00:00
Paul Payne	288b448e48	Remove expensive asset-hashing operation.	2026-05-25 22:01:43 +00:00
Paul Payne	ce5ca426d6	Normalize talos and kubeconfig paths.	2026-05-25 22:01:20 +00:00
Paul Payne	fa59b5d8ad	Fix flaky test.	2026-05-25 21:59:24 +00:00
Paul Payne	60fc76e9a4	Clean up docs.	2026-05-25 21:58:39 +00:00
Paul Payne	d38ed94d12	Node addition improvements. Global and instance config merging. Gomplate IPC.	2026-05-25 20:55:07 +00:00
Paul Payne	e2144412ce	SSE node discovery. Node reset. Node apply fix.	2026-05-25 18:37:30 +00:00
Paul Payne	e93a14aa92	More informative error logs.	2026-05-25 18:35:05 +00:00
Paul Payne	374bcb3bd0	Node UI breakout.	2026-05-25 07:52:41 +00:00
Paul Payne	e82c92b72e	Node health monitoring.	2026-05-25 07:35:53 +00:00
Paul Payne	270fbeabef	Adds node reboot.	2026-05-25 07:26:29 +00:00
Paul Payne	fdab9484a6	feat: Add cluster config backup and move schedules to per-app backup pages Cluster config backup archives kubeconfig, talosconfig, config.yaml, secrets.yaml, and Talos node configs for disaster recovery. Appears as "Cluster Config" row on the backups page with its own detail page. Backup schedules are now shown on each app's individual backup page instead of the main backups overview, with active operations visible per-app for real-time feedback during backup/restore. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-05-24 21:54:46 +00:00
Paul Payne	322492a85f	fix: Resolve SSE test race condition by making client registration synchronous RegisterClient was async (channel-based), so Broadcast could be processed before the client was registered in the map, causing flaky test failures. Register directly under the mutex instead. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-05-24 21:54:13 +00:00
Paul Payne	3f97dce86a	docs: Update all guides to reflect current CLI, API, and web app Rewrote backup/restore guides to document current system (native pg_dump/Longhorn/tar.gz tools, blue-green restore, scheduling) and remove outdated restic references. Rewrote monitoring guide to replace K3s/Helm/Velero placeholders with actual capabilities. Filled in all four upgrade guides (Talos, Kubernetes, applications, Wild Cloud) that were previously TBD stubs. Expanded troubleshooting guides with correct namespaces, Wild Cloud CLI commands, and Talos-specific diagnostics. Added verification commands to cluster networking health checklist. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-05-24 21:54:11 +00:00
Paul Payne	11c875a513	fix: Resolve all golangci-lint errors across API codebase Handle unchecked errors (errcheck), fix nil-deref false positives (SA5011), suppress deprecated-but-functional API warnings (SA1019), remove unused code, and use fmt.Fprintf over WriteString(fmt.Sprintf(...)). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-05-24 21:52:59 +00:00
Paul Payne	e051e80601	fix: Resolve eslint errors across web UI Remove unused imports (Clock, Database) and dead code (formatUptime), replace `any` types with proper types (BackupResourceInfo, QueryClient, Record<string, unknown>), fix DeployedApp/App type incompatibility, and use const for module-level collections in SSE hook. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-05-24 21:32:06 +00:00
Paul Payne	fd58c7b694	Linting.	2026-05-24 21:24:40 +00:00
Paul Payne	3e9aa153e2	Go format.	2026-05-24 20:54:13 +00:00
Paul Payne	7cad37db07	More logging.	2026-05-24 20:40:02 +00:00
Paul Payne	eff5246144	Add more resiliency to backups and operations. Use Longhorn CRDs instead of a janky tunnel.	2026-05-24 20:35:51 +00:00
Paul Payne	81604879dc	slog integration	2026-05-24 20:29:22 +00:00
Paul Payne	44c7cb6f72	Bakup UX.	2026-05-24 20:03:27 +00:00