Compare commits

...

13 Commits

Author SHA1 Message Date
Paul Payne
e82c92b72e Node health monitoring. 2026-05-25 07:35:53 +00:00
Paul Payne
270fbeabef Adds node reboot. 2026-05-25 07:26:29 +00:00
Paul Payne
fdab9484a6 feat: Add cluster config backup and move schedules to per-app backup pages
Cluster config backup archives kubeconfig, talosconfig, config.yaml,
secrets.yaml, and Talos node configs for disaster recovery. Appears as
"Cluster Config" row on the backups page with its own detail page.

Backup schedules are now shown on each app's individual backup page
instead of the main backups overview, with active operations visible
per-app for real-time feedback during backup/restore.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-24 21:54:46 +00:00
Paul Payne
322492a85f fix: Resolve SSE test race condition by making client registration synchronous
RegisterClient was async (channel-based), so Broadcast could be processed
before the client was registered in the map, causing flaky test failures.
Register directly under the mutex instead.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-24 21:54:13 +00:00
Paul Payne
3f97dce86a docs: Update all guides to reflect current CLI, API, and web app
Rewrote backup/restore guides to document current system (native
pg_dump/Longhorn/tar.gz tools, blue-green restore, scheduling) and
remove outdated restic references. Rewrote monitoring guide to replace
K3s/Helm/Velero placeholders with actual capabilities. Filled in all
four upgrade guides (Talos, Kubernetes, applications, Wild Cloud) that
were previously TBD stubs. Expanded troubleshooting guides with correct
namespaces, Wild Cloud CLI commands, and Talos-specific diagnostics.
Added verification commands to cluster networking health checklist.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-24 21:54:11 +00:00
Paul Payne
11c875a513 fix: Resolve all golangci-lint errors across API codebase
Handle unchecked errors (errcheck), fix nil-deref false positives (SA5011),
suppress deprecated-but-functional API warnings (SA1019), remove unused code,
and use fmt.Fprintf over WriteString(fmt.Sprintf(...)).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-24 21:52:59 +00:00
Paul Payne
e051e80601 fix: Resolve eslint errors across web UI
Remove unused imports (Clock, Database) and dead code (formatUptime),
replace `any` types with proper types (BackupResourceInfo, QueryClient,
Record<string, unknown>), fix DeployedApp/App type incompatibility, and
use const for module-level collections in SSE hook.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-24 21:32:06 +00:00
Paul Payne
fd58c7b694 Linting. 2026-05-24 21:24:40 +00:00
Paul Payne
3e9aa153e2 Go format. 2026-05-24 20:54:13 +00:00
Paul Payne
7cad37db07 More logging. 2026-05-24 20:40:02 +00:00
Paul Payne
eff5246144 Add more resiliency to backups and operations. Use Longhorn CRDs instead of a janky tunnel. 2026-05-24 20:35:51 +00:00
Paul Payne
81604879dc slog integration 2026-05-24 20:29:22 +00:00
Paul Payne
44c7cb6f72 Backup UX. 2026-05-24 20:03:27 +00:00
116 changed files with 6695 additions and 2089 deletions

19
api/.air.toml Normal file
View File

@@ -0,0 +1,19 @@
# Configuration for the air live-reload tool.
root = "."
tmp_dir = "tmp"
[build]
# The daemon is compiled into the tmp directory and run from there.
bin = "./tmp/wildd"
cmd = "go build -o ./tmp/wildd ."
delay = 1000
exclude_dir = ["tmp", "build", "dist", "vendor"]
# Go test files are excluded.
exclude_regex = ["_test.go$"]
# File extensions included by the build watcher.
include_ext = ["go", "yaml"]
kill_delay = "0s"
send_interrupt = true
stop_on_error = true
[log]
time = false
[misc]
clean_on_exit = true

3
api/.gitignore vendored
View File

@@ -22,3 +22,6 @@ __debug*
# Go workspace file
go.work
go.work.sum
# Air live-reload
tmp/

View File

@@ -29,8 +29,14 @@ build: ## Build the daemon binary
$(GOBUILD) $(LDFLAGS) -o $(BUILD_DIR)/$(BINARY_NAME) .
dev: ## Run the daemon in development mode with live reloading
@echo "Starting $(BINARY_NAME) in development mode..."
$(GOCMD) run .
@if command -v air >/dev/null 2>&1; then \
echo "Starting $(BINARY_NAME) in development mode with live reloading (air)..."; \
air; \
else \
echo "air not found. Install it for live reloading: go install github.com/air-verse/air@latest"; \
echo "Starting $(BINARY_NAME) in development mode without live reloading..."; \
$(GOCMD) run .; \
fi
test: ## Run tests
@echo "Running tests..."

View File

@@ -4,7 +4,7 @@ The Wild Central API is a lightweight service that runs on a local machine (e.g.
## Development
Start the development server:
Start the development server with live reloading:
```bash
make dev
@@ -12,6 +12,14 @@ make dev
The API will be available at `http://localhost:5055`.
`make dev` uses [air](https://github.com/air-verse/air) to automatically rebuild and restart the server when `.go` or `.yaml` files change. Install it with:
```bash
go install github.com/air-verse/air@latest
```
If `air` is not installed, `make dev` falls back to `go run .` (no live reloading).
### Environment Variables
- `WILD_API_DATA_DIR` - Directory for instance data (default: `/var/lib/wild-central`)

View File

@@ -2,7 +2,7 @@ package v1
import (
"fmt"
"log"
"log/slog"
"net/http"
"github.com/wild-cloud/wild-central/daemon/internal/operations"
@@ -38,7 +38,7 @@ func (api *API) StartAsyncOperation(
// Always recover from panics to prevent goroutine crashes from taking down the server
defer func() {
if r := recover(); r != nil {
log.Printf("[ERROR] Panic in async operation %s/%s: %v", operationType, target, r)
slog.Error("panic in async operation", "type", operationType, "target", target, "error", r)
_ = opsMgr.Update(instanceName, opID, "failed", fmt.Sprintf("Internal error: %v", r), 0)
}
}()
@@ -71,7 +71,7 @@ func (api *API) StartAsyncOperationWithMessage(
go func() {
defer func() {
if r := recover(); r != nil {
log.Printf("[ERROR] Panic in async operation %s/%s: %v", operationType, target, r)
slog.Error("panic in async operation", "type", operationType, "target", target, "error", r)
_ = opsMgr.Update(instanceName, opID, "failed", fmt.Sprintf("Internal error: %v", r), 0)
}
}()
@@ -105,7 +105,7 @@ func (api *API) StartAsyncOperationWithBroadcaster(
go func() {
defer func() {
if r := recover(); r != nil {
log.Printf("[ERROR] Panic in async operation %s/%s: %v", operationType, target, r)
slog.Error("panic in async operation", "type", operationType, "target", target, "error", r)
_ = opsMgr.Update(instanceName, opID, "failed", fmt.Sprintf("Internal error: %v", r), 0)
}
}()

View File

@@ -3,7 +3,7 @@ package v1
import (
"encoding/json"
"fmt"
"log"
"log/slog"
"net/http"
"os"
"time"
@@ -25,18 +25,18 @@ import (
// API holds all dependencies for API handlers
type API struct {
dataDir string
appsDir string // Path to external apps directory
config *config.Manager
secrets *secrets.Manager
context *context.Manager
instance *instance.Manager
dnsmasq *dnsmasq.ConfigGenerator
opsMgr *operations.Manager // Operations manager
broadcaster *operations.Broadcaster // SSE broadcaster for operation output
sseManager *sse.Manager // SSE manager for real-time events
watcherManager *sse.WatcherManager // Manager for kubectl/talos watchers
factory *factory.Client // Talos Image Factory client
dataDir string
appsDir string // Path to external apps directory
config *config.Manager
secrets *secrets.Manager
context *context.Manager
instance *instance.Manager
dnsmasq *dnsmasq.ConfigGenerator
opsMgr *operations.Manager // Operations manager
broadcaster *operations.Broadcaster // SSE broadcaster for operation output
sseManager *sse.Manager // SSE manager for real-time events
watcherManager *sse.WatcherManager // Manager for kubectl/talos watchers
factory *factory.Client // Talos Image Factory client
}
// NewAPI creates a new API handler with all dependencies
@@ -59,7 +59,7 @@ func NewAPI(dataDir, appsDir string) (*API, error) {
dnsmasqConfigPath := "/etc/dnsmasq.d/wild-cloud.conf"
if os.Getenv("WILD_API_DNSMASQ_CONFIG_PATH") != "" {
dnsmasqConfigPath = os.Getenv("WILD_API_DNSMASQ_CONFIG_PATH")
log.Printf("Using custom dnsmasq config path: %s", dnsmasqConfigPath)
slog.Info("using custom dnsmasq config path", "path", dnsmasqConfigPath)
}
// Create SSE manager for real-time events
@@ -73,24 +73,23 @@ func NewAPI(dataDir, appsDir string) (*API, error) {
opsMgr.SetSSEManager(adapter)
api := &API{
dataDir: dataDir,
appsDir: appsDir,
config: configMgr,
secrets: secrets.NewManager(),
context: context.NewManager(dataDir),
instance: instance.NewManager(dataDir),
dnsmasq: dnsmasq.NewConfigGenerator(dnsmasqConfigPath),
opsMgr: opsMgr,
broadcaster: operations.NewBroadcaster(),
sseManager: sseManager,
watcherManager: watcherManager,
factory: factory.NewClient(),
dataDir: dataDir,
appsDir: appsDir,
config: configMgr,
secrets: secrets.NewManager(),
context: context.NewManager(dataDir),
instance: instance.NewManager(dataDir),
dnsmasq: dnsmasq.NewConfigGenerator(dnsmasqConfigPath),
opsMgr: opsMgr,
broadcaster: operations.NewBroadcaster(),
sseManager: sseManager,
watcherManager: watcherManager,
factory: factory.NewClient(),
}
return api, nil
}
// StartCentralStatusBroadcaster starts periodic broadcasting of central status
func (api *API) StartCentralStatusBroadcaster(startTime time.Time) {
go func() {
@@ -107,6 +106,8 @@ func (api *API) StartCentralStatusBroadcaster(startTime time.Time) {
}
func (api *API) RegisterRoutes(r *mux.Router) {
// Request logging middleware (runs first, wraps everything)
r.Use(RequestLoggingMiddleware)
// Apply instance validation middleware to all routes with {name} parameter
r.Use(api.ValidateInstanceMiddleware)
@@ -145,6 +146,8 @@ func (api *API) RegisterRoutes(r *mux.Router) {
r.HandleFunc("/api/v1/instances/{name}/nodes/{node}", api.NodeGet).Methods("GET")
r.HandleFunc("/api/v1/instances/{name}/nodes/{node}", api.NodeUpdate).Methods("PUT")
r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/apply", api.NodeApply).Methods("POST")
r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/health", api.NodeHealth).Methods("GET")
r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/reboot", api.NodeReboot).Methods("POST")
r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/reset", api.NodeReset).Methods("POST")
r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/upgrade", api.NodeUpgrade).Methods("POST")
r.HandleFunc("/api/v1/instances/{name}/nodes/{node}/rollback", api.NodeRollback).Methods("POST")
@@ -229,6 +232,11 @@ func (api *API) RegisterRoutes(r *mux.Router) {
r.HandleFunc("/api/v1/instances/{name}/apps/{app}/restore/cleanup", api.BackupAppCleanup).Methods("POST")
r.HandleFunc("/api/v1/instances/{name}/apps/{app}/restore/plan", api.BackupAppRecoveryPlan).Methods("GET")
// Backup & Restore - Cluster Config
r.HandleFunc("/api/v1/instances/{name}/backup/cluster", api.BackupClusterStart).Methods("POST")
r.HandleFunc("/api/v1/instances/{name}/backup/cluster", api.BackupClusterList).Methods("GET")
r.HandleFunc("/api/v1/instances/{name}/backup/cluster/{timestamp}", api.BackupClusterDelete).Methods("DELETE")
// Backup Schedules
r.HandleFunc("/api/v1/instances/{name}/backup/schedules", api.BackupScheduleList).Methods("GET")
r.HandleFunc("/api/v1/instances/{name}/backup/schedules", api.BackupScheduleCreate).Methods("POST")
@@ -236,8 +244,10 @@ func (api *API) RegisterRoutes(r *mux.Router) {
r.HandleFunc("/api/v1/instances/{name}/backup/schedules/{scheduleId}", api.BackupScheduleDelete).Methods("DELETE")
r.HandleFunc("/api/v1/instances/{name}/backup/schedules/{scheduleId}/run", api.BackupScheduleRun).Methods("POST")
// Backup Health
// Backup Health & Configuration
r.HandleFunc("/api/v1/instances/{name}/backup/health", api.BackupHealth).Methods("GET")
r.HandleFunc("/api/v1/instances/{name}/backup/config", api.BackupConfigGet).Methods("GET")
r.HandleFunc("/api/v1/instances/{name}/backup/config", api.BackupConfigUpdate).Methods("PUT")
// Global Configuration
r.HandleFunc("/api/v1/config", api.GetGlobalConfig).Methods("GET")
@@ -299,7 +309,7 @@ func (api *API) CreateInstance(w http.ResponseWriter, r *http.Request) {
}
if err := api.updateDnsmasqForAllInstances(); err != nil {
log.Printf("Warning: Could not update dnsmasq configuration: %v", err)
slog.Error("dnsmasq config update failed", "error", err)
response["warning"] = fmt.Sprintf("dnsmasq update failed: %v. Use POST /api/v1/dnsmasq/update to retry.", err)
}
@@ -387,7 +397,7 @@ func (api *API) GetConfig(w http.ResponseWriter, r *http.Request) {
// Return raw YAML
w.Header().Set("Content-Type", "application/yaml")
w.WriteHeader(http.StatusOK)
w.Write(configData)
_, _ = w.Write(configData)
return
}

View File

@@ -385,7 +385,7 @@ func (api *API) AppsGetReadme(w http.ResponseWriter, r *http.Request) {
content, err := os.ReadFile(instancePath)
if err == nil {
w.Header().Set("Content-Type", "text/markdown; charset=utf-8")
w.Write(content)
_, _ = w.Write(content)
return
}
@@ -402,7 +402,7 @@ func (api *API) AppsGetReadme(w http.ResponseWriter, r *http.Request) {
}
w.Header().Set("Content-Type", "text/markdown; charset=utf-8")
w.Write(content)
_, _ = w.Write(content)
}
// AppsGetManifest returns the manifest for an available app
@@ -440,7 +440,7 @@ func (api *API) AppsGetAvailableReadme(w http.ResponseWriter, r *http.Request) {
}
w.Header().Set("Content-Type", "text/markdown; charset=utf-8")
w.Write(content)
_, _ = w.Write(content)
}
// AppsCompile recompiles an app's templates
@@ -487,7 +487,7 @@ func (api *API) AppsGetManifests(w http.ResponseWriter, r *http.Request) {
var manifest apps.AppManifest
manifestPath := filepath.Join(appDir, "manifest.yaml")
if data, err := os.ReadFile(manifestPath); err == nil {
yaml.Unmarshal(data, &manifest)
_ = yaml.Unmarshal(data, &manifest)
}
// Build list of kustomize directories to render
@@ -523,7 +523,7 @@ func (api *API) AppsGetManifests(w http.ResponseWriter, r *http.Request) {
}
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
w.Write(allOutput)
_, _ = w.Write(allOutput)
}
// AppsRunScript runs a named script defined in the app's manifest

View File

@@ -4,6 +4,7 @@ import (
"bytes"
"encoding/json"
"fmt"
"log/slog"
"net/http"
"os"
"os/exec"
@@ -370,7 +371,7 @@ func (api *API) BackupAppDelete(w http.ResponseWriter, r *http.Request) {
"app": appName,
},
})
respondError(w, http.StatusInternalServerError, "Failed to delete backup")
respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to delete backup: %v", err))
return
}
@@ -454,12 +455,12 @@ func (api *API) BackupAppVerify(w http.ResponseWriter, r *http.Request) {
// BackupResourceInfo contains information about a discovered backup resource
type BackupResourceInfo struct {
Name string `json:"name"`
Type string `json:"type"` // "database", "pvc", "secret"
Plugin string `json:"plugin"` // "postgres", "mysql", "longhorn-pvc", etc.
Name string `json:"name"`
Type string `json:"type"` // "database", "pvc", "secret"
Plugin string `json:"plugin"` // "postgres", "mysql", "longhorn-pvc", etc.
Source map[string]any `json:"source"` // Resource-specific info
ShouldBackup bool `json:"shouldBackup"`
Reason string `json:"reason,omitempty"` // Why it's included/excluded
ShouldBackup bool `json:"shouldBackup"`
Reason string `json:"reason,omitempty"` // Why it's included/excluded
}
// BackupAppDiscoverResources auto-discovers backup resources for an app
@@ -590,9 +591,9 @@ func parsePVC(pvc map[string]any) BackupResourceInfo {
}
return BackupResourceInfo{
Name: name,
Type: "pvc",
Plugin: plugin,
Name: name,
Type: "pvc",
Plugin: plugin,
Source: map[string]any{
"pvcName": name,
"storageClass": storageClass,
@@ -635,9 +636,9 @@ func parseVolumeClaimTemplate(vct map[string]any, statefulSetName string) Backup
}
return BackupResourceInfo{
Name: pvcName,
Type: "pvc",
Plugin: detectStoragePlugin(storageClass),
Name: pvcName,
Type: "pvc",
Plugin: detectStoragePlugin(storageClass),
Source: map[string]any{
"pvcName": pvcName,
"storageClass": storageClass,
@@ -684,7 +685,7 @@ func discoverDatabases(dataDir, instanceName, appName, manifestPath string) []Ba
configPath := tools.GetInstanceConfigPath(dataDir, instanceName)
configData, _ := os.ReadFile(configPath)
var config map[string]any
yaml.Unmarshal(configData, &config)
_ = yaml.Unmarshal(configData, &config)
appConfig := map[string]any{}
if apps, ok := config["apps"].(map[string]any); ok {
@@ -998,13 +999,18 @@ func (api *API) BackupScheduleRun(w http.ResponseWriter, r *http.Request) {
return
}
if sched.TargetType != "app" {
respondError(w, http.StatusBadRequest, "Only app schedules can be triggered manually")
if sched.TargetType != "app" && sched.TargetType != "cluster" {
respondError(w, http.StatusBadRequest, "Unsupported schedule target type")
return
}
opTarget := sched.TargetName
if sched.TargetType == "cluster" {
opTarget = "_cluster"
}
// Run as async operation
api.StartAsyncOperation(w, instanceName, "backup", sched.TargetName,
api.StartAsyncOperation(w, instanceName, "backup", opTarget,
func(opsMgr *operations.Manager, opID string) error {
_ = opsMgr.UpdateProgress(instanceName, opID, 10, "Starting scheduled backup")
@@ -1013,7 +1019,13 @@ func (api *API) BackupScheduleRun(w http.ResponseWriter, r *http.Request) {
}
mgr := backup.NewManagerWithProgress(api.dataDir, progressCallback)
_, err := mgr.BackupApp(instanceName, sched.TargetName)
var err error
if sched.TargetType == "cluster" {
_, err = mgr.BackupClusterConfig(instanceName)
} else {
_, err = mgr.BackupApp(instanceName, sched.TargetName)
}
if err == nil {
// Update lastRun and nextRun
@@ -1021,26 +1033,28 @@ func (api *API) BackupScheduleRun(w http.ResponseWriter, r *http.Request) {
sched.LastRun = &now
next := backup.ComputeNextRun(sched, now)
sched.NextRun = &next
backup.SaveInstanceBackupSchedules(api.dataDir, instanceName, config.Schedules)
if err := backup.SaveInstanceBackupSchedules(api.dataDir, instanceName, config.Schedules); err != nil {
slog.Error("failed to save backup schedules", "instance", instanceName, "error", err)
}
api.sseManager.Broadcast(&sse.Event{
Type: "backup:schedule:completed",
InstanceName: instanceName,
Data: map[string]any{
"scheduleId": scheduleID,
"app": sched.TargetName,
"target": opTarget,
},
})
// Enforce retention using schedule's policy
// Enforce retention
keepLast, keepDays := backup.RetentionFromSchedule(sched, config.Retention)
deleted, retErr := backup.EnforceRetention(mgr, instanceName, sched.TargetName, keepLast, keepDays)
deleted, retErr := backup.EnforceRetention(mgr, instanceName, opTarget, keepLast, keepDays)
if retErr == nil && deleted > 0 {
api.sseManager.Broadcast(&sse.Event{
Type: "backup:retention:completed",
InstanceName: instanceName,
Data: map[string]any{
"app": sched.TargetName,
"target": opTarget,
"deleted": deleted,
},
})
@@ -1063,6 +1077,9 @@ func (api *API) BackupHealth(w http.ResponseWriter, r *http.Request) {
mgr := backup.NewManager(api.dataDir)
// Compute default retention limit
defaultKeepLast, _ := backup.DefaultRetention(config.Retention)
// Get all apps with backups by scanning the backup directory
backupDir := mgr.GetBackupDir(instanceName)
appHealth := make(map[string]any)
@@ -1079,8 +1096,38 @@ func (api *API) BackupHealth(w http.ResponseWriter, r *http.Request) {
continue
}
// Compute total size across all backups for this app
var totalSize int64
for _, p := range plans {
for _, s := range p.Strategies {
if s.Backup != nil {
if size, ok := s.Backup["size"]; ok {
switch v := size.(type) {
case int64:
totalSize += v
case int:
totalSize += int64(v)
case float64:
totalSize += int64(v)
}
}
}
}
}
// Determine retention limit for this app (schedule override or default)
keepLast := defaultKeepLast
for _, sched := range config.Schedules {
if sched.TargetName == appName && sched.Enabled && sched.Retention != nil && sched.Retention.KeepLast > 0 {
keepLast = sched.Retention.KeepLast
break
}
}
info := map[string]any{
"backupCount": len(plans),
"retainCount": keepLast,
"totalSize": totalSize,
"scheduled": false,
}
@@ -1088,6 +1135,9 @@ func (api *API) BackupHealth(w http.ResponseWriter, r *http.Request) {
newest := plans[0]
info["lastBackup"] = newest.Timestamp
info["lastStatus"] = newest.Status
if newest.Version != "" {
info["lastVersion"] = newest.Version
}
}
// Check if this app has an active schedule
@@ -1135,3 +1185,180 @@ func (api *API) BackupHealth(w http.ResponseWriter, r *http.Request) {
})
}
// BackupConfigGet handles GET requests for an instance's backup configuration.
//
// It loads the stored backup config for the instance named in the request and
// responds with its destination, retention, and verification settings. If the
// config cannot be loaded the handler responds with 500.
func (api *API) BackupConfigGet(w http.ResponseWriter, r *http.Request) {
	name := GetInstanceName(r)

	cfg, loadErr := backup.LoadInstanceBackupConfig(api.dataDir, name)
	if loadErr != nil {
		respondError(w, http.StatusInternalServerError, "Failed to load backup config")
		return
	}

	payload := map[string]any{
		"destination":  cfg.Destination,
		"retention":    cfg.Retention,
		"verification": cfg.Verification,
	}
	respondJSON(w, http.StatusOK, map[string]any{
		"success": true,
		"data":    payload,
	})
}
// BackupConfigUpdate handles PUT requests that change an instance's backup
// destination and/or retention settings.
//
// The JSON body may carry "destination", "retention", or both; a body with
// neither is rejected with 400. When a destination is supplied, its type must
// be one of "local", "nfs", "s3", or "azure". A failure to save is reported
// as 500 together with the underlying error text.
func (api *API) BackupConfigUpdate(w http.ResponseWriter, r *http.Request) {
	name := GetInstanceName(r)

	var body struct {
		Destination *backup.DestinationConfig `json:"destination"`
		Retention   *backup.RetentionPolicy   `json:"retention"`
	}
	if decodeErr := json.NewDecoder(r.Body).Decode(&body); decodeErr != nil {
		respondError(w, http.StatusBadRequest, "Invalid request body")
		return
	}

	dest, retention := body.Destination, body.Retention
	if dest == nil && retention == nil {
		respondError(w, http.StatusBadRequest, "Must provide destination or retention to update")
		return
	}

	// Only a supplied destination is validated; retention is passed through as-is.
	if dest != nil {
		switch dest.Type {
		case "local", "nfs", "s3", "azure":
			// Recognized destination type.
		default:
			respondError(w, http.StatusBadRequest, fmt.Sprintf("Invalid destination type: %s", dest.Type))
			return
		}
	}

	saveErr := backup.SaveInstanceBackupConfig(api.dataDir, name, dest, retention)
	if saveErr != nil {
		respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to save backup config: %v", saveErr))
		return
	}

	respondJSON(w, http.StatusOK, map[string]any{
		"success": true,
		"message": "Backup configuration updated",
	})
}
// BackupClusterStart starts an asynchronous cluster config backup operation.
//
// A "backup:started" event is broadcast first, then the backup is run through
// StartAsyncOperation under the "_cluster" target. When the backup finishes, a
// "backup:completed" or "backup:failed" event is broadcast. After a successful
// backup the instance's default retention policy is enforced; problems loading
// the backup config or enforcing retention are logged but do not fail the
// operation, because the backup itself has already succeeded.
func (api *API) BackupClusterStart(w http.ResponseWriter, r *http.Request) {
	// Target name under which cluster config backups are tracked.
	const clusterTarget = "_cluster"

	instanceName := GetInstanceName(r)

	api.sseManager.Broadcast(&sse.Event{
		Type:         "backup:started",
		InstanceName: instanceName,
		Data: map[string]any{
			"app": clusterTarget,
		},
	})

	api.StartAsyncOperation(w, instanceName, "backup", clusterTarget,
		func(opsMgr *operations.Manager, opID string) error {
			// Progress updates are best-effort; a failed update must not abort the backup.
			_ = opsMgr.UpdateProgress(instanceName, opID, 10, "Starting cluster config backup")

			progressCallback := func(progress int, message string) {
				_ = opsMgr.UpdateProgress(instanceName, opID, progress, message)
			}

			mgr := backup.NewManagerWithProgress(api.dataDir, progressCallback)
			if _, err := mgr.BackupClusterConfig(instanceName); err != nil {
				api.sseManager.Broadcast(&sse.Event{
					Type:         "backup:failed",
					InstanceName: instanceName,
					Data: map[string]any{
						"app":   clusterTarget,
						"error": err.Error(),
					},
				})
				return err
			}

			api.sseManager.Broadcast(&sse.Event{
				Type:         "backup:completed",
				InstanceName: instanceName,
				Data: map[string]any{
					"app": clusterTarget,
				},
			})

			// Enforce retention after successful backup. Failures here are
			// logged rather than returned so the completed backup is not
			// reported as failed.
			config, err := backup.LoadInstanceBackupConfig(api.dataDir, instanceName)
			if err != nil {
				slog.Error("failed to load backup config for retention",
					"instance", instanceName, "target", clusterTarget, "error", err)
				return nil
			}

			keepLast, keepDays := backup.DefaultRetention(config.Retention)
			deleted, err := backup.EnforceRetention(mgr, instanceName, clusterTarget, keepLast, keepDays)
			if err != nil {
				slog.Error("failed to enforce backup retention",
					"instance", instanceName, "target", clusterTarget, "error", err)
				return nil
			}

			if deleted > 0 {
				api.sseManager.Broadcast(&sse.Event{
					Type:         "backup:retention:completed",
					InstanceName: instanceName,
					Data: map[string]any{
						"target":  clusterTarget,
						"deleted": deleted,
					},
				})
			}
			return nil
		})
}
// BackupClusterList handles GET requests for the cluster config backups of an
// instance.
//
// Backups are looked up under the "_cluster" target and returned in the
// "backups" field of the response data. A listing failure yields a 500.
func (api *API) BackupClusterList(w http.ResponseWriter, r *http.Request) {
	name := GetInstanceName(r)

	clusterBackups, listErr := backup.NewManager(api.dataDir).ListBackups(name, "_cluster")
	if listErr != nil {
		respondError(w, http.StatusInternalServerError, "Failed to list cluster backups")
		return
	}

	payload := map[string]any{
		"backups": clusterBackups,
	}
	respondJSON(w, http.StatusOK, map[string]any{
		"success": true,
		"data":    payload,
	})
}
// BackupClusterDelete handles DELETE requests for a single cluster config
// backup, identified by the {timestamp} route variable.
//
// The backup is removed from the "_cluster" target. A "backup:deleted" event
// is broadcast on success; on failure a "backup:delete:failed" event carrying
// the error text is broadcast and the handler responds with 500.
func (api *API) BackupClusterDelete(w http.ResponseWriter, r *http.Request) {
	name := GetInstanceName(r)
	ts := mux.Vars(r)["timestamp"]

	deleteErr := backup.NewManager(api.dataDir).DeleteAppBackup(name, "_cluster", ts)
	if deleteErr != nil {
		failure := &sse.Event{
			Type:         "backup:delete:failed",
			InstanceName: name,
			Data: map[string]any{
				"app":       "_cluster",
				"timestamp": ts,
				"error":     deleteErr.Error(),
			},
		}
		api.sseManager.Broadcast(failure)
		respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to delete backup: %v", deleteErr))
		return
	}

	deletedEvent := &sse.Event{
		Type:         "backup:deleted",
		InstanceName: name,
		Data: map[string]any{
			"app":       "_cluster",
			"timestamp": ts,
		},
	}
	api.sseManager.Broadcast(deletedEvent)

	respondJSON(w, http.StatusOK, map[string]any{
		"success": true,
		"message": "Cluster config backup deleted successfully",
	})
}

View File

@@ -740,10 +740,10 @@ func TestIsDatabase(t *testing.T) {
// It verifies that we can find PVCs and StatefulSet volume claims in Kubernetes manifests
func TestDiscoverFromKustomize(t *testing.T) {
tests := []struct {
name string
kustomizeYAML string
expectedCount int
expectedFirst BackupResourceInfo
name string
kustomizeYAML string
expectedCount int
expectedFirst BackupResourceInfo
}{
{
name: "Discovers PVC as persistent state",
@@ -1115,4 +1115,4 @@ func TestBackupAppOperations(t *testing.T) {
assert.Equal(t, "Backup deleted successfully", response["message"])
}
})
}
}

View File

@@ -3,6 +3,7 @@ package v1
import (
"encoding/json"
"fmt"
"log/slog"
"net/http"
"github.com/wild-cloud/wild-central/daemon/internal/config"
@@ -43,6 +44,8 @@ func (api *API) ConfigUpdateBatch(w http.ResponseWriter, r *http.Request) {
updateCount++
}
slog.Info("config batch updated", "instance", instanceName, "keys", updateCount)
respondJSON(w, http.StatusOK, map[string]interface{}{
"message": "Configuration updated successfully",
"updated": updateCount,
@@ -87,6 +90,8 @@ func (api *API) UpdateGlobalConfig(w http.ResponseWriter, r *http.Request) {
return
}
slog.Info("global config updated")
respondJSON(w, http.StatusOK, map[string]interface{}{
"message": "Global configuration updated successfully",
"config": globalCfg,

View File

@@ -3,7 +3,7 @@ package v1
import (
"encoding/json"
"fmt"
"log"
"log/slog"
"net/http"
"os"
@@ -79,7 +79,7 @@ func (api *API) DnsmasqGenerate(w http.ResponseWriter, r *http.Request) {
instanceConfigPath := api.instance.GetInstanceConfigPath(name)
instanceCfg, err := config.LoadCloudConfig(instanceConfigPath)
if err != nil {
log.Printf("Warning: Could not load instance config for %s: %v", name, err)
slog.Error("failed to load instance config", "instance", name, "error", err)
continue
}
instanceConfigs = append(instanceConfigs, *instanceCfg)
@@ -95,7 +95,7 @@ func (api *API) DnsmasqGenerate(w http.ResponseWriter, r *http.Request) {
isFirstStart := err != nil || status.Status != "active"
// Update main dnsmasq configuration
log.Printf("Updating dnsmasq main configuration...")
slog.Info("updating dnsmasq main configuration")
// Write the main config
tempFile := api.dnsmasq.GetConfigPath() + ".tmp"
@@ -121,7 +121,7 @@ func (api *API) DnsmasqGenerate(w http.ResponseWriter, r *http.Request) {
// Write all instance configs
for i, name := range validInstanceNames {
if err := api.dnsmasq.WriteInstanceConfig(name, instanceConfigs[i]); err != nil {
log.Printf("Warning: Failed to write instance config for %s: %v", name, err)
slog.Error("failed to write instance DNS config", "instance", name, "error", err)
}
}
@@ -134,7 +134,7 @@ func (api *API) DnsmasqGenerate(w http.ResponseWriter, r *http.Request) {
// Configure system DNS to use local dnsmasq on first start
if isFirstStart {
if err := api.dnsmasq.ConfigureSystemDNS(); err != nil {
log.Printf("Warning: Failed to configure system DNS: %v", err)
slog.Error("failed to configure system DNS", "error", err)
// Don't fail the request - dnsmasq is still running
}
}
@@ -211,16 +211,14 @@ func (api *API) updateDnsmasqForAllInstances() error {
// Load all instance configs
var instanceConfigs []config.InstanceConfig
var validInstanceNames []string
for _, name := range instanceNames {
instanceConfigPath := api.instance.GetInstanceConfigPath(name)
instanceCfg, err := config.LoadCloudConfig(instanceConfigPath)
if err != nil {
log.Printf("Warning: Could not load instance config for %s: %v", name, err)
slog.Error("failed to load instance config", "instance", name, "error", err)
continue
}
instanceConfigs = append(instanceConfigs, *instanceCfg)
validInstanceNames = append(validInstanceNames, name)
}
// Regenerate and write dnsmasq config with restart

View File

@@ -42,7 +42,9 @@ func TestDnsmasqGenerate_WithoutOverwrite(t *testing.T) {
globalConfig.Cloud.Router.IP = "192.168.1.1"
configPath := filepath.Join(tmpDir, "config.yaml")
configData, _ := yaml.Marshal(globalConfig)
storage.WriteFile(configPath, configData, 0644)
if err := storage.WriteFile(configPath, configData, 0644); err != nil {
t.Fatal(err)
}
// Create test instance
instanceName := "test-instance"
@@ -54,7 +56,9 @@ func TestDnsmasqGenerate_WithoutOverwrite(t *testing.T) {
instanceConfig.Cloud.InternalDomain = "internal.test.local"
instanceConfigPath := api.instance.GetInstanceConfigPath(instanceName)
instanceConfigData, _ := yaml.Marshal(instanceConfig)
storage.WriteFile(instanceConfigPath, instanceConfigData, 0644)
if err := storage.WriteFile(instanceConfigPath, instanceConfigData, 0644); err != nil {
t.Fatal(err)
}
// Test generate without overwrite
req := httptest.NewRequest("POST", "/api/v1/dnsmasq/generate", nil)
@@ -67,7 +71,9 @@ func TestDnsmasqGenerate_WithoutOverwrite(t *testing.T) {
}
var resp map[string]interface{}
json.Unmarshal(w.Body.Bytes(), &resp)
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
t.Fatalf("failed to unmarshal response: %v", err)
}
// Verify response contains config
if config, ok := resp["config"].(string); !ok || config == "" {
@@ -90,7 +96,9 @@ func TestDnsmasqGenerate_WithOverwrite(t *testing.T) {
globalConfig.Cloud.Router.IP = "192.168.1.1"
configPath := filepath.Join(tmpDir, "config.yaml")
configData, _ := yaml.Marshal(globalConfig)
storage.WriteFile(configPath, configData, 0644)
if err := storage.WriteFile(configPath, configData, 0644); err != nil {
t.Fatal(err)
}
// Create test instance
instanceName := "test-instance"
@@ -103,7 +111,9 @@ func TestDnsmasqGenerate_WithOverwrite(t *testing.T) {
instanceConfig.Cluster.LoadBalancerIp = "192.168.1.80"
instanceConfigPath := api.instance.GetInstanceConfigPath(instanceName)
instanceConfigData, _ := yaml.Marshal(instanceConfig)
storage.WriteFile(instanceConfigPath, instanceConfigData, 0644)
if err := storage.WriteFile(instanceConfigPath, instanceConfigData, 0644); err != nil {
t.Fatal(err)
}
// Instead of calling the handler which would try to restart the service,
// directly test the UpdateConfig method with restart=false
@@ -201,8 +211,12 @@ func TestDnsmasqGetConfig(t *testing.T) {
// Write a config first
configPath := api.dnsmasq.GetConfigPath()
testConfig := "# Test config\ninterface=eth0\n"
os.MkdirAll(filepath.Dir(configPath), 0755)
os.WriteFile(configPath, []byte(testConfig), 0644)
if err := os.MkdirAll(filepath.Dir(configPath), 0755); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(configPath, []byte(testConfig), 0644); err != nil {
t.Fatal(err)
}
req := httptest.NewRequest("GET", "/api/v1/dnsmasq/config", nil)
w := httptest.NewRecorder()
@@ -214,7 +228,9 @@ func TestDnsmasqGetConfig(t *testing.T) {
}
var resp map[string]interface{}
json.Unmarshal(w.Body.Bytes(), &resp)
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
t.Fatalf("failed to unmarshal response: %v", err)
}
content, ok := resp["content"].(string)
if !ok || content != testConfig {

View File

@@ -309,6 +309,38 @@ func (api *API) NodeDiscoveryCancel(w http.ResponseWriter, r *http.Request) {
})
}
// NodeReboot handles POST requests that ask a node to reboot.
//
// The node is identified by the {node} route variable. On success the handler
// responds with a confirmation message and the node identifier; if the node
// manager reports an error the handler responds with 500 and the error text.
func (api *API) NodeReboot(w http.ResponseWriter, r *http.Request) {
	instance := GetInstanceName(r)
	target := GetNodeName(r)

	rebootErr := node.NewManager(api.dataDir, instance).Reboot(instance, target)
	if rebootErr != nil {
		respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to reboot node: %v", rebootErr))
		return
	}

	result := map[string]string{
		"message": "Node reboot initiated",
		"node":    target,
	}
	respondJSON(w, http.StatusOK, result)
}
// NodeHealth handles a request for a node's health (Talos service statuses and
// dmesg, per the node manager). The health result is returned as JSON; any
// failure from the manager is reported as a 500 with the cause.
func (api *API) NodeHealth(w http.ResponseWriter, r *http.Request) {
	instance, target := GetInstanceName(r), GetNodeName(r)

	report, err := node.NewManager(api.dataDir, instance).Health(instance, target)
	if err != nil {
		respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to check node health: %v", err))
		return
	}

	respondJSON(w, http.StatusOK, report)
}
// NodeReset resets a node to maintenance mode
func (api *API) NodeReset(w http.ResponseWriter, r *http.Request) {
instanceName := GetInstanceName(r)

View File

@@ -3,7 +3,7 @@ package v1
import (
"encoding/json"
"fmt"
"log"
"log/slog"
"net/http"
"github.com/gorilla/mux"
@@ -18,7 +18,7 @@ func (api *API) PXEListAssets(w http.ResponseWriter, r *http.Request) {
instanceName := GetInstanceName(r)
w.Header().Set("X-Deprecated", "This endpoint is deprecated. Use GET /api/v1/assets/{schematicId} instead.")
log.Printf("Warning: Deprecated endpoint /api/v1/instances/%s/pxe/assets called", instanceName)
slog.Info("deprecated endpoint called", "endpoint", "pxe/assets", "instance", instanceName)
// Get schematic ID from instance config
configPath := api.instance.GetInstanceConfigPath(instanceName)
@@ -49,7 +49,7 @@ func (api *API) PXEDownloadAsset(w http.ResponseWriter, r *http.Request) {
instanceName := GetInstanceName(r)
w.Header().Set("X-Deprecated", "This endpoint is deprecated. Use POST /api/v1/assets/{schematicId}/download instead.")
log.Printf("Warning: Deprecated endpoint /api/v1/instances/%s/pxe/assets/download called", instanceName)
slog.Info("deprecated endpoint called", "endpoint", "pxe/assets/download", "instance", instanceName)
// Parse request
var req struct {
@@ -123,7 +123,7 @@ func (api *API) PXEGetAsset(w http.ResponseWriter, r *http.Request) {
assetType := mux.Vars(r)["type"]
w.Header().Set("X-Deprecated", "This endpoint is deprecated. Use GET /api/v1/assets/{schematicId}/pxe/{assetType} instead.")
log.Printf("Warning: Deprecated endpoint /api/v1/instances/%s/pxe/assets/%s called", instanceName, assetType)
slog.Info("deprecated endpoint called", "endpoint", "pxe/assets/get", "instance", instanceName, "assetType", assetType)
// Get schematic ID from instance config
configPath := api.instance.GetInstanceConfigPath(instanceName)
@@ -162,7 +162,7 @@ func (api *API) PXEDeleteAsset(w http.ResponseWriter, r *http.Request) {
assetType := mux.Vars(r)["type"]
w.Header().Set("X-Deprecated", "This endpoint is deprecated. Use DELETE /api/v1/assets/{schematicId} instead.")
log.Printf("Warning: Deprecated endpoint DELETE /api/v1/instances/%s/pxe/assets/%s called", instanceName, assetType)
slog.Info("deprecated endpoint called", "endpoint", "pxe/assets/delete", "instance", instanceName, "assetType", assetType)
// Get schematic ID from instance config
configPath := api.instance.GetInstanceConfigPath(instanceName)

View File

@@ -3,7 +3,7 @@ package v1
import (
"encoding/json"
"fmt"
"log"
"log/slog"
"net/http"
"strings"
"time"
@@ -52,13 +52,12 @@ func (api *API) InstanceEventStream(w http.ResponseWriter, r *http.Request) {
if err != nil {
// Default to empty string if not found - talos events will be skipped
nodeIP = ""
log.Printf("Control plane VIP not found for instance %s, Talos events will be disabled", instanceName)
slog.Info("control plane VIP not found, Talos events disabled", "instance", instanceName)
}
// Start watchers for this instance if not already running
if err := api.watcherManager.StartWatchers(instanceName, kubeconfigPath, talosconfigPath, nodeIP); err != nil {
log.Printf("Failed to start watchers for instance %s: %v", instanceName, err)
// Continue anyway - client might still receive events from other sources
slog.Error("failed to start watchers", "instance", instanceName, "error", err)
}
// 7. Send initial connected event
@@ -71,7 +70,7 @@ func (api *API) InstanceEventStream(w http.ResponseWriter, r *http.Request) {
},
}
if err := sendSSEEvent(w, connectedEvent); err != nil {
log.Printf("Failed to send connected event: %v", err)
slog.Error("failed to send SSE connected event", "error", err)
return
}
@@ -98,7 +97,7 @@ func (api *API) InstanceEventStream(w http.ResponseWriter, r *http.Request) {
case event := <-client.Channel:
// Send event to client
if err := sendSSEEvent(w, event); err != nil {
log.Printf("Failed to send event: %v", err)
slog.Error("failed to send SSE event", "error", err)
return
}
@@ -117,7 +116,7 @@ func (api *API) InstanceEventStream(w http.ResponseWriter, r *http.Request) {
},
}
if err := sendSSEEvent(w, heartbeatEvent); err != nil {
log.Printf("Failed to send heartbeat: %v", err)
slog.Error("failed to send SSE heartbeat", "error", err)
return
}
@@ -190,7 +189,7 @@ func (api *API) GlobalEventStream(w http.ResponseWriter, r *http.Request) {
},
}
if err := sendSSEEvent(w, connectedEvent); err != nil {
log.Printf("Failed to send connected event: %v", err)
slog.Error("failed to send SSE connected event", "error", err)
return
}
@@ -217,7 +216,7 @@ func (api *API) GlobalEventStream(w http.ResponseWriter, r *http.Request) {
case event := <-client.Channel:
// Send event to client
if err := sendSSEEvent(w, event); err != nil {
log.Printf("Failed to send event: %v", err)
slog.Error("failed to send SSE event", "error", err)
return
}
@@ -236,7 +235,7 @@ func (api *API) GlobalEventStream(w http.ResponseWriter, r *http.Request) {
},
}
if err := sendSSEEvent(w, heartbeatEvent); err != nil {
log.Printf("Failed to send heartbeat: %v", err)
slog.Error("failed to send SSE heartbeat", "error", err)
return
}
@@ -263,4 +262,4 @@ func parseQueryList(param string) []string {
}
}
return result
}
}

View File

@@ -63,11 +63,11 @@ func (api *API) TerminalWebSocket(w http.ResponseWriter, r *http.Request) {
ptmx, err := pty.Start(cmd)
if err != nil {
conn.WriteMessage(websocket.TextMessage, []byte("Failed to start shell: "+err.Error()))
_ = conn.WriteMessage(websocket.TextMessage, []byte("Failed to start shell: "+err.Error()))
return
}
defer ptmx.Close()
defer cmd.Process.Kill()
defer func() { _ = cmd.Process.Kill() }()
// Channel to signal when to stop
done := make(chan struct{})
@@ -103,7 +103,7 @@ func (api *API) TerminalWebSocket(w http.ResponseWriter, r *http.Request) {
var resize terminalResize
if err := json.Unmarshal(msg, &resize); err == nil && resize.Type == "resize" {
if resize.Cols > 0 && resize.Rows > 0 {
pty.Setsize(ptmx, &pty.Winsize{
_ = pty.Setsize(ptmx, &pty.Winsize{
Cols: uint16(resize.Cols),
Rows: uint16(resize.Rows),
})

View File

@@ -3,7 +3,7 @@ package v1
import (
"fmt"
"io"
"log"
"log/slog"
"net/http"
"os"
"strings"
@@ -15,28 +15,6 @@ import (
"gopkg.in/yaml.v3"
)
// getNestedValue looks up a value inside nested maps using a dot-separated path.
// Every segment except the last must resolve to a map[string]interface{}; if any
// of them does not, nil is returned. The last segment is read from the innermost
// map, so a missing leaf key also yields nil.
// Example: getNestedValue(data, "cluster.nodes.active") returns
// data["cluster"]["nodes"]["active"].
func getNestedValue(data map[string]interface{}, path string) interface{} {
	segments := strings.Split(path, ".")
	lastIdx := len(segments) - 1

	node := data
	// Walk down through every intermediate segment.
	for _, seg := range segments[:lastIdx] {
		child, isMap := node[seg].(map[string]interface{})
		if !isMap {
			return nil
		}
		node = child
	}

	return node[segments[lastIdx]]
}
// updateYAMLFile updates a YAML file with the provided key-value pairs.
// It performs a shallow merge at the top level, preserving unmodified keys.
func (api *API) updateYAMLFile(w http.ResponseWriter, r *http.Request, instanceName, fileType string) {
@@ -119,26 +97,26 @@ func (api *API) updateYAMLFile(w http.ResponseWriter, r *http.Request, instanceN
return
}
slog.Info(fileType+" updated", "instance", instanceName)
// Update DNS if domains changed
if domainsChanged && fileType == "config" {
go func() {
log.Printf("Domain change detected for instance %s, updating DNS configuration...", instanceName)
slog.Info("domain change detected, updating DNS", "instance", instanceName)
// Load the full instance config
instanceConfigPath := api.instance.GetInstanceConfigPath(instanceName)
instanceCfg, err := config.LoadCloudConfig(instanceConfigPath)
if err != nil {
log.Printf("Failed to load instance config for DNS update: %v", err)
slog.Error("failed to load instance config for DNS update", "instance", instanceName, "error", err)
return
}
// Update the DNS configuration for this instance
if err := api.dnsmasq.UpdateInstanceDNS(instanceName, *instanceCfg); err != nil {
log.Printf("Failed to update DNS for instance %s: %v", instanceName, err)
slog.Error("failed to update DNS", "instance", instanceName, "error", err)
return
}
log.Printf("Successfully updated DNS configuration for instance %s", instanceName)
slog.Info("DNS configuration updated", "instance", instanceName)
}()
}

View File

@@ -2,11 +2,68 @@ package v1
import (
"context"
"log/slog"
"net/http"
"strings"
"time"
"github.com/gorilla/mux"
)
// statusResponseWriter wraps http.ResponseWriter to capture the status code
// that is actually sent to the client.
//
// net/http honours only the first final (non-1xx) WriteHeader call, and a Write
// issued before any WriteHeader implicitly commits 200 OK. Later WriteHeader
// calls are therefore forwarded but never overwrite the recorded status.
type statusResponseWriter struct {
	http.ResponseWriter
	status      int  // status to report; constructors seed it with http.StatusOK
	wroteHeader bool // true once the final status has been committed
}

// WriteHeader records code if it is the first final status written, then
// forwards the call to the wrapped writer unchanged.
func (w *statusResponseWriter) WriteHeader(code int) {
	// Informational 1xx responses (other than 101 Switching Protocols) may be
	// sent before the final status, so they must not lock in the recorded code.
	informational := code >= 100 && code <= 199 && code != http.StatusSwitchingProtocols
	if !w.wroteHeader && !informational {
		w.status = code
		w.wroteHeader = true
	}
	w.ResponseWriter.WriteHeader(code)
}

// Write forwards b to the wrapped writer. If no final status has been written
// yet, the write implicitly commits 200 OK, which is recorded here.
func (w *statusResponseWriter) Write(b []byte) (int, error) {
	if !w.wroteHeader {
		w.status = http.StatusOK
		w.wroteHeader = true
	}
	return w.ResponseWriter.Write(b)
}

// Unwrap returns the wrapped writer so http.ResponseController can reach
// optional capabilities (flush, hijack, deadlines) of the underlying writer.
func (w *statusResponseWriter) Unwrap() http.ResponseWriter {
	return w.ResponseWriter
}
// RequestLoggingMiddleware emits one structured log record per request with its
// status, method, path, and duration, plus the "name", "app", and "node" route
// variables when they are set. Responses with status >= 400 are logged at error
// level, everything else at info level.
// Paths ending in /events, /ws, or /stream (long-lived SSE/WebSocket
// connections) bypass logging and are served with the original writer.
func RequestLoggingMiddleware(next http.Handler) http.Handler {
	logged := func(w http.ResponseWriter, r *http.Request) {
		urlPath := r.URL.Path

		// Long-lived connections are passed straight through.
		for _, suffix := range []string{"/events", "/ws", "/stream"} {
			if strings.HasSuffix(urlPath, suffix) {
				next.ServeHTTP(w, r)
				return
			}
		}

		began := time.Now()
		rec := &statusResponseWriter{ResponseWriter: w, status: http.StatusOK}
		next.ServeHTTP(rec, r)

		fields := []any{
			"status", rec.status,
			"method", r.Method,
			"path", urlPath,
			"duration", time.Since(began),
		}

		// Attach whichever route parameters are present, in a fixed order.
		routeVars := mux.Vars(r)
		for _, p := range []struct{ routeVar, logKey string }{
			{"name", "instance"},
			{"app", "app"},
			{"node", "node"},
		} {
			if value := routeVars[p.routeVar]; value != "" {
				fields = append(fields, p.logKey, value)
			}
		}

		emit := slog.Info
		if rec.status >= 400 {
			emit = slog.Error
		}
		emit("request", fields...)
	}
	return http.HandlerFunc(logged)
}
// contextKey is a distinct named string type used for context keys, so that
// keys defined in this package cannot collide with keys of other types
// (including plain strings) stored in the same context by other code.
type contextKey string

View File

@@ -75,15 +75,15 @@ type RestoreRequest struct {
// ScheduleCreateRequest is the request body for creating a backup schedule.
type ScheduleCreateRequest struct {
Name string `json:"name"`
TargetType string `json:"target_type"` // "app" or "cluster"
TargetName string `json:"target_name"`
Frequency string `json:"frequency"` // "daily", "weekly", "monthly"
Time string `json:"time"` // "HH:MM"
DayOfWeek int `json:"day_of_week,omitempty"`
DayOfMonth int `json:"day_of_month,omitempty"`
Retention *ScheduleRetentionReq `json:"retention,omitempty"`
Enabled bool `json:"enabled"`
Name string `json:"name"`
TargetType string `json:"target_type"` // "app" or "cluster"
TargetName string `json:"target_name"`
Frequency string `json:"frequency"` // "daily", "weekly", "monthly"
Time string `json:"time"` // "HH:MM"
DayOfWeek int `json:"day_of_week,omitempty"`
DayOfMonth int `json:"day_of_month,omitempty"`
Retention *ScheduleRetentionReq `json:"retention,omitempty"`
Enabled bool `json:"enabled"`
}
// ScheduleRetentionReq is the retention override in a schedule request.
@@ -94,13 +94,13 @@ type ScheduleRetentionReq struct {
// ScheduleUpdateRequest is the request body for updating a backup schedule.
type ScheduleUpdateRequest struct {
Name *string `json:"name,omitempty"`
Frequency *string `json:"frequency,omitempty"`
Time *string `json:"time,omitempty"`
DayOfWeek *int `json:"day_of_week,omitempty"`
DayOfMonth *int `json:"day_of_month,omitempty"`
Retention *ScheduleRetentionReq `json:"retention,omitempty"`
Enabled *bool `json:"enabled,omitempty"`
Name *string `json:"name,omitempty"`
Frequency *string `json:"frequency,omitempty"`
Time *string `json:"time,omitempty"`
DayOfWeek *int `json:"day_of_week,omitempty"`
DayOfMonth *int `json:"day_of_month,omitempty"`
Retention *ScheduleRetentionReq `json:"retention,omitempty"`
Enabled *bool `json:"enabled,omitempty"`
}
// NodeUpgradeRequest is the request body for upgrading a node's Talos version.

View File

@@ -4,6 +4,7 @@ import (
"bytes"
"encoding/json"
"fmt"
"log/slog"
"os"
"os/exec"
"path/filepath"
@@ -471,7 +472,6 @@ func fetchIngressURLs(kubeconfigPath string) map[string]string {
return result
}
// processSecretTemplate processes a gomplate template for secret defaults
// This function uses named contexts for config and secrets (e.g., {{ .config.apps.loomio.db.user }}, {{ .secrets.apps.loomio.dbPassword }})
func processSecretTemplate(template string, appName string, configFile, secretsFile string, gomplate *tools.Gomplate) (string, error) {
@@ -608,6 +608,8 @@ func setNestedConfig(yq *tools.YQ, configFile, basePath string, value interface{
// Add adds an app to the instance configuration
func (m *Manager) Add(instanceName, appName, version string, config map[string]interface{}, requiredAppMappings map[string]string) error {
slog.Info("adding app", "component", "apps", "instance", instanceName, "app", appName, "version", version)
// 1. Verify app exists, optionally at a specific version
sourceAppDir, meta, err := m.resolveAppDir(appName, version)
if err != nil {
@@ -782,11 +784,14 @@ func (m *Manager) Add(instanceName, appName, version string, config map[string]i
return fmt.Errorf("failed to compile app templates: %w", err)
}
slog.Info("app added", "component", "apps", "instance", instanceName, "app", appName)
return nil
}
// Deploy deploys an app to the cluster
func (m *Manager) Deploy(instanceName, appName string, opID string, broadcaster *operations.Broadcaster) error {
slog.Info("deploying app", "component", "apps", "instance", instanceName, "app", appName)
kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
instancePath := tools.GetInstancePath(m.dataDir, instanceName)
secretsFile := tools.GetInstanceSecretsPath(m.dataDir, instanceName)
@@ -812,7 +817,9 @@ func (m *Manager) Deploy(instanceName, appName string, opID string, broadcaster
if storage.FileExists(manifestPath) {
manifestData, err := os.ReadFile(manifestPath)
if err == nil {
yaml.Unmarshal(manifestData, &manifest)
if err := yaml.Unmarshal(manifestData, &manifest); err != nil {
slog.Error("failed to parse manifest", "component", "apps", "path", manifestPath, "error", err)
}
}
}
@@ -884,7 +891,7 @@ func (m *Manager) Deploy(instanceName, appName string, opID string, broadcaster
for _, secretName := range wildcardSecrets {
if bytes.Contains(ingressContent, []byte(secretName)) {
if err := utilities.CopySecretBetweenNamespaces(kubeconfigPath, secretName, "cert-manager", namespace); err != nil {
fmt.Printf("Warning: Failed to copy TLS secret %s: %v\n", secretName, err)
slog.Error("failed to copy TLS secret", "component", "apps", "secret", secretName, "error", err)
}
}
}
@@ -1012,6 +1019,7 @@ func (m *Manager) Deploy(instanceName, appName string, opID string, broadcaster
}
}
slog.Info("app deployed", "component", "apps", "instance", instanceName, "app", appName, "namespace", namespace)
return nil
}
@@ -1035,6 +1043,8 @@ func (m *Manager) waitForRollout(kubeconfigPath, namespace string, wait *Rollout
// Restart performs a rolling restart of all deployments and statefulsets in an app's namespace
func (m *Manager) Restart(instanceName, appName string) error {
slog.Info("restarting app", "component", "apps", "instance", instanceName, "app", appName)
kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
namespace := m.ResolveNamespace(instanceName, appName)
@@ -1083,6 +1093,8 @@ func (m *Manager) namespaceSharedByOtherApp(instanceName, appName, namespace str
// Delete removes an app from the cluster and configuration
func (m *Manager) Delete(instanceName, appName string) error {
slog.Info("deleting app", "component", "apps", "instance", instanceName, "app", appName)
kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
instancePath := tools.GetInstancePath(m.dataDir, instanceName)
configFile := tools.GetInstanceConfigPath(m.dataDir, instanceName)
@@ -1146,6 +1158,7 @@ func (m *Manager) Delete(instanceName, appName string) error {
}
}
slog.Info("app deleted", "component", "apps", "instance", instanceName, "app", appName)
return nil
}
@@ -1174,8 +1187,12 @@ func (m *Manager) GetStatus(instanceName, appName string) (*DeployedApp, error)
manifestPath := filepath.Join(appDir, "manifest.yaml")
var manifest AppManifest
if storage.FileExists(manifestPath) {
manifestData, _ := os.ReadFile(manifestPath)
yaml.Unmarshal(manifestData, &manifest)
manifestData, err := os.ReadFile(manifestPath)
if err == nil {
if err := yaml.Unmarshal(manifestData, &manifest); err != nil {
slog.Error("failed to parse manifest", "component", "apps", "path", manifestPath, "error", err)
}
}
app.Version = manifest.Version
}
@@ -1651,7 +1668,7 @@ func (m *Manager) updateFromSource(instanceName, appName, sourceDir, preserveSou
return fmt.Errorf("failed to backup old package: %w", err)
}
if err := os.Rename(tempDir, packageDir); err != nil {
os.Rename(oldPackageDir, packageDir)
_ = os.Rename(oldPackageDir, packageDir)
return fmt.Errorf("failed to update package: %w", err)
}
@@ -1660,7 +1677,7 @@ func (m *Manager) updateFromSource(instanceName, appName, sourceDir, preserveSou
rollback := func() {
os.RemoveAll(packageDir)
os.Rename(oldPackageDir, packageDir)
_ = os.Rename(oldPackageDir, packageDir)
}
// Read the new manifest
@@ -1917,7 +1934,7 @@ func (m *Manager) runMigrationJobs(instanceName, appName string, jobPaths []stri
// Clean up the job
cmd = exec.Command("kubectl", "delete", "-f", jobFile, "-n", namespace, "--ignore-not-found")
tools.WithKubeconfig(cmd, kubeconfigPath)
cmd.CombinedOutput() // Best effort cleanup
_, _ = cmd.CombinedOutput() // Best effort cleanup
}
return nil
@@ -1925,6 +1942,8 @@ func (m *Manager) runMigrationJobs(instanceName, appName string, jobPaths []stri
// Eject converts an app from package-managed to custom
func (m *Manager) Eject(instanceName, appName string) error {
slog.Info("ejecting app to custom management", "component", "apps", "instance", instanceName, "app", appName)
instancePath := tools.GetInstancePath(m.dataDir, instanceName)
appDestDir := filepath.Join(instancePath, "apps", appName)
packageDir := filepath.Join(appDestDir, ".package")
@@ -2120,6 +2139,7 @@ func (m *Manager) Compile(instanceName, appName string) error {
return fmt.Errorf("app %s has no package source (custom or not installed)", appName)
}
slog.Info("compiling app templates", "component", "apps", "instance", instanceName, "app", appName)
return m.compileFromPackage(appName, appDestDir, packageDir, configFile, secretsFile)
}
@@ -2214,7 +2234,9 @@ func (m *Manager) Fetch(instanceName, appName string) error {
manifestYAML, err := yaml.Marshal(manifest)
if err == nil {
storage.WriteFile(manifestPath, manifestYAML, 0644)
if err := storage.WriteFile(manifestPath, manifestYAML, 0644); err != nil {
slog.Error("failed to write manifest", "component", "apps", "path", manifestPath, "error", err)
}
}
}
}

View File

@@ -1342,9 +1342,14 @@ source: /apps/ejectapp
}
// Verify source was removed from manifest
manifestData, _ := os.ReadFile(manifestPath)
manifestData, err := os.ReadFile(manifestPath)
if err != nil {
t.Fatalf("failed to read manifest: %v", err)
}
var manifest AppManifest
yaml.Unmarshal(manifestData, &manifest)
if err := yaml.Unmarshal(manifestData, &manifest); err != nil {
t.Fatalf("failed to parse manifest: %v", err)
}
if manifest.Source != "" {
t.Errorf("Source should be removed from manifest after eject, got: %s", manifest.Source)
}
@@ -1563,10 +1568,10 @@ func TestCopyDir(t *testing.T) {
// Create files at various levels
files := map[string]string{
filepath.Join(srcDir, "top-level.yaml"): "top: level",
filepath.Join(installDir, "install.yaml"): "install: data",
filepath.Join(installDir, "nested", "deep.yaml"): "deep: data",
filepath.Join(configDir, "config.yaml"): "config: data",
filepath.Join(srcDir, "top-level.yaml"): "top: level",
filepath.Join(installDir, "install.yaml"): "install: data",
filepath.Join(installDir, "nested", "deep.yaml"): "deep: data",
filepath.Join(configDir, "config.yaml"): "config: data",
}
for path, content := range files {
if err := os.WriteFile(path, []byte(content), 0644); err != nil {
@@ -1748,10 +1753,10 @@ deploy:
func TestResolveDeploymentResource(t *testing.T) {
tests := []struct {
name string
manifest AppManifest
wantName string
wantKind string
name string
manifest AppManifest
wantName string
wantKind string
}{
{
name: "no deployment info",
@@ -1845,7 +1850,9 @@ func TestIsConfigOnly(t *testing.T) {
t.Run(tt.name, func(t *testing.T) {
appDir := t.TempDir()
for _, f := range tt.files {
os.WriteFile(filepath.Join(appDir, f), []byte("test"), 0644)
if err := os.WriteFile(filepath.Join(appDir, f), []byte("test"), 0644); err != nil {
t.Fatalf("failed to write test file: %v", err)
}
}
if got := isConfigOnly(appDir); got != tt.want {
t.Errorf("isConfigOnly() = %v, want %v", got, tt.want)

View File

@@ -38,14 +38,20 @@ func TestFilesDiffer(t *testing.T) {
}
defer os.RemoveAll(tmpDir)
mustWrite := func(path string, data []byte) {
t.Helper()
if err := os.WriteFile(path, data, 0644); err != nil {
t.Fatal(err)
}
}
fileA := filepath.Join(tmpDir, "a.txt")
fileB := filepath.Join(tmpDir, "b.txt")
fileC := filepath.Join(tmpDir, "c.txt")
fileMissing := filepath.Join(tmpDir, "missing.txt")
os.WriteFile(fileA, []byte("hello"), 0644)
os.WriteFile(fileB, []byte("hello"), 0644)
os.WriteFile(fileC, []byte("world"), 0644)
mustWrite(fileA, []byte("hello"))
mustWrite(fileB, []byte("hello"))
mustWrite(fileC, []byte("world"))
t.Run("identical files", func(t *testing.T) {
if filesDiffer(fileA, fileB) {
@@ -79,13 +85,26 @@ func TestDirsDiffer(t *testing.T) {
}
defer os.RemoveAll(tmpDir)
mustMkdir := func(path string) {
t.Helper()
if err := os.MkdirAll(path, 0755); err != nil {
t.Fatal(err)
}
}
mustWriteFile := func(path string, data []byte) {
t.Helper()
if err := os.WriteFile(path, data, 0644); err != nil {
t.Fatal(err)
}
}
// Create two identical directories
dirA := filepath.Join(tmpDir, "a")
dirB := filepath.Join(tmpDir, "b")
os.MkdirAll(dirA, 0755)
os.MkdirAll(dirB, 0755)
os.WriteFile(filepath.Join(dirA, "file.txt"), []byte("same"), 0644)
os.WriteFile(filepath.Join(dirB, "file.txt"), []byte("same"), 0644)
mustMkdir(dirA)
mustMkdir(dirB)
mustWriteFile(filepath.Join(dirA, "file.txt"), []byte("same"))
mustWriteFile(filepath.Join(dirB, "file.txt"), []byte("same"))
t.Run("identical directories", func(t *testing.T) {
if dirsDiffer(dirA, dirB) {
@@ -95,8 +114,8 @@ func TestDirsDiffer(t *testing.T) {
// Create a directory with different content
dirC := filepath.Join(tmpDir, "c")
os.MkdirAll(dirC, 0755)
os.WriteFile(filepath.Join(dirC, "file.txt"), []byte("different"), 0644)
mustMkdir(dirC)
mustWriteFile(filepath.Join(dirC, "file.txt"), []byte("different"))
t.Run("different content", func(t *testing.T) {
if !dirsDiffer(dirA, dirC) {
@@ -106,9 +125,9 @@ func TestDirsDiffer(t *testing.T) {
// Directory with extra file
dirD := filepath.Join(tmpDir, "d")
os.MkdirAll(dirD, 0755)
os.WriteFile(filepath.Join(dirD, "file.txt"), []byte("same"), 0644)
os.WriteFile(filepath.Join(dirD, "extra.txt"), []byte("extra"), 0644)
mustMkdir(dirD)
mustWriteFile(filepath.Join(dirD, "file.txt"), []byte("same"))
mustWriteFile(filepath.Join(dirD, "extra.txt"), []byte("extra"))
t.Run("extra file in second", func(t *testing.T) {
if !dirsDiffer(dirA, dirD) {
@@ -126,14 +145,20 @@ func TestCheckSourceDrift_NoDrift(t *testing.T) {
// Create source directory with manifest
sourceDir := filepath.Join(tmpDir, "source", "myapp")
os.MkdirAll(sourceDir, 0755)
if err := os.MkdirAll(sourceDir, 0755); err != nil {
t.Fatal(err)
}
sourceManifest := AppManifest{Version: "1.0.0"}
data, _ := yaml.Marshal(sourceManifest)
os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644)
if err := os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644); err != nil {
t.Fatal(err)
}
// Create package dir (it exists)
packageDir := filepath.Join(tmpDir, "package")
os.MkdirAll(packageDir, 0755)
if err := os.MkdirAll(packageDir, 0755); err != nil {
t.Fatal(err)
}
// Installed manifest with same version
manifest := &AppManifest{
@@ -157,14 +182,20 @@ func TestCheckSourceDrift_VersionDrift(t *testing.T) {
// Create source directory with newer version
sourceDir := filepath.Join(tmpDir, "source", "myapp")
os.MkdirAll(sourceDir, 0755)
if err := os.MkdirAll(sourceDir, 0755); err != nil {
t.Fatal(err)
}
sourceManifest := AppManifest{Version: "2.0.0"}
data, _ := yaml.Marshal(sourceManifest)
os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644)
if err := os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644); err != nil {
t.Fatal(err)
}
// Create package dir
packageDir := filepath.Join(tmpDir, "package")
os.MkdirAll(packageDir, 0755)
if err := os.MkdirAll(packageDir, 0755); err != nil {
t.Fatal(err)
}
// Installed manifest with older version
manifest := &AppManifest{
@@ -174,11 +205,8 @@ func TestCheckSourceDrift_VersionDrift(t *testing.T) {
m := &Manager{}
result := m.checkSourceDrift(manifest, packageDir, "myapp")
if result == nil {
t.Fatal("expected drift, got nil")
}
if !result.Drifted {
t.Error("expected Drifted to be true")
if result == nil || !result.Drifted {
t.Fatal("expected drift result with Drifted=true, got nil or false")
}
if result.CurrentVersion != "1.0.0" {
t.Errorf("expected CurrentVersion '1.0.0', got %q", result.CurrentVersion)
@@ -210,7 +238,9 @@ func TestCheckSourceDrift_PackageMissing(t *testing.T) {
// Source exists but .package/ does not
sourceDir := filepath.Join(tmpDir, "source", "myapp")
os.MkdirAll(sourceDir, 0755)
if err := os.MkdirAll(sourceDir, 0755); err != nil {
t.Fatal(err)
}
manifest := &AppManifest{
Version: "1.0.0",
@@ -221,11 +251,8 @@ func TestCheckSourceDrift_PackageMissing(t *testing.T) {
m := &Manager{}
result := m.checkSourceDrift(manifest, packageDir, "myapp")
if result == nil {
t.Fatal("expected drift for missing package dir, got nil")
}
if !result.Drifted {
t.Error("expected Drifted to be true")
if result == nil || !result.Drifted {
t.Fatal("expected drift result with Drifted=true for missing package dir")
}
}
@@ -270,17 +297,23 @@ func TestComputeDrift_NotDeployed(t *testing.T) {
// Source-managed app that is only "added" (not deployed)
sourceDir := filepath.Join(tmpDir, "source")
os.MkdirAll(sourceDir, 0755)
if err := os.MkdirAll(sourceDir, 0755); err != nil {
t.Fatal(err)
}
// Source manifest with newer version
sourceManifest := AppManifest{Version: "2.0.0"}
data, _ := yaml.Marshal(sourceManifest)
os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644)
if err := os.WriteFile(filepath.Join(sourceDir, "manifest.yaml"), data, 0644); err != nil {
t.Fatal(err)
}
// App directory with .package
appDir := filepath.Join(tmpDir, "app")
packageDir := filepath.Join(appDir, ".package")
os.MkdirAll(packageDir, 0755)
if err := os.MkdirAll(packageDir, 0755); err != nil {
t.Fatal(err)
}
manifest := &AppManifest{
Version: "1.0.0",
@@ -290,13 +323,8 @@ func TestComputeDrift_NotDeployed(t *testing.T) {
m := &Manager{}
result := m.computeDrift("test-instance", "myapp", appDir, "", "added", manifest)
if result == nil {
t.Fatal("expected drift info, got nil")
}
// Should have source drift (version mismatch)
if result.Source == nil || !result.Source.Drifted {
t.Error("expected source drift for version mismatch")
if result == nil || result.Source == nil || !result.Source.Drifted {
t.Fatal("expected drift info with source drift for version mismatch")
}
// Should NOT have deploy drift (status is "added")

View File

@@ -2,7 +2,7 @@ package apps
import (
"fmt"
"log"
"log/slog"
"os"
"path/filepath"
@@ -110,14 +110,14 @@ func (m *Manager) DeployInfrastructure(instanceName, opID string, broadcaster *o
for i, pkg := range packages {
// Skip if already added and deployed
if m.isDeployed(instanceName, pkg.Name) {
log.Printf("[infrastructure] %s already deployed, skipping", pkg.Name)
slog.Info("already deployed, skipping", "component", "infrastructure", "package", pkg.Name)
if broadcaster != nil {
broadcaster.Publish(opID, []byte(fmt.Sprintf("Skipping %s (already deployed)\n", pkg.Name)))
}
continue
}
log.Printf("[infrastructure] Installing %s (%d/%d)", pkg.Name, i+1, total)
slog.Info("installing package", "component", "infrastructure", "package", pkg.Name, "progress", fmt.Sprintf("%d/%d", i+1, total))
if broadcaster != nil {
broadcaster.Publish(opID, []byte(fmt.Sprintf("Installing %s (%d/%d)...\n", pkg.Name, i+1, total)))
}

View File

@@ -135,8 +135,11 @@ func TestInfrastructureOrder(t *testing.T) {
tmpDir := t.TempDir()
writeManifest := func(name, category string, requires []string) {
t.Helper()
dir := filepath.Join(tmpDir, name)
os.MkdirAll(dir, 0755)
if err := os.MkdirAll(dir, 0755); err != nil {
t.Fatal(err)
}
content := "name: " + name + "\ncategory: " + category + "\n"
if len(requires) > 0 {
@@ -145,7 +148,9 @@ func TestInfrastructureOrder(t *testing.T) {
content += " - name: " + r + "\n"
}
}
os.WriteFile(filepath.Join(dir, "manifest.yaml"), []byte(content), 0644)
if err := os.WriteFile(filepath.Join(dir, "manifest.yaml"), []byte(content), 0644); err != nil {
t.Fatal(err)
}
}
// Create infrastructure packages

View File

@@ -16,22 +16,22 @@ type ConfigItem struct {
// AppManifest represents the complete app manifest from manifest.yaml
type AppManifest struct {
Name string `json:"name" yaml:"name"`
Is string `json:"is,omitempty" yaml:"is,omitempty"` // The original app type (e.g., "postgres" even if named "postgres-primary")
Description string `json:"description" yaml:"description"`
Version string `json:"version" yaml:"version"`
Icon string `json:"icon,omitempty" yaml:"icon,omitempty"`
Category string `json:"category,omitempty" yaml:"category,omitempty"`
Namespace string `json:"namespace,omitempty" yaml:"namespace,omitempty"`
DeploymentName string `json:"deploymentName,omitempty" yaml:"deploymentName,omitempty"`
Requires []AppDependency `json:"requires,omitempty" yaml:"requires,omitempty"`
DefaultConfig map[string]interface{} `json:"defaultConfig,omitempty" yaml:"defaultConfig,omitempty"`
DefaultSecrets []SecretDefinition `json:"defaultSecrets,omitempty" yaml:"defaultSecrets,omitempty"`
RequiredSecrets []string `json:"requiredSecrets,omitempty" yaml:"requiredSecrets,omitempty"`
Source string `json:"source,omitempty" yaml:"source,omitempty"`
Scripts []Script `json:"scripts,omitempty" yaml:"scripts,omitempty"`
Deploy *DeployConfig `json:"deploy,omitempty" yaml:"deploy,omitempty"`
Upgrade *UpgradeConfig `json:"upgrade,omitempty" yaml:"upgrade,omitempty"`
Name string `json:"name" yaml:"name"`
Is string `json:"is,omitempty" yaml:"is,omitempty"` // The original app type (e.g., "postgres" even if named "postgres-primary")
Description string `json:"description" yaml:"description"`
Version string `json:"version" yaml:"version"`
Icon string `json:"icon,omitempty" yaml:"icon,omitempty"`
Category string `json:"category,omitempty" yaml:"category,omitempty"`
Namespace string `json:"namespace,omitempty" yaml:"namespace,omitempty"`
DeploymentName string `json:"deploymentName,omitempty" yaml:"deploymentName,omitempty"`
Requires []AppDependency `json:"requires,omitempty" yaml:"requires,omitempty"`
DefaultConfig map[string]interface{} `json:"defaultConfig,omitempty" yaml:"defaultConfig,omitempty"`
DefaultSecrets []SecretDefinition `json:"defaultSecrets,omitempty" yaml:"defaultSecrets,omitempty"`
RequiredSecrets []string `json:"requiredSecrets,omitempty" yaml:"requiredSecrets,omitempty"`
Source string `json:"source,omitempty" yaml:"source,omitempty"`
Scripts []Script `json:"scripts,omitempty" yaml:"scripts,omitempty"`
Deploy *DeployConfig `json:"deploy,omitempty" yaml:"deploy,omitempty"`
Upgrade *UpgradeConfig `json:"upgrade,omitempty" yaml:"upgrade,omitempty"`
}
// DeployConfig declares deployment behavior in the manifest, replacing install.sh scripts
@@ -54,7 +54,7 @@ type DeployPhase struct {
type CreateSecret struct {
Name string `json:"name" yaml:"name"`
Namespace string `json:"namespace,omitempty" yaml:"namespace,omitempty"` // target namespace (defaults to app namespace)
Entries map[string]string `json:"entries" yaml:"entries"` // k8s secret key -> secrets.yaml path
Entries map[string]string `json:"entries" yaml:"entries"` // k8s secret key -> secrets.yaml path
}
// CRDInstall describes CRDs to apply from a URL before deployment
@@ -138,13 +138,13 @@ type UpgradeConfig struct {
From []UpgradeFromRule `json:"from,omitempty" yaml:"from,omitempty"`
PreUpgrade *PreUpgradeConfig `json:"preUpgrade,omitempty" yaml:"preUpgrade,omitempty"`
Migrations *MigrationConfig `json:"migrations,omitempty" yaml:"migrations,omitempty"`
ConfigMigrations map[string]string `json:"configMigrations,omitempty" yaml:"configMigrations,omitempty"`
ConfigMigrations map[string]string `json:"configMigrations,omitempty" yaml:"configMigrations,omitempty"`
}
// UpgradeFromRule defines a version constraint and optional upgrade path
type UpgradeFromRule struct {
Version string `json:"version" yaml:"version"` // e.g. ">=1.23.0", "<1.21.0", ">0"
Via string `json:"via,omitempty" yaml:"via,omitempty"` // waypoint version in versions/
Version string `json:"version" yaml:"version"` // e.g. ">=1.23.0", "<1.21.0", ">0"
Via string `json:"via,omitempty" yaml:"via,omitempty"` // waypoint version in versions/
Blocked bool `json:"blocked,omitempty" yaml:"blocked,omitempty"`
Notes string `json:"notes,omitempty" yaml:"notes,omitempty"`
}
@@ -157,7 +157,7 @@ type PreUpgradeConfig struct {
// MigrationConfig defines pre/post-deploy migration jobs for a version transition
type MigrationConfig struct {
Pre []string `json:"pre,omitempty" yaml:"pre,omitempty"` // paths to K8s Job YAMLs relative to app dir
Pre []string `json:"pre,omitempty" yaml:"pre,omitempty"` // paths to K8s Job YAMLs relative to app dir
Post []string `json:"post,omitempty" yaml:"post,omitempty"`
}

View File

@@ -43,7 +43,7 @@ func ParseAppVersion(v string) (major, minor, patch, revision int) {
}
}
fmt.Sscanf(upstream, "%d.%d.%d", &major, &minor, &patch)
_, _ = fmt.Sscanf(upstream, "%d.%d.%d", &major, &minor, &patch)
return
}

View File

@@ -10,7 +10,7 @@ import (
func TestParseAppVersion(t *testing.T) {
tests := []struct {
input string
input string
major, minor, patch, revision int
}{
{"1.24.3-1", 1, 24, 3, 1},
@@ -347,9 +347,9 @@ func TestComputeUpgradePlan_MultipleWaypoints(t *testing.T) {
Latest: "4",
Upgrade: &UpgradeConfig{
From: []UpgradeFromRule{
{Version: ">=3.0.0"}, // direct from 3.x
{Version: ">=2.0.0", Via: "3"}, // 2.x must go through slot "3"
{Version: ">=1.0.0", Via: "2"}, // 1.x must go through slot "2"
{Version: ">=3.0.0"}, // direct from 3.x
{Version: ">=2.0.0", Via: "3"}, // 2.x must go through slot "3"
{Version: ">=1.0.0", Via: "2"}, // 1.x must go through slot "2"
},
},
})
@@ -524,7 +524,7 @@ func TestComputeUpgradePlan_RuleOrdering(t *testing.T) {
Latest: "3",
Upgrade: &UpgradeConfig{
From: []UpgradeFromRule{
{Version: ">=2.0.0"}, // direct for 2.x+
{Version: ">=2.0.0"}, // direct for 2.x+
{Version: ">=1.0.0", Blocked: true, Notes: "must be on 2.x+"}, // block for 1.x
},
},

View File

@@ -2,9 +2,13 @@
package backup
import (
"archive/tar"
"bytes"
"compress/gzip"
"encoding/json"
"fmt"
"io"
"log/slog"
"os"
"os/exec"
"path/filepath"
@@ -188,6 +192,7 @@ func (m *Manager) BackupApp(instanceName, appName string) (*RecoveryPlan, error)
App: appName,
Instance: instanceName,
Timestamp: timestamp,
Version: manifest.Version,
Status: "backing_up",
Source: btypes.RecoverySource{
ActiveColor: activeColor,
@@ -315,7 +320,7 @@ func (m *Manager) RestoreApp(instanceName, appName string, opts RestoreOptions)
if err := strategy.Restore(plan, m.destination); err != nil {
plan.Status = "failed"
plan.Error = fmt.Sprintf("%s restore failed: %v", entry.Name, err)
m.savePlan(instanceName, appName, plan.Timestamp, plan)
_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
return plan, fmt.Errorf("failed to restore %s: %w", entry.Name, err)
}
}
@@ -325,7 +330,7 @@ func (m *Manager) RestoreApp(instanceName, appName string, opts RestoreOptions)
if err := m.deployToStandbyNamespace(instanceName, appName, plan); err != nil {
plan.Status = "failed"
plan.Error = fmt.Sprintf("deploy to standby failed: %v", err)
m.savePlan(instanceName, appName, plan.Timestamp, plan)
_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
return plan, fmt.Errorf("failed to deploy to standby namespace: %w", err)
}
@@ -335,7 +340,7 @@ func (m *Manager) RestoreApp(instanceName, appName string, opts RestoreOptions)
phase.CompletedAt = &completed
plan.Phases["restore"] = phase
m.savePlan(instanceName, appName, plan.Timestamp, plan)
_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
m.reportProgress(100, "Restore completed")
return plan, nil
}
@@ -370,7 +375,7 @@ func (m *Manager) SwitchApp(instanceName, appName string) (*RecoveryPlan, error)
if err := strategy.Switch(plan); err != nil {
plan.Status = "failed"
plan.Error = fmt.Sprintf("%s switch failed: %v", entry.Name, err)
m.savePlan(instanceName, appName, plan.Timestamp, plan)
_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
return plan, fmt.Errorf("failed to switch %s: %w", entry.Name, err)
}
}
@@ -380,7 +385,7 @@ func (m *Manager) SwitchApp(instanceName, appName string) (*RecoveryPlan, error)
if err := m.setActiveDeployment(instanceName, appName, plan.StandbyColor); err != nil {
plan.Status = "failed"
plan.Error = fmt.Sprintf("failed to update activeDeployment: %v", err)
m.savePlan(instanceName, appName, plan.Timestamp, plan)
_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
return plan, fmt.Errorf("failed to update activeDeployment: %w", err)
}
@@ -390,7 +395,7 @@ func (m *Manager) SwitchApp(instanceName, appName string) (*RecoveryPlan, error)
phase.CompletedAt = &completed
plan.Phases["switch"] = phase
m.savePlan(instanceName, appName, plan.Timestamp, plan)
_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
m.reportProgress(100, "Switch completed")
return plan, nil
}
@@ -425,7 +430,7 @@ func (m *Manager) CleanupApp(instanceName, appName string) (*RecoveryPlan, error
if err := strategy.Cleanup(plan); err != nil {
plan.Status = "failed"
plan.Error = fmt.Sprintf("%s cleanup failed: %v", entry.Name, err)
m.savePlan(instanceName, appName, plan.Timestamp, plan)
_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
return plan, fmt.Errorf("failed to cleanup %s: %w", entry.Name, err)
}
}
@@ -439,7 +444,7 @@ func (m *Manager) CleanupApp(instanceName, appName string) (*RecoveryPlan, error
deleteCmd := exec.Command("kubectl", "delete", "namespace", previousNamespace, "--ignore-not-found", "--timeout=30s")
tools.WithKubeconfig(deleteCmd, kubeconfigPath)
if output, err := deleteCmd.CombinedOutput(); err != nil {
fmt.Printf("Warning: failed to delete previous namespace %s: %v, output: %s\n", previousNamespace, err, output)
slog.Error("failed to delete previous namespace", "component", "backup", "namespace", previousNamespace, "error", err, "output", string(output))
}
} else if previousNamespace == appName {
// For the bare namespace (first restore), scale deployments to zero
@@ -447,7 +452,7 @@ func (m *Manager) CleanupApp(instanceName, appName string) (*RecoveryPlan, error
scaleCmd := exec.Command("kubectl", "scale", "deployment", "--all", "--replicas=0", "-n", previousNamespace)
tools.WithKubeconfig(scaleCmd, kubeconfigPath)
if output, err := scaleCmd.CombinedOutput(); err != nil {
fmt.Printf("Warning: failed to scale down previous deployments in %s: %v, output: %s\n", previousNamespace, err, output)
slog.Error("failed to scale down previous deployments", "component", "backup", "namespace", previousNamespace, "error", err, "output", string(output))
}
}
@@ -470,7 +475,7 @@ func (m *Manager) CleanupApp(instanceName, appName string) (*RecoveryPlan, error
phase.CompletedAt = &completed
plan.Phases["cleanup"] = phase
m.savePlan(instanceName, appName, plan.Timestamp, plan)
_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
m.reportProgress(100, "Cleanup completed")
return plan, nil
}
@@ -596,10 +601,10 @@ func (m *Manager) deployToStandbyNamespace(instanceName, appName string, plan *R
// Create secrets from secrets.yaml (source of truth) in the standby namespace
if err := m.deploySecretsToNamespace(instanceName, appName, standbyNamespace, kubeconfigPath); err != nil {
fmt.Printf("Warning: failed to deploy secrets to standby namespace: %v\n", err)
slog.Error("failed to deploy secrets to standby namespace", "component", "backup", "error", err)
}
fmt.Printf("Successfully deployed app to standby namespace: %s\n", standbyNamespace)
slog.Info("deployed app to standby namespace", "component", "backup", "namespace", standbyNamespace)
return nil
}
@@ -1076,25 +1081,22 @@ func (m *Manager) DeleteAppBackup(instanceName, appName, timestamp string) error
backupDir := filepath.Join(m.GetBackupDir(instanceName), appName, timestamp)
if _, err := os.Stat(backupDir); os.IsNotExist(err) {
return fmt.Errorf("backup not found: %s", timestamp)
return nil // Already deleted, nothing to do
}
// Load plan to get strategy locations
planFile := filepath.Join(backupDir, "recovery-plan.yaml")
plan, err := m.loadPlan(planFile)
// Load destination
// Load destination and clean up remote files (best-effort)
destination, err2 := m.loadDestination(instanceName)
if err2 != nil {
return fmt.Errorf("failed to load backup destination: %w", err2)
}
// Delete strategy data from destination
if err == nil && plan != nil {
slog.Error("could not load backup destination, remote files may be orphaned", "component", "backup", "error", err2)
} else if err == nil && plan != nil {
for _, entry := range plan.Strategies {
if location, ok := entry.Backup["location"].(string); ok && location != "" {
if delErr := destination.Delete(location); delErr != nil {
fmt.Printf("Warning: failed to delete %s from destination: %v\n", location, delErr)
slog.Error("failed to delete backup from destination", "component", "backup", "location", location, "error", delErr)
}
}
}
@@ -1241,6 +1243,149 @@ func (m *Manager) loadDestination(instanceName string) (BackupDestination, error
}
}
// BackupClusterConfig creates a backup of cluster-level configuration files for disaster recovery.
// This backs up kubeconfig, talosconfig, config.yaml, secrets.yaml, and talos generated configs.
//
// Candidate files that are missing (or are not regular files) are skipped; if
// none remain an error is returned. The files are packed into one in-memory
// tar.gz whose entry names are relative to the instance directory, uploaded to
// the configured destination under cluster-config/<instance>/<timestamp>.tar.gz,
// and described by a RecoveryPlan saved under the "_cluster" app name.
func (m *Manager) BackupClusterConfig(instanceName string) (*RecoveryPlan, error) {
	// Capture the start before doing any work so the "backup" phase records the
	// real duration instead of two timestamps taken back-to-back at the end.
	startedAt := time.Now()

	m.reportProgress(20, "Loading backup configuration")

	destination, err := m.loadDestination(instanceName)
	if err != nil {
		return nil, fmt.Errorf("failed to load backup destination: %w", err)
	}
	m.destination = destination

	instancePath := tools.GetInstancePath(m.dataDir, instanceName)
	talosGenerated := filepath.Join(tools.GetInstanceTalosPath(m.dataDir, instanceName), "generated")

	// Collect files to back up (skip missing gracefully)
	filePaths := []string{
		tools.GetKubeconfigPath(m.dataDir, instanceName),
		tools.GetInstanceConfigPath(m.dataDir, instanceName),
		tools.GetInstanceSecretsPath(m.dataDir, instanceName),
		tools.GetTalosconfigPath(m.dataDir, instanceName),
		filepath.Join(talosGenerated, "controlplane.yaml"),
		filepath.Join(talosGenerated, "worker.yaml"),
		filepath.Join(talosGenerated, "secrets.yaml"),
	}

	var existingFiles []string
	for _, f := range filePaths {
		// Only regular files can be archived; a directory at one of these
		// paths would otherwise fail later while copying its contents.
		if info, statErr := os.Stat(f); statErr == nil && info.Mode().IsRegular() {
			existingFiles = append(existingFiles, f)
		}
	}

	if len(existingFiles) == 0 {
		return nil, fmt.Errorf("no cluster config files found for instance %s", instanceName)
	}

	m.reportProgress(40, fmt.Sprintf("Archiving %d cluster config files", len(existingFiles)))

	timestamp := time.Now().UTC().Format("20060102T150405Z")
	key := fmt.Sprintf("cluster-config/%s/%s.tar.gz", instanceName, timestamp)

	// Create tar.gz archive in memory
	var buf bytes.Buffer
	totalSize, err := writeClusterConfigArchive(&buf, instancePath, existingFiles)
	if err != nil {
		return nil, err
	}

	m.reportProgress(70, "Uploading cluster config backup")

	size, err := destination.Put(key, bytes.NewReader(buf.Bytes()))
	if err != nil {
		return nil, fmt.Errorf("failed to upload cluster config backup: %w", err)
	}

	m.reportProgress(90, "Saving recovery plan")

	completed := time.Now()
	plan := &RecoveryPlan{
		App:       "_cluster",
		Instance:  instanceName,
		Timestamp: timestamp,
		Status:    "backed_up",
		Strategies: []StrategyEntry{
			{
				Name:   "cluster-config",
				Status: "backed_up",
				Backup: map[string]interface{}{
					"location":  key,
					"size":      size,
					"files":     len(existingFiles),
					"format":    "tar.gz",
					"totalSize": totalSize,
				},
			},
		},
		Phases: map[string]PhaseTime{
			"backup": {StartedAt: &startedAt, CompletedAt: &completed},
		},
	}

	if err := m.savePlan(instanceName, "_cluster", timestamp, plan); err != nil {
		return nil, fmt.Errorf("failed to save recovery plan: %w", err)
	}

	m.reportProgress(100, "Cluster config backup completed")
	return plan, nil
}

// writeClusterConfigArchive writes files to w as a gzip-compressed tar stream,
// naming each entry by its path relative to baseDir. It returns the combined
// uncompressed size of the archived files.
func writeClusterConfigArchive(w io.Writer, baseDir string, files []string) (int64, error) {
	gzWriter := gzip.NewWriter(w)
	tarWriter := tar.NewWriter(gzWriter)

	var totalSize int64
	for _, filePath := range files {
		n, err := appendClusterConfigFile(tarWriter, baseDir, filePath)
		if err != nil {
			// The partial archive is discarded by the caller, so errors from
			// closing the writers carry no additional information.
			_ = tarWriter.Close()
			_ = gzWriter.Close()
			return 0, err
		}
		totalSize += n
	}

	if err := tarWriter.Close(); err != nil {
		_ = gzWriter.Close()
		return 0, fmt.Errorf("failed to close tar: %w", err)
	}
	if err := gzWriter.Close(); err != nil {
		return 0, fmt.Errorf("failed to close gzip: %w", err)
	}
	return totalSize, nil
}

// appendClusterConfigFile adds a single file to tw under its path relative to
// baseDir and returns the file's size as reported by stat.
func appendClusterConfigFile(tw *tar.Writer, baseDir, filePath string) (int64, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return 0, fmt.Errorf("failed to open %s: %w", filePath, err)
	}
	defer func() { _ = file.Close() }()

	stat, err := file.Stat()
	if err != nil {
		return 0, fmt.Errorf("failed to stat %s: %w", filePath, err)
	}

	header, err := tar.FileInfoHeader(stat, "")
	if err != nil {
		return 0, fmt.Errorf("failed to create tar header for %s: %w", filePath, err)
	}

	// Use relative path from instance directory
	relPath, err := filepath.Rel(baseDir, filePath)
	if err != nil {
		return 0, fmt.Errorf("failed to compute archive path for %s: %w", filePath, err)
	}
	header.Name = relPath

	if err := tw.WriteHeader(header); err != nil {
		return 0, fmt.Errorf("failed to write tar header for %s: %w", filePath, err)
	}
	if _, err := io.Copy(tw, file); err != nil {
		return 0, fmt.Errorf("failed to write file %s to archive: %w", filePath, err)
	}
	return stat.Size(), nil
}
// savePlan saves a RecoveryPlan to YAML file
func (m *Manager) savePlan(instanceName, appName, timestamp string, plan *RecoveryPlan) error {
backupDir := filepath.Join(m.GetBackupDir(instanceName), appName, timestamp)

View File

@@ -0,0 +1,188 @@
package backup
import (
"archive/tar"
"compress/gzip"
"io"
"os"
"path/filepath"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestBackupClusterConfig backs up an instance that has all seven cluster
// config files and checks the returned plan's metadata and that the plan was
// persisted under the "_cluster" backup directory.
func TestBackupClusterConfig(t *testing.T) {
	dataDir := t.TempDir()
	const instance = "test-instance"

	instanceRoot := filepath.Join(dataDir, "instances", instance)
	destDir := filepath.Join(instanceRoot, "backups")
	generatedDir := filepath.Join(instanceRoot, "talos", "generated")
	for _, dir := range []string{destDir, generatedDir} {
		require.NoError(t, os.MkdirAll(dir, 0755))
	}

	// Create cluster config files
	fixtures := []struct {
		path    string
		content string
	}{
		{filepath.Join(instanceRoot, "kubeconfig"), "kubeconfig-data"},
		{filepath.Join(instanceRoot, "config.yaml"), "backup:\n destination:\n type: local\n local:\n path: " + destDir + "\n"},
		{filepath.Join(instanceRoot, "secrets.yaml"), "secrets-data"},
		{filepath.Join(generatedDir, "talosconfig"), "talosconfig-data"},
		{filepath.Join(generatedDir, "controlplane.yaml"), "controlplane-data"},
		{filepath.Join(generatedDir, "worker.yaml"), "worker-data"},
		{filepath.Join(generatedDir, "secrets.yaml"), "talos-secrets-data"},
	}
	for _, fx := range fixtures {
		require.NoError(t, os.WriteFile(fx.path, []byte(fx.content), 0644))
	}

	manager := NewManager(dataDir)
	plan, err := manager.BackupClusterConfig(instance)
	require.NoError(t, err)
	require.NotNil(t, plan)

	assert.Equal(t, "_cluster", plan.App)
	assert.Equal(t, instance, plan.Instance)
	assert.Equal(t, "backed_up", plan.Status)
	assert.Len(t, plan.Strategies, 1)

	strategy := plan.Strategies[0]
	assert.Equal(t, "cluster-config", strategy.Name)
	assert.Equal(t, "backed_up", strategy.Status)

	// Check backup metadata
	fileCount, isInt := strategy.Backup["files"].(int)
	assert.True(t, isInt)
	assert.Equal(t, 7, fileCount)

	// Verify plan was saved to disk
	savedPlan := filepath.Join(destDir, "_cluster", plan.Timestamp, "recovery-plan.yaml")
	_, err = os.Stat(savedPlan)
	assert.NoError(t, err, "recovery-plan.yaml should exist")
}
// TestBackupClusterConfigSkipsMissingFiles checks that a backup still succeeds
// when only some of the candidate files exist, and that the plan counts only
// the files that were present.
func TestBackupClusterConfigSkipsMissingFiles(t *testing.T) {
	dataDir := t.TempDir()
	const instance = "test-instance"

	instanceRoot := filepath.Join(dataDir, "instances", instance)
	destDir := filepath.Join(instanceRoot, "backups")
	require.NoError(t, os.MkdirAll(destDir, 0755))

	// Only create kubeconfig and config.yaml (no talos files)
	present := map[string]string{
		"kubeconfig":  "kubeconfig-data",
		"config.yaml": "backup:\n destination:\n type: local\n local:\n path: " + destDir + "\n",
	}
	for name, content := range present {
		require.NoError(t, os.WriteFile(filepath.Join(instanceRoot, name), []byte(content), 0644))
	}

	manager := NewManager(dataDir)
	plan, err := manager.BackupClusterConfig(instance)
	require.NoError(t, err)
	require.NotNil(t, plan)

	assert.Equal(t, "backed_up", plan.Status)

	fileCount, isInt := plan.Strategies[0].Backup["files"].(int)
	assert.True(t, isInt)
	assert.Equal(t, 2, fileCount)
}
// TestBackupClusterConfigFailsWithNoFiles covers the minimal instance: only
// config.yaml exists. Despite the name, the backup is expected to succeed,
// because config.yaml is itself one of the backed-up files. A truly empty
// instance cannot be exercised here since loading the backup destination
// needs config.yaml and would fail first.
func TestBackupClusterConfigFailsWithNoFiles(t *testing.T) {
	tempDir := t.TempDir()
	instanceName := "test-instance"
	instanceDir := filepath.Join(tempDir, "instances", instanceName)
	backupsDir := filepath.Join(instanceDir, "backups")
	require.NoError(t, os.MkdirAll(backupsDir, 0755))

	// Create only config.yaml for backup destination config, but none of the other cluster files
	require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte("backup:\n destination:\n type: local\n local:\n path: "+backupsDir+"\n"), 0644))

	mgr := NewManager(tempDir)
	plan, err := mgr.BackupClusterConfig(instanceName)
	require.NoError(t, err)
	require.NotNil(t, plan)
	require.Len(t, plan.Strategies, 1)

	// config.yaml must be the one and only archived file; asserting the count
	// is what actually verifies that it was included in the backup.
	files, ok := plan.Strategies[0].Backup["files"].(int)
	assert.True(t, ok)
	assert.Equal(t, 1, files)
}
// TestBackupClusterConfigArchiveContents opens the uploaded tar.gz from the
// local destination and checks that entries are named by their path relative
// to the instance directory.
func TestBackupClusterConfigArchiveContents(t *testing.T) {
	dataDir := t.TempDir()
	const instance = "test-instance"

	instanceRoot := filepath.Join(dataDir, "instances", instance)
	destDir := filepath.Join(instanceRoot, "backups")
	generatedDir := filepath.Join(instanceRoot, "talos", "generated")
	require.NoError(t, os.MkdirAll(destDir, 0755))
	require.NoError(t, os.MkdirAll(generatedDir, 0755))

	require.NoError(t, os.WriteFile(filepath.Join(instanceRoot, "kubeconfig"), []byte("kubeconfig-data"), 0644))
	require.NoError(t, os.WriteFile(filepath.Join(instanceRoot, "config.yaml"), []byte("backup:\n destination:\n type: local\n local:\n path: "+destDir+"\n"), 0644))
	require.NoError(t, os.WriteFile(filepath.Join(generatedDir, "talosconfig"), []byte("talosconfig-data"), 0644))

	manager := NewManager(dataDir)
	plan, err := manager.BackupClusterConfig(instance)
	require.NoError(t, err)

	// Read the archive from the local backup destination
	location, isString := plan.Strategies[0].Backup["location"].(string)
	require.True(t, isString)

	archive, err := os.Open(filepath.Join(destDir, location))
	require.NoError(t, err)
	defer archive.Close()

	gz, err := gzip.NewReader(archive)
	require.NoError(t, err)
	defer gz.Close()

	// Walk every entry and remember its name.
	entries := tar.NewReader(gz)
	var names []string
	for header, nextErr := entries.Next(); nextErr != io.EOF; header, nextErr = entries.Next() {
		require.NoError(t, nextErr)
		names = append(names, header.Name)
	}

	assert.Contains(t, names, "kubeconfig")
	assert.Contains(t, names, "config.yaml")
	assert.Contains(t, names, filepath.Join("talos", "generated", "talosconfig"))
}
// TestBackupClusterConfigListAndDelete creates two cluster config backups,
// checks that listing returns them newest first, then deletes the older one
// and checks that only the newer one remains.
func TestBackupClusterConfigListAndDelete(t *testing.T) {
	dataDir := t.TempDir()
	const instance = "test-instance"

	instanceRoot := filepath.Join(dataDir, "instances", instance)
	destDir := filepath.Join(instanceRoot, "backups")
	require.NoError(t, os.MkdirAll(destDir, 0755))

	require.NoError(t, os.WriteFile(filepath.Join(instanceRoot, "kubeconfig"), []byte("kubeconfig-data"), 0644))
	require.NoError(t, os.WriteFile(filepath.Join(instanceRoot, "config.yaml"), []byte("backup:\n destination:\n type: local\n local:\n path: "+destDir+"\n"), 0644))

	manager := NewManager(dataDir)

	// Create two backups (sleep to ensure different timestamps)
	older, err := manager.BackupClusterConfig(instance)
	require.NoError(t, err)

	time.Sleep(1100 * time.Millisecond)

	newer, err := manager.BackupClusterConfig(instance)
	require.NoError(t, err)

	// List backups
	listed, err := manager.ListBackups(instance, "_cluster")
	require.NoError(t, err)
	assert.Len(t, listed, 2)

	// Newest first
	assert.Equal(t, newer.Timestamp, listed[0].Timestamp)
	assert.Equal(t, older.Timestamp, listed[1].Timestamp)

	// Delete one
	require.NoError(t, manager.DeleteAppBackup(instance, "_cluster", older.Timestamp))

	listed, err = manager.ListBackups(instance, "_cluster")
	require.NoError(t, err)
	assert.Len(t, listed, 1)
	assert.Equal(t, newer.Timestamp, listed[0].Timestamp)
}

View File

@@ -2,6 +2,7 @@ package backup
import (
"fmt"
"log/slog"
"os"
"path/filepath"
@@ -68,7 +69,7 @@ func LoadInstanceBackupConfig(dataDir, instanceName string) (*BackupConfiguratio
// Load credentials from secrets.yaml if needed
if err := loadBackupSecrets(dataDir, instanceName, config); err != nil {
// Secrets are optional, log but don't fail
fmt.Printf("Warning: failed to load backup secrets: %v\n", err)
slog.Error("failed to load backup secrets", "component", "backup", "error", err)
}
return config, nil
@@ -120,6 +121,63 @@ func SaveInstanceBackupSchedules(dataDir, instanceName string, schedules []Backu
return nil
}
// SaveInstanceBackupConfig writes the destination and retention sections of backup config.
// Schedules are managed separately via SaveInstanceBackupSchedules.
//
// The instance config file is read, its "backup" section is created if absent,
// and only the sections whose argument is non-nil are replaced; everything
// else in the file's data is carried over. The file is then rewritten in place.
// An error is returned if the file cannot be read, parsed, marshaled or written.
func SaveInstanceBackupConfig(dataDir, instanceName string, dest *DestinationConfig, retention *RetentionPolicy) error {
	configPath := tools.GetInstanceConfigPath(dataDir, instanceName)

	data, err := os.ReadFile(configPath)
	if err != nil {
		return fmt.Errorf("failed to read config: %w", err)
	}

	var root map[string]interface{}
	if err := yaml.Unmarshal(data, &root); err != nil {
		return fmt.Errorf("failed to parse config: %w", err)
	}
	// An empty (or comment-only) config file unmarshals to a nil map; writing
	// into it below would panic, so start from an empty document instead.
	if root == nil {
		root = make(map[string]interface{})
	}

	backupSection, ok := root["backup"].(map[string]interface{})
	if !ok {
		backupSection = make(map[string]interface{})
		root["backup"] = backupSection
	}

	// setSection round-trips a typed value through YAML so it is stored as
	// generic data shaped by the type's yaml tags.
	setSection := func(key string, value interface{}) error {
		encoded, err := yaml.Marshal(value)
		if err != nil {
			return fmt.Errorf("failed to marshal %s: %w", key, err)
		}
		var generic interface{}
		if err := yaml.Unmarshal(encoded, &generic); err != nil {
			return fmt.Errorf("failed to unmarshal %s: %w", key, err)
		}
		backupSection[key] = generic
		return nil
	}

	// The nil checks stay on the typed pointers: once wrapped in an interface a
	// nil pointer would no longer compare equal to nil.
	if dest != nil {
		if err := setSection("destination", dest); err != nil {
			return err
		}
	}
	if retention != nil {
		if err := setSection("retention", retention); err != nil {
			return err
		}
	}

	out, err := yaml.Marshal(root)
	if err != nil {
		return fmt.Errorf("failed to marshal config: %w", err)
	}

	if err := os.WriteFile(configPath, out, 0644); err != nil {
		return fmt.Errorf("failed to write config: %w", err)
	}

	return nil
}
// loadBackupSecrets loads backup credentials from instance secrets.yaml
func loadBackupSecrets(dataDir, instanceName string, config *BackupConfiguration) error {
secretsPath := filepath.Join(dataDir, "instances", instanceName, "secrets.yaml")
@@ -160,4 +218,4 @@ func loadBackupSecrets(dataDir, instanceName string, config *BackupConfiguration
}
return nil
}
}

View File

@@ -67,7 +67,7 @@ func (a *AzureDestination) Put(key string, reader io.Reader) (int64, error) {
blobURL,
azblob.UploadStreamToBlockBlobOptions{
BufferSize: 4 * 1024 * 1024, // 4MB buffer
MaxBuffers: 3, // Limited for Raspberry Pi
MaxBuffers: 3, // Limited for Raspberry Pi
},
)
@@ -208,4 +208,4 @@ func (a *AzureDestination) getCredential() azblob.StorageAccountCredential {
// as a field in the struct during initialization
// For now, return nil which means the SAS generation might fail
return nil
}
}

View File

@@ -3,6 +3,7 @@ package destinations
import (
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"time"
@@ -110,7 +111,7 @@ func (l *LocalDestination) List(prefix string) ([]btypes.BackupObject, error) {
err := filepath.Walk(searchPath, func(path string, info os.FileInfo, err error) error {
if err != nil {
// Log error but continue walking
fmt.Printf("Warning: error walking path %s: %v\n", path, err)
slog.Error("error walking path", "component", "local", "path", path, "error", err)
return nil
}
@@ -190,4 +191,4 @@ func (l *LocalDestination) Cleanup(retention btypes.RetentionPolicy) error {
// This could implement retention policy enforcement
// For now, it's a no-op
return nil
}
}

View File

@@ -238,14 +238,14 @@ func TestLocalDestination_List(t *testing.T) {
require.NoError(t, os.WriteFile(fullPath, content, 0644))
// Set specific mod time for testing
modTime := time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC)
os.Chtimes(fullPath, modTime, modTime)
require.NoError(t, os.Chtimes(fullPath, modTime, modTime))
}
tests := []struct {
name string
prefix string
expectCount int
expectKeys []string
name string
prefix string
expectCount int
expectKeys []string
}{
{
name: "list all",
@@ -357,4 +357,4 @@ func TestLocalDestination_GetDiskUsage(t *testing.T) {
usage, err = dest.GetDiskUsage()
assert.NoError(t, err)
assert.Equal(t, totalSize, usage)
}
}

View File

@@ -3,6 +3,7 @@ package destinations
import (
"fmt"
"io"
"log/slog"
"os"
"os/exec"
"path/filepath"
@@ -29,6 +30,11 @@ func NewNFSDestination(cfg *btypes.NFSConfig) (*NFSDestination, error) {
mountPath = filepath.Join("/mnt/backup", strings.ReplaceAll(cfg.Server, ".", "-"), strings.ReplaceAll(cfg.Path, "/", "-"))
}
// Recover stale mount points (common after reboots or NFS server restarts)
if err := recoverStaleMountPoint(mountPath); err != nil {
return nil, fmt.Errorf("failed to recover stale mount point %s: %w", mountPath, err)
}
// Ensure mount point exists
if err := os.MkdirAll(mountPath, 0755); err != nil {
return nil, fmt.Errorf("failed to create mount point: %w", err)
@@ -53,13 +59,55 @@ func NewNFSDestination(cfg *btypes.NFSConfig) (*NFSDestination, error) {
output, err := cmd.CombinedOutput()
if err != nil {
return nil, fmt.Errorf("failed to mount NFS share: %w, output: %s", err, string(output))
return nil, fmt.Errorf("failed to mount NFS share %s:%s at %s: %w, output: %s",
cfg.Server, cfg.Path, mountPath, err, string(output))
}
}
return dest, nil
}
// recoverStaleMountPoint detects and cleans up stale NFS mounts.
// After a reboot or NFS server restart, the mount point can have a stale file handle
// that causes "file exists" errors on mkdir and stat. Force-unmounting fixes this.
//
// It returns nil when mountPath is accessible, does not exist, or was
// recovered (unmounted or removed). It returns an error wrapping the original
// stat failure when every recovery attempt failed.
func recoverStaleMountPoint(mountPath string) error {
	_, err := os.Stat(mountPath)
	if err == nil {
		// Path is accessible, nothing to recover
		return nil
	}
	if os.IsNotExist(err) {
		// Doesn't exist yet, nothing to recover
		return nil
	}

	// Path exists but is inaccessible (stale file handle, transport endpoint not connected, etc.)
	slog.Info("detected stale mount, attempting recovery", "component", "nfs", "mountPath", mountPath, "error", err)

	// Try a lazy unmount first, then fall back to a force unmount. Either can
	// fail, in which case the failure is logged and the next option is tried.
	for _, flags := range [][]string{{"-l"}, {"-f"}} {
		args := append([]string{"umount"}, flags...)
		args = append(args, mountPath)
		cmd := exec.Command("sudo", args...)

		output, umountErr := cmd.CombinedOutput()
		if umountErr != nil {
			slog.Error("umount failed", "component", "nfs", "flags", flags, "mountPath", mountPath, "error", umountErr, "output", strings.TrimSpace(string(output)))
			continue
		}

		slog.Info("successfully unmounted stale mount", "component", "nfs", "mountPath", mountPath)
		// After unmount, the directory might still exist but should be accessible now
		if _, statErr := os.Stat(mountPath); statErr == nil || os.IsNotExist(statErr) {
			return nil
		}
	}

	// Last resort: remove and recreate the mount point
	if rmErr := os.Remove(mountPath); rmErr != nil {
		// Report why the removal failed as well; the original stat error stays
		// the wrapped cause so callers inspecting the chain see the same error.
		return fmt.Errorf("stale mount at %s could not be recovered (unmount failed; remove failed: %v): %w", mountPath, rmErr, err)
	}

	slog.Info("removed stale mount point, will recreate", "component", "nfs", "mountPath", mountPath)
	return nil
}
// Put uploads data to NFS, returns size written
func (n *NFSDestination) Put(key string, reader io.Reader) (int64, error) {
fullPath := filepath.Join(n.mountPath, key)
@@ -185,4 +233,4 @@ func (n *NFSDestination) Cleanup() error {
}
}
return nil
}
}

View File

@@ -4,6 +4,7 @@ import (
"context"
"fmt"
"io"
"log/slog"
"time"
"github.com/aws/aws-sdk-go-v2/aws"
@@ -59,15 +60,16 @@ func (s *S3Destination) Put(key string, reader io.Reader) (int64, error) {
fullKey := s.getFullKey(key)
// Use S3 manager for efficient multipart uploads
uploader := manager.NewUploader(s.client, func(u *manager.Uploader) {
// TODO: migrate to feature/s3/transfermanager when stable
uploader := manager.NewUploader(s.client, func(u *manager.Uploader) { //nolint:staticcheck
u.PartSize = 10 * 1024 * 1024 // 10MB parts
u.Concurrency = 3 // Limited concurrency for Raspberry Pi
u.Concurrency = 3 // Limited concurrency for Raspberry Pi
})
// Create a custom reader that tracks bytes read
trackingReader := &sizeTrackingReader{reader: reader}
result, err := uploader.Upload(context.Background(), &s3.PutObjectInput{
result, err := uploader.Upload(context.Background(), &s3.PutObjectInput{ //nolint:staticcheck
Bucket: aws.String(s.bucket),
Key: aws.String(fullKey),
Body: trackingReader,
@@ -78,7 +80,7 @@ func (s *S3Destination) Put(key string, reader io.Reader) (int64, error) {
}
// Log the ETag for verification
fmt.Printf("Uploaded to S3: %s (ETag: %s)\n", fullKey, *result.ETag)
slog.Info("uploaded to S3", "component", "s3", "key", fullKey, "etag", *result.ETag)
return trackingReader.bytesRead, nil
}
@@ -195,4 +197,4 @@ func (r *sizeTrackingReader) Read(p []byte) (int, error) {
n, err := r.reader.Read(p)
r.bytesRead += int64(n)
return n, err
}
}

View File

@@ -2,7 +2,7 @@ package backup
import (
"fmt"
"log"
"log/slog"
"time"
btypes "github.com/wild-cloud/wild-central/daemon/internal/backup/types"
@@ -50,7 +50,7 @@ func EnforceRetention(mgr *Manager, instanceName, appName string, keepLast, keep
// Both policies say delete
if err := mgr.DeleteAppBackup(instanceName, appName, plan.Timestamp); err != nil {
log.Printf("Retention: failed to delete backup %s/%s/%s: %v", instanceName, appName, plan.Timestamp, err)
slog.Error("failed to delete backup", "component", "backup", "instance", instanceName, "app", appName, "timestamp", plan.Timestamp, "error", err)
continue
}
deleted++

View File

@@ -137,7 +137,9 @@ func TestEnforceRetention(t *testing.T) {
// Create instance config with local destination
instanceDir := filepath.Join(tmpDir, "instances", instanceName)
os.MkdirAll(instanceDir, 0755)
if err := os.MkdirAll(instanceDir, 0755); err != nil {
t.Fatal(err)
}
config := map[string]any{
"backup": map[string]any{
@@ -150,10 +152,14 @@ func TestEnforceRetention(t *testing.T) {
},
}
configData, _ := yaml.Marshal(config)
os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644)
if err := os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644); err != nil {
t.Fatal(err)
}
backupDir := filepath.Join(instanceDir, "backups", appName)
os.MkdirAll(backupDir, 0755)
if err := os.MkdirAll(backupDir, 0755); err != nil {
t.Fatal(err)
}
// Create test backup plans with different timestamps
now := time.Now().UTC()
@@ -162,7 +168,7 @@ func TestEnforceRetention(t *testing.T) {
age time.Duration
status string
}{
{now.Format("20060102T150405Z"), 0, "backed_up"}, // newest
{now.Format("20060102T150405Z"), 0, "backed_up"}, // newest
{now.Add(-24 * time.Hour).Format("20060102T150405Z"), 24 * time.Hour, "backed_up"}, // 1 day old
{now.Add(-48 * time.Hour).Format("20060102T150405Z"), 48 * time.Hour, "backed_up"}, // 2 days old
{now.Add(-72 * time.Hour).Format("20060102T150405Z"), 72 * time.Hour, "backed_up"}, // 3 days old
@@ -171,7 +177,9 @@ func TestEnforceRetention(t *testing.T) {
for _, ts := range timestamps {
planDir := filepath.Join(backupDir, ts.ts)
os.MkdirAll(planDir, 0755)
if err := os.MkdirAll(planDir, 0755); err != nil {
t.Fatal(err)
}
plan := btypes.RecoveryPlan{
App: appName,
@@ -180,7 +188,9 @@ func TestEnforceRetention(t *testing.T) {
Status: ts.status,
}
planData, _ := yaml.Marshal(plan)
os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644)
if err := os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644); err != nil {
t.Fatal(err)
}
}
mgr := NewManager(tmpDir)
@@ -212,7 +222,9 @@ func TestEnforceRetentionSkipsActiveBackups(t *testing.T) {
appName := "test-app"
instanceDir := filepath.Join(tmpDir, "instances", instanceName)
os.MkdirAll(instanceDir, 0755)
if err := os.MkdirAll(instanceDir, 0755); err != nil {
t.Fatal(err)
}
config := map[string]any{
"backup": map[string]any{
@@ -225,10 +237,14 @@ func TestEnforceRetentionSkipsActiveBackups(t *testing.T) {
},
}
configData, _ := yaml.Marshal(config)
os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644)
if err := os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644); err != nil {
t.Fatal(err)
}
backupDir := filepath.Join(instanceDir, "backups", appName)
os.MkdirAll(backupDir, 0755)
if err := os.MkdirAll(backupDir, 0755); err != nil {
t.Fatal(err)
}
now := time.Now().UTC()
backups := []struct {
@@ -243,7 +259,9 @@ func TestEnforceRetentionSkipsActiveBackups(t *testing.T) {
for _, b := range backups {
planDir := filepath.Join(backupDir, b.ts)
os.MkdirAll(planDir, 0755)
if err := os.MkdirAll(planDir, 0755); err != nil {
t.Fatal(err)
}
plan := btypes.RecoveryPlan{
App: appName,
@@ -252,7 +270,9 @@ func TestEnforceRetentionSkipsActiveBackups(t *testing.T) {
Status: b.status,
}
planData, _ := yaml.Marshal(plan)
os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644)
if err := os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644); err != nil {
t.Fatal(err)
}
}
mgr := NewManager(tmpDir)
@@ -281,7 +301,9 @@ func TestEnforceRetentionKeepDaysPreservesRecent(t *testing.T) {
appName := "test-app"
instanceDir := filepath.Join(tmpDir, "instances", instanceName)
os.MkdirAll(instanceDir, 0755)
if err := os.MkdirAll(instanceDir, 0755); err != nil {
t.Fatal(err)
}
config := map[string]any{
"backup": map[string]any{
@@ -294,10 +316,14 @@ func TestEnforceRetentionKeepDaysPreservesRecent(t *testing.T) {
},
}
configData, _ := yaml.Marshal(config)
os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644)
if err := os.WriteFile(filepath.Join(instanceDir, "config.yaml"), configData, 0644); err != nil {
t.Fatal(err)
}
backupDir := filepath.Join(instanceDir, "backups", appName)
os.MkdirAll(backupDir, 0755)
if err := os.MkdirAll(backupDir, 0755); err != nil {
t.Fatal(err)
}
now := time.Now().UTC()
// 5 backups: newest, 1h old, 2h old, 3h old, 25h old
@@ -306,7 +332,9 @@ func TestEnforceRetentionKeepDaysPreservesRecent(t *testing.T) {
for _, offset := range timestamps {
ts := now.Add(-offset).Format("20060102T150405Z")
planDir := filepath.Join(backupDir, ts)
os.MkdirAll(planDir, 0755)
if err := os.MkdirAll(planDir, 0755); err != nil {
t.Fatal(err)
}
plan := btypes.RecoveryPlan{
App: appName,
@@ -315,7 +343,9 @@ func TestEnforceRetentionKeepDaysPreservesRecent(t *testing.T) {
Status: "backed_up",
}
planData, _ := yaml.Marshal(plan)
os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644)
if err := os.WriteFile(filepath.Join(planDir, "recovery-plan.yaml"), planData, 0644); err != nil {
t.Fatal(err)
}
}
mgr := NewManager(tmpDir)

View File

@@ -3,7 +3,7 @@ package backup
import (
"context"
"fmt"
"log"
"log/slog"
"sync"
"time"
@@ -36,7 +36,7 @@ func (s *Scheduler) Start() {
s.cancel = cancel
go s.loop(ctx)
log.Println("Backup scheduler started")
slog.Info("backup scheduler started", "component", "scheduler")
}
// Stop shuts down the scheduler
@@ -112,26 +112,36 @@ func (s *Scheduler) runSchedule(instanceName string, sched *btypes.BackupSchedul
s.mu.Unlock()
}()
log.Printf("Scheduler: running backup for %s/%s (schedule: %s)", instanceName, sched.TargetName, sched.Name)
slog.Info("running scheduled backup", "component", "scheduler", "instance", instanceName, "target", sched.TargetName, "schedule", sched.Name)
mgr := NewManager(s.dataDir)
if sched.TargetType == "app" {
_, err := mgr.BackupApp(instanceName, sched.TargetName)
if err != nil {
log.Printf("Scheduler: backup failed for %s/%s: %v", instanceName, sched.TargetName, err)
} else {
// Enforce retention after successful backup
keepLast, keepDays := RetentionFromSchedule(sched, config.Retention)
deleted, retErr := EnforceRetention(mgr, instanceName, sched.TargetName, keepLast, keepDays)
if retErr != nil {
log.Printf("Scheduler: retention enforcement failed for %s/%s: %v", instanceName, sched.TargetName, retErr)
} else if deleted > 0 {
log.Printf("Scheduler: retention cleaned up %d old backups for %s/%s", deleted, instanceName, sched.TargetName)
}
var backupErr error
var retentionTarget string
switch sched.TargetType {
case "app":
retentionTarget = sched.TargetName
_, backupErr = mgr.BackupApp(instanceName, sched.TargetName)
case "cluster":
retentionTarget = "_cluster"
_, backupErr = mgr.BackupClusterConfig(instanceName)
default:
slog.Error("unknown schedule target type", "component", "scheduler", "instance", instanceName, "targetType", sched.TargetType)
return
}
if backupErr != nil {
slog.Error("scheduled backup failed", "component", "scheduler", "instance", instanceName, "target", retentionTarget, "error", backupErr)
} else {
keepLast, keepDays := RetentionFromSchedule(sched, config.Retention)
deleted, retErr := EnforceRetention(mgr, instanceName, retentionTarget, keepLast, keepDays)
if retErr != nil {
slog.Error("retention enforcement failed", "component", "scheduler", "instance", instanceName, "target", retentionTarget, "error", retErr)
} else if deleted > 0 {
slog.Info("retention cleaned up old backups", "component", "scheduler", "instance", instanceName, "target", retentionTarget, "deleted", deleted)
}
}
// TODO: cluster backup support
// Update lastRun and nextRun
now := time.Now()
@@ -144,7 +154,7 @@ func (s *Scheduler) runSchedule(instanceName string, sched *btypes.BackupSchedul
// saveSchedules writes config.Schedules for instanceName through
// SaveInstanceBackupSchedules, using the scheduler's dataDir.
// A failure is only logged; nothing is returned to the caller.
func (s *Scheduler) saveSchedules(instanceName string, config *BackupConfiguration) {
if err := SaveInstanceBackupSchedules(s.dataDir, instanceName, config.Schedules); err != nil {
log.Printf("Scheduler: failed to save schedules for %s: %v", instanceName, err)
slog.Error("failed to save schedules", "component", "scheduler", "instance", instanceName, "error", err)
}
}

View File

@@ -20,7 +20,7 @@ func TestParseTime(t *testing.T) {
{"14:30", 14, 30},
{"00:00", 0, 0},
{"23:59", 23, 59},
{"", 2, 0}, // default
{"", 2, 0}, // default
{"invalid", 2, 0}, // default
{"25:00", 25, 0}, // parses but invalid hour (not our concern here)
}
@@ -148,10 +148,14 @@ func TestSaveInstanceBackupSchedules(t *testing.T) {
dataDir := t.TempDir()
instanceName := "test-instance"
instanceDir := filepath.Join(dataDir, "instances", instanceName)
os.MkdirAll(instanceDir, 0755)
if err := os.MkdirAll(instanceDir, 0755); err != nil {
t.Fatal(err)
}
configPath := filepath.Join(instanceDir, "config.yaml")
os.WriteFile(configPath, []byte("cloud:\n domain: test.local\n"), 0644)
if err := os.WriteFile(configPath, []byte("cloud:\n domain: test.local\n"), 0644); err != nil {
t.Fatal(err)
}
now := time.Now()
schedules := []BackupSchedule{
@@ -180,7 +184,9 @@ func TestSaveInstanceBackupSchedules(t *testing.T) {
}
var root map[string]interface{}
yaml.Unmarshal(data, &root)
if err := yaml.Unmarshal(data, &root); err != nil {
t.Fatalf("Unmarshal error = %v", err)
}
// Verify cloud.domain is preserved
cloud, ok := root["cloud"].(map[string]interface{})

View File

@@ -425,7 +425,7 @@ func (c *ConfigStrategy) mergeConfig(reader io.Reader, instancePath, appName str
var config map[string]interface{}
if data, err := os.ReadFile(configPath); err == nil {
yaml.Unmarshal(data, &config)
_ = yaml.Unmarshal(data, &config)
}
if config == nil {
config = make(map[string]interface{})
@@ -461,7 +461,7 @@ func (c *ConfigStrategy) mergeSecrets(reader io.Reader, instancePath, appName st
var secrets map[string]interface{}
if data, err := os.ReadFile(secretsPath); err == nil {
yaml.Unmarshal(data, &secrets)
_ = yaml.Unmarshal(data, &secrets)
}
if secrets == nil {
secrets = make(map[string]interface{})

View File

@@ -1,9 +1,8 @@
package strategies
import (
"bytes"
"encoding/json"
"fmt"
"log/slog"
"os/exec"
"strings"
"time"
@@ -32,30 +31,6 @@ func (l *LonghornNativeStrategy) Name() string {
return "longhorn-native"
}
// LonghornBackup represents a Longhorn Backup CRD as decoded from JSON.
// The struct tags give the JSON key for every field; nested objects
// (metadata, spec, status) are modelled as anonymous structs.
type LonghornBackup struct {
APIVersion string `json:"apiVersion"`
Kind string `json:"kind"`
// Metadata identifies the object: its name, namespace and labels.
Metadata struct {
Name string `json:"name"`
Namespace string `json:"namespace"`
Labels map[string]string `json:"labels"`
} `json:"metadata"`
// Spec names the snapshot the backup is taken from, plus backup labels.
Spec struct {
SnapshotName string `json:"snapshotName"`
Labels map[string]string `json:"labels"`
} `json:"spec"`
// Status carries the reported state of the backup.
// NOTE(review): the set of State values and the units of Progress and
// VolumeSize are not defined in this file — confirm against Longhorn.
Status struct {
State string `json:"state"`
Progress int `json:"progress"`
URL string `json:"url"`
VolumeSize string `json:"volumeSize"`
VolumeCreatedAt string `json:"volumeCreatedAt"`
Messages map[string]string `json:"messages"`
Error string `json:"error"`
} `json:"status"`
}
// Backup creates Longhorn native backups of all PVCs for an app, writing results to the plan
func (l *LonghornNativeStrategy) Backup(plan *btypes.RecoveryPlan, dest btypes.BackupDestination) error {
entry := plan.GetStrategyEntry("longhorn-native")
@@ -129,7 +104,9 @@ func (l *LonghornNativeStrategy) Backup(plan *btypes.RecoveryPlan, dest btypes.B
"backupURL": backupURL,
})
l.cleanupOldBackups(kubeconfigPath, volumeName, backupID)
if err := l.cleanupOldBackups(kubeconfigPath, volumeName, backupID); err != nil {
slog.Error("failed to clean up old backups", "component", "longhorn", "volume", volumeName, "error", err)
}
}
// Record in plan
@@ -164,11 +141,6 @@ func (l *LonghornNativeStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.
return nil
}
apiURL, err := l.getLonghornAPIEndpoint(kubeconfigPath)
if err != nil {
return fmt.Errorf("failed to get Longhorn API endpoint: %w", err)
}
restoreVolumes := []map[string]any{}
for _, bv := range backupVolumes {
@@ -205,7 +177,7 @@ func (l *LonghornNativeStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.
// Create colored restore volume name
restoreVolumeName := fmt.Sprintf("%s-%s", pvcName, plan.StandbyColor)
if err := l.createVolumeFromBackup(kubeconfigPath, apiURL, restoreVolumeName, backupURL, pvcSize); err != nil {
if err := l.createVolumeFromBackup(kubeconfigPath, restoreVolumeName, backupURL, pvcSize); err != nil {
return fmt.Errorf("failed to create volume from backup for %s: %w", pvcName, err)
}
@@ -215,7 +187,7 @@ func (l *LonghornNativeStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.
standbyNamespace = plan.App + "-" + plan.StandbyColor
}
if err := l.createPVForVolume(kubeconfigPath, restoreVolumeName, pvcSize, accessMode, standbyNamespace, pvcName); err != nil {
fmt.Printf("Warning: failed to create PV for volume %s: %v\n", restoreVolumeName, err)
slog.Error("failed to create PV for volume", "component", "longhorn", "volume", restoreVolumeName, "error", err)
}
restoreVolumes = append(restoreVolumes, map[string]any{
@@ -287,27 +259,33 @@ func (l *LonghornNativeStrategy) Verify(plan *btypes.RecoveryPlan, dest btypes.B
return nil
}
apiURL, err := l.getLonghornAPIEndpoint(kubeconfigPath)
if err != nil {
return fmt.Errorf("failed to get Longhorn API endpoint: %w", err)
// Verify backup target is accessible
if err := l.checkBackupTarget(kubeconfigPath); err != nil {
return fmt.Errorf("backup target not accessible: %w", err)
}
// Verify each backup CRD still exists
for _, bv := range backupVolumes {
backup, ok := bv.(map[string]any)
if !ok {
continue
}
backupURL, _ := backup["backupURL"].(string)
if backupURL == "" {
backupID, _ := backup["backupID"].(string)
if backupID == "" {
continue
}
url := fmt.Sprintf("%s/v1/volumes", apiURL)
cmd := exec.Command("curl", "-s", "-o", "/dev/null", "-w", "%{http_code}", url)
cmd := exec.Command("kubectl", "get", "backups.longhorn.io", backupID,
"-n", "longhorn-system", "-o", "jsonpath={.status.state}")
tools.WithKubeconfig(cmd, kubeconfigPath)
output, err := cmd.Output()
if err != nil || string(output) != "200" {
return fmt.Errorf("Longhorn API not accessible")
if err != nil {
return fmt.Errorf("backup %s not found: %w", backupID, err)
}
if string(output) != "Completed" {
return fmt.Errorf("backup %s is not in Completed state: %s", backupID, string(output))
}
}
@@ -322,7 +300,7 @@ func (l *LonghornNativeStrategy) backupVolumeWithRetry(kubeconfigPath, appName,
snapshotName := strings.ToLower(fmt.Sprintf("%s-%s-snapshot-%s", appName, pvcName, timestamp))
if attempt > 0 {
snapshotName = strings.ToLower(fmt.Sprintf("%s-%s-snapshot-%s-retry%d", appName, pvcName, timestamp, attempt))
fmt.Printf("Retrying backup for volume %s (attempt %d/%d)...\n", volumeName, attempt+1, maxAttempts)
slog.Info("retrying backup for volume", "component", "longhorn", "volume", volumeName, "attempt", attempt+1, "maxAttempts", maxAttempts)
time.Sleep(10 * time.Second)
}
@@ -418,147 +396,117 @@ func (l *LonghornNativeStrategy) getVolumeNameFromPVC(kubeconfigPath, namespace,
return volumeName, nil
}
// getLonghornAPIEndpoint returns the base URL of a locally reachable Longhorn
// API. If nothing answers on localhost:8080 it starts a kubectl port-forward
// to service/longhorn-frontend in the longhorn-system namespace and probes
// again after a short delay.
//
// On success a port-forward started here is left running so the caller can
// use the returned URL; this function does not stop it later. If the
// forwarded port still does not answer, the port-forward process is killed
// and reaped before the error is returned, so a failed attempt does not leave
// a stray kubectl process holding local port 8080.
//
// NOTE(review): the probe relies solely on curl's exit status. The HTTP
// status written by -w is discarded by Run, so it never influences the
// result.
func (l *LonghornNativeStrategy) getLonghornAPIEndpoint(kubeconfigPath string) (string, error) {
	const (
		endpoint = "http://localhost:8080"
		probeURL = endpoint + "/v1/volumes"
	)

	// probe runs one curl request against the local endpoint and reports
	// whether curl itself succeeded.
	probe := func() error {
		return exec.Command("curl", "-s", "-o", "/dev/null", "-w", "%{http_code}", probeURL).Run()
	}

	// Reuse an endpoint that is already reachable.
	if err := probe(); err == nil {
		return endpoint, nil
	}

	cmd := exec.Command("kubectl", "port-forward", "-n", "longhorn-system", "service/longhorn-frontend", "8080:80")
	tools.WithKubeconfig(cmd, kubeconfigPath)
	if err := cmd.Start(); err != nil {
		return "", fmt.Errorf("failed to start port-forward: %w", err)
	}

	// Give kubectl time to establish the tunnel before probing it.
	time.Sleep(3 * time.Second)

	if err := probe(); err != nil {
		// Best-effort cleanup: the probe failure is the error worth
		// reporting, so Kill/Wait errors are intentionally ignored.
		_ = cmd.Process.Kill()
		_ = cmd.Wait()
		return "", fmt.Errorf("port-forward not responding after setup: %w", err)
	}

	return endpoint, nil
}
func (l *LonghornNativeStrategy) createSnapshot(kubeconfigPath, volumeName, snapshotName string) error {
apiURL, err := l.getLonghornAPIEndpoint(kubeconfigPath)
if err != nil {
return err
snapshotYAML := fmt.Sprintf(`apiVersion: longhorn.io/v1beta2
kind: Snapshot
metadata:
name: %s
namespace: longhorn-system
spec:
volume: %s
createSnapshot: true
`, snapshotName, volumeName)
cmd := exec.Command("kubectl", "apply", "-f", "-")
tools.WithKubeconfig(cmd, kubeconfigPath)
cmd.Stdin = strings.NewReader(snapshotYAML)
if output, err := cmd.CombinedOutput(); err != nil {
return fmt.Errorf("failed to create snapshot: %w, output: %s", err, string(output))
}
url := fmt.Sprintf("%s/v1/volumes/%s?action=snapshotCreate", apiURL, volumeName)
payload := fmt.Sprintf(`{"name":"%s"}`, snapshotName)
// Wait for snapshot to be ready
for range 30 {
cmd := exec.Command("kubectl", "get", "snapshots.longhorn.io", snapshotName,
"-n", "longhorn-system", "-o", "jsonpath={.status.readyToUse}")
tools.WithKubeconfig(cmd, kubeconfigPath)
cmd := exec.Command("curl", "-X", "POST", url,
"-H", "Content-Type: application/json",
"-d", payload, "-s")
if err := cmd.Run(); err != nil {
return fmt.Errorf("failed to create snapshot: %w", err)
output, err := cmd.Output()
if err == nil && string(output) == "true" {
return nil
}
time.Sleep(2 * time.Second)
}
time.Sleep(2 * time.Second)
return nil
return fmt.Errorf("timeout waiting for snapshot %s to be ready", snapshotName)
}
func (l *LonghornNativeStrategy) createBackup(kubeconfigPath, volumeName, snapshotName string) (string, error) {
apiURL, err := l.getLonghornAPIEndpoint(kubeconfigPath)
if err != nil {
return "", err
// Backup name must be unique — derive from snapshot name
backupName := strings.ReplaceAll(snapshotName, "_", "-")
if len(backupName) > 63 {
backupName = backupName[:63]
}
url := fmt.Sprintf("%s/v1/volumes/%s?action=snapshotBackup", apiURL, volumeName)
payload := fmt.Sprintf(`{"name":"%s"}`, snapshotName)
backupYAML := fmt.Sprintf(`apiVersion: longhorn.io/v1beta2
kind: Backup
metadata:
name: %s
namespace: longhorn-system
labels:
backup-volume: %s
spec:
snapshotName: %s
`, backupName, volumeName, snapshotName)
cmd := exec.Command("curl", "-X", "POST", url,
"-H", "Content-Type: application/json",
"-d", payload, "-s")
cmd := exec.Command("kubectl", "apply", "-f", "-")
tools.WithKubeconfig(cmd, kubeconfigPath)
cmd.Stdin = strings.NewReader(backupYAML)
output, err := cmd.Output()
if err != nil {
return "", fmt.Errorf("failed to create backup: %w", err)
if output, err := cmd.CombinedOutput(); err != nil {
return "", fmt.Errorf("failed to create backup: %w, output: %s", err, string(output))
}
var response map[string]any
if err := json.Unmarshal(output, &response); err != nil {
return "", fmt.Errorf("failed to parse backup response: %w", err)
}
if backupStatus, ok := response["backupStatus"].([]any); ok {
// Find the backup entry matching our snapshot
for _, bs := range backupStatus {
if status, ok := bs.(map[string]any); ok {
if snap, _ := status["snapshot"].(string); snap == snapshotName {
if id, ok := status["id"].(string); ok {
return id, nil
}
}
}
}
// Fallback: find any entry without an error (new backup in progress)
for _, bs := range backupStatus {
if status, ok := bs.(map[string]any); ok {
if errMsg, _ := status["error"].(string); errMsg == "" {
if id, ok := status["id"].(string); ok {
return id, nil
}
}
}
}
}
return "", fmt.Errorf("backup ID not found in response for snapshot %s", snapshotName)
return backupName, nil
}
func (l *LonghornNativeStrategy) waitForBackupComplete(kubeconfigPath, volumeName, backupID string) (string, error) {
apiURL, err := l.getLonghornAPIEndpoint(kubeconfigPath)
if err != nil {
return "", err
}
func (l *LonghornNativeStrategy) waitForBackupComplete(kubeconfigPath, _, backupName string) (string, error) {
maxRetries := 120
for range maxRetries {
url := fmt.Sprintf("%s/v1/volumes/%s", apiURL, volumeName)
cmd := exec.Command("curl", "-s", url)
for i := range maxRetries {
// Get backup state
stateCmd := exec.Command("kubectl", "get", "backups.longhorn.io", backupName,
"-n", "longhorn-system", "-o", "jsonpath={.status.state}")
tools.WithKubeconfig(stateCmd, kubeconfigPath)
output, err := cmd.Output()
stateOutput, err := stateCmd.Output()
if err != nil {
time.Sleep(5 * time.Second)
continue
}
var volume map[string]any
if err := json.Unmarshal(output, &volume); err != nil {
time.Sleep(5 * time.Second)
continue
state := string(stateOutput)
if state == "Error" {
// Get error message
errCmd := exec.Command("kubectl", "get", "backups.longhorn.io", backupName,
"-n", "longhorn-system", "-o", "jsonpath={.status.messages}")
tools.WithKubeconfig(errCmd, kubeconfigPath)
errOutput, _ := errCmd.Output()
return "", fmt.Errorf("backup failed: %s", string(errOutput))
}
if backupStatus, ok := volume["backupStatus"].([]any); ok {
for _, status := range backupStatus {
if s, ok := status.(map[string]any); ok {
if id, _ := s["id"].(string); id == backupID {
if state, _ := s["state"].(string); state == "Completed" {
if backupURL, ok := s["backupURL"].(string); ok && backupURL != "" {
return backupURL, nil
}
return l.getBackupURL(volumeName, backupID)
}
if errorMsg, _ := s["error"].(string); errorMsg != "" {
return "", fmt.Errorf("backup failed: %s", errorMsg)
}
}
}
if state == "Completed" {
// Get backup URL
urlCmd := exec.Command("kubectl", "get", "backups.longhorn.io", backupName,
"-n", "longhorn-system", "-o", "jsonpath={.status.url}")
tools.WithKubeconfig(urlCmd, kubeconfigPath)
urlOutput, err := urlCmd.Output()
if err != nil {
return "", fmt.Errorf("backup completed but failed to get URL: %w", err)
}
backupURL := string(urlOutput)
if backupURL != "" {
return backupURL, nil
}
}
if i%12 == 0 && i > 0 {
slog.Info("waiting for backup to complete", "component", "longhorn", "backup", backupName, "state", state, "attempt", i)
}
time.Sleep(5 * time.Second)
}
return "", fmt.Errorf("timeout waiting for backup to complete")
}
func (l *LonghornNativeStrategy) getBackupURL(volumeName, backupID string) (string, error) {
return fmt.Sprintf("backup://%s/%s", volumeName, backupID), nil
return "", fmt.Errorf("timeout waiting for backup %s to complete", backupName)
}
func (l *LonghornNativeStrategy) createPVForVolume(kubeconfigPath, volumeName, size, accessMode, namespace, pvcName string) error {
@@ -592,9 +540,7 @@ spec:
return nil
}
func (l *LonghornNativeStrategy) createVolumeFromBackup(kubeconfigPath, apiURL, volumeName, backupURL, size string) error {
url := fmt.Sprintf("%s/v1/volumes", apiURL)
func (l *LonghornNativeStrategy) createVolumeFromBackup(kubeconfigPath, volumeName, backupURL, size string) error {
sizeBytes := "1073741824"
if strings.HasSuffix(size, "Gi") {
var sizeInt int
@@ -603,62 +549,62 @@ func (l *LonghornNativeStrategy) createVolumeFromBackup(kubeconfigPath, apiURL,
}
}
payload := fmt.Sprintf(`{
"name": "%s",
"size": "%s",
"fromBackup": "%s",
"numberOfReplicas": 3
}`, volumeName, sizeBytes, backupURL)
volumeYAML := fmt.Sprintf(`apiVersion: longhorn.io/v1beta2
kind: Volume
metadata:
name: %s
namespace: longhorn-system
spec:
size: "%s"
fromBackup: "%s"
numberOfReplicas: 3
frontend: blockdev
accessMode: rwo
`, volumeName, sizeBytes, backupURL)
cmd := exec.Command("curl", "-X", "POST", url,
"-H", "Content-Type: application/json",
"-d", payload, "-s")
cmd := exec.Command("kubectl", "apply", "-f", "-")
tools.WithKubeconfig(cmd, kubeconfigPath)
cmd.Stdin = strings.NewReader(volumeYAML)
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
return fmt.Errorf("failed to create volume from backup: %w, stderr: %s, stdout: %s", err, stderr.String(), stdout.String())
if output, err := cmd.CombinedOutput(); err != nil {
return fmt.Errorf("failed to create volume from backup: %w, output: %s", err, string(output))
}
return l.waitForVolume(kubeconfigPath, apiURL, volumeName)
return l.waitForVolume(kubeconfigPath, volumeName)
}
func (l *LonghornNativeStrategy) waitForVolume(_, apiURL, volumeName string) error {
func (l *LonghornNativeStrategy) waitForVolume(kubeconfigPath, volumeName string) error {
maxRetries := 60
for i := range maxRetries {
url := fmt.Sprintf("%s/v1/volumes/%s", apiURL, volumeName)
cmd := exec.Command("curl", "-s", url)
cmd := exec.Command("kubectl", "get", "volumes.longhorn.io", volumeName,
"-n", "longhorn-system", "-o", "jsonpath={.status.state},{.status.restoreInitiated},{.status.robustness}")
tools.WithKubeconfig(cmd, kubeconfigPath)
output, err := cmd.Output()
if err == nil {
var volume map[string]any
if err := json.Unmarshal(output, &volume); err == nil {
if state, _ := volume["state"].(string); state == "detached" || state == "attached" {
if restoreStatus, ok := volume["restoreStatus"].([]any); ok && len(restoreStatus) > 0 {
for _, rs := range restoreStatus {
if status, ok := rs.(map[string]any); ok {
if isRestored, _ := status["isRestored"].(bool); isRestored {
return nil
}
}
}
} else {
if robustness, _ := volume["robustness"].(string); robustness == "healthy" || robustness == "unknown" {
return nil
}
parts := strings.Split(string(output), ",")
if len(parts) == 3 {
state := parts[0]
restoreInitiated := parts[1]
robustness := parts[2]
if state == "detached" || state == "attached" {
if restoreInitiated == "true" {
return nil
}
if robustness == "healthy" || robustness == "unknown" {
return nil
}
}
}
}
if i%12 == 0 {
fmt.Printf("Waiting for volume %s to be ready... (%d/%d)\n", volumeName, i, maxRetries)
slog.Info("waiting for volume to be ready", "component", "longhorn", "volume", volumeName, "attempt", i, "maxRetries", maxRetries)
}
time.Sleep(5 * time.Second)
}
return fmt.Errorf("timeout waiting for volume to be ready")
return fmt.Errorf("timeout waiting for volume %s to be ready", volumeName)
}
func (l *LonghornNativeStrategy) cleanupOldBackups(_, _, _ string) error {

View File

@@ -94,7 +94,7 @@ func (m *MySQLStrategy) Backup(plan *btypes.RecoveryPlan, dest btypes.BackupDest
size, err := dest.Put(key, reader)
if err != nil {
cmd.Process.Kill()
_ = cmd.Process.Kill()
return fmt.Errorf("failed to upload backup: %w", err)
}

View File

@@ -4,6 +4,7 @@ import (
"bytes"
"fmt"
"io"
"log/slog"
"os"
"os/exec"
"strings"
@@ -90,7 +91,7 @@ func (p *PostgreSQLStrategy) Backup(plan *btypes.RecoveryPlan, dest btypes.Backu
size, err := dest.Put(key, reader)
if err != nil {
cmd.Process.Kill()
_ = cmd.Process.Kill()
return fmt.Errorf("failed to upload backup: %w", err)
}
@@ -101,7 +102,7 @@ func (p *PostgreSQLStrategy) Backup(plan *btypes.RecoveryPlan, dest btypes.Backu
// Also backup globals (users, roles, etc)
globalsKey := fmt.Sprintf("postgres/%s/%s/%s-globals.sql", plan.Instance, plan.App, plan.Timestamp)
if err := p.backupGlobals(kubeconfigPath, dest, globalsKey); err != nil {
fmt.Printf("Warning: failed to backup PostgreSQL globals: %v\n", err)
slog.Error("postgres globals backup failed", "component", "postgres", "error", err)
globalsKey = ""
}
@@ -165,7 +166,7 @@ func (p *PostgreSQLStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.Back
fmt.Sprintf("DROP DATABASE IF EXISTS %s", standbyDbName))
tools.WithKubeconfig(dropCmd, kubeconfigPath)
if output, err := dropCmd.CombinedOutput(); err != nil {
fmt.Printf("Warning: failed to drop database %s: %v, output: %s\n", standbyDbName, err, output)
slog.Error("failed to drop database", "component", "postgres", "database", standbyDbName, "error", err, "output", string(output))
}
// Create standby database
@@ -184,7 +185,7 @@ func (p *PostgreSQLStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.Back
fmt.Sprintf("GRANT ALL PRIVILEGES ON DATABASE %s TO %s", standbyDbName, dbUser))
tools.WithKubeconfig(grantCmd, kubeconfigPath)
if output, err := grantCmd.CombinedOutput(); err != nil {
fmt.Printf("Warning: failed to grant privileges: %v, output: %s\n", err, output)
slog.Error("failed to grant privileges", "component", "postgres", "error", err, "output", string(output))
}
}
@@ -232,7 +233,7 @@ ALTER SCHEMA public OWNER TO %s;`, dbUser, dbUser, dbUser, dbUser)
"psql", "-U", "postgres", "-d", standbyDbName, "-c", ownershipSQL)
tools.WithKubeconfig(ownerCmd, kubeconfigPath)
if output, err := ownerCmd.CombinedOutput(); err != nil {
fmt.Printf("Warning: failed to transfer ownership: %v, output: %s\n", err, output)
slog.Error("failed to transfer ownership", "component", "postgres", "error", err, "output", string(output))
}
}
@@ -289,7 +290,7 @@ func (p *PostgreSQLStrategy) Cleanup(plan *btypes.RecoveryPlan) error {
"psql", "-U", "postgres", "-d", "postgres", "-c",
fmt.Sprintf("SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '%s' AND pid <> pg_backend_pid()", previousDbName))
tools.WithKubeconfig(terminateCmd, kubeconfigPath)
terminateCmd.CombinedOutput() // best effort
_, _ = terminateCmd.CombinedOutput() // best effort
// Drop the old database
dropCmd := exec.Command("kubectl", "exec", "-n", "postgres", podName, "--",
@@ -363,7 +364,7 @@ func (p *PostgreSQLStrategy) backupGlobals(kubeconfigPath string, dest btypes.Ba
}()
if _, err := dest.Put(key, reader); err != nil {
cmd.Process.Kill()
_ = cmd.Process.Kill()
return err
}
@@ -388,6 +389,11 @@ func (p *PostgreSQLStrategy) getDatabaseName(instanceName, appName string) strin
if dbName, ok := appConfig["dbName"].(string); ok && dbName != "" {
return dbName
}
if db, ok := appConfig["db"].(map[string]interface{}); ok {
if dbName, ok := db["name"].(string); ok && dbName != "" {
return dbName
}
}
}
}
@@ -415,6 +421,11 @@ func (p *PostgreSQLStrategy) getAppUser(instanceName, appName string) string {
if dbUsername, ok := appConfig["dbUsername"].(string); ok && dbUsername != "" {
return dbUsername
}
if db, ok := appConfig["db"].(map[string]interface{}); ok {
if dbUser, ok := db["user"].(string); ok && dbUser != "" {
return dbUser
}
}
}
}

View File

@@ -3,6 +3,7 @@ package strategies
import (
"bytes"
"io"
"os"
"strings"
"testing"
"time"
@@ -163,11 +164,151 @@ func TestPostgreSQLStrategy_Verify(t *testing.T) {
}
}
func TestPostgreSQLStrategy_GetDatabaseInfo(t *testing.T) {
s := &PostgreSQLStrategy{
dataDir: "/test/data",
func TestPostgreSQLStrategy_GetDatabaseName(t *testing.T) {
tests := []struct {
name string
config string
appName string
expected string
}{
{
name: "flat dbName key",
config: `apps:
myapp:
dbName: my_database
`,
appName: "myapp",
expected: "my_database",
},
{
name: "nested db.name key",
config: `apps:
e2e-test-app:
namespace: e2e-test-app
db:
host: postgres
name: e2e_test_app
user: e2e_test_app
`,
appName: "e2e-test-app",
expected: "e2e_test_app",
},
{
name: "flat key takes precedence over nested",
config: `apps:
myapp:
dbName: flat_name
db:
name: nested_name
`,
appName: "myapp",
expected: "flat_name",
},
{
name: "no config falls back to appName",
config: `apps:
myapp:
namespace: myapp
`,
appName: "myapp",
expected: "myapp",
},
{
name: "missing app falls back to appName",
config: `apps: {}`,
appName: "missing-app",
expected: "missing-app",
},
}
assert.NotNil(t, s)
assert.Equal(t, "/test/data", s.dataDir)
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
tmpDir := t.TempDir()
instanceDir := tmpDir + "/instances/test-instance"
err := os.MkdirAll(instanceDir, 0755)
assert.NoError(t, err)
err = os.WriteFile(instanceDir+"/config.yaml", []byte(tt.config), 0644)
assert.NoError(t, err)
s := &PostgreSQLStrategy{dataDir: tmpDir}
result := s.getDatabaseName("test-instance", tt.appName)
assert.Equal(t, tt.expected, result)
})
}
}
// TestPostgreSQLStrategy_GetAppUser is a table-driven test for getAppUser.
// Each case supplies the text of an instance config.yaml, the app name to
// look up, and the user name getAppUser is expected to return. The cases
// cover the flat dbUser and dbUsername keys, the nested db.user key, a flat
// key winning over a nested one, and the fallback to the app name when no
// user is configured.
func TestPostgreSQLStrategy_GetAppUser(t *testing.T) {
tests := []struct {
name string
config string
appName string
expected string
}{
{
name: "flat dbUser key",
config: `apps:
myapp:
dbUser: my_user
`,
appName: "myapp",
expected: "my_user",
},
{
name: "flat dbUsername key",
config: `apps:
myapp:
dbUsername: my_username
`,
appName: "myapp",
expected: "my_username",
},
{
name: "nested db.user key",
config: `apps:
e2e-test-app:
namespace: e2e-test-app
db:
host: postgres
name: e2e_test_app
user: e2e_test_app
`,
appName: "e2e-test-app",
expected: "e2e_test_app",
},
{
name: "flat key takes precedence over nested",
config: `apps:
myapp:
dbUser: flat_user
db:
user: nested_user
`,
appName: "myapp",
expected: "flat_user",
},
{
name: "no user config falls back to appName",
config: `apps:
myapp:
namespace: myapp
`,
appName: "myapp",
expected: "myapp",
},
}
// Every case runs as its own subtest against a fresh temporary data dir.
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
tmpDir := t.TempDir()
// Lay out <tmpDir>/instances/test-instance/config.yaml with the case's config.
instanceDir := tmpDir + "/instances/test-instance"
err := os.MkdirAll(instanceDir, 0755)
assert.NoError(t, err)
err = os.WriteFile(instanceDir+"/config.yaml", []byte(tt.config), 0644)
assert.NoError(t, err)
// Point the strategy at the temp dir and resolve the user for the app.
s := &PostgreSQLStrategy{dataDir: tmpDir}
result := s.getAppUser("test-instance", tt.appName)
assert.Equal(t, tt.expected, result)
})
}
}

View File

@@ -33,6 +33,7 @@ type RecoveryPlan struct {
App string `yaml:"app" json:"app"`
Instance string `yaml:"instance" json:"instance"`
Timestamp string `yaml:"timestamp" json:"timestamp"`
Version string `yaml:"version" json:"version,omitempty"`
Status string `yaml:"status" json:"status"` // backing_up, backed_up, restoring, restored, switching, switched, cleaning_up, cleaned_up, failed
Error string `yaml:"error" json:"error,omitempty"`
Source RecoverySource `yaml:"source" json:"source"`
@@ -141,8 +142,8 @@ type BackupInfo struct {
// ComponentBackup represents a single backup component (legacy, kept for compatibility)
type ComponentBackup struct {
Type string `json:"type"` // "postgres", "mysql", "pvc", "config"
Name string `json:"name"` // Component identifier
Type string `json:"type"` // "postgres", "mysql", "pvc", "config"
Name string `json:"name"` // Component identifier
Size int64 `json:"size"`
Location string `json:"location"` // Path in destination
Metadata map[string]interface{} `json:"metadata"`
@@ -175,28 +176,28 @@ type ProgressCallback func(progress int, message string)
// BackupConfiguration represents instance-level backup configuration
type BackupConfiguration struct {
Destination DestinationConfig `yaml:"destination"`
Retention RetentionPolicy `yaml:"retention"`
Schedules []BackupSchedule `yaml:"schedules,omitempty"`
Verification VerificationConfig `yaml:"verification"`
Destination DestinationConfig `yaml:"destination" json:"destination"`
Retention RetentionPolicy `yaml:"retention" json:"retention"`
Schedules []BackupSchedule `yaml:"schedules,omitempty" json:"schedules,omitempty"`
Verification VerificationConfig `yaml:"verification" json:"verification"`
}
// BackupSchedule defines a per-app or cluster backup schedule
type BackupSchedule struct {
ID string `yaml:"id" json:"id"`
Name string `yaml:"name" json:"name"`
TargetType string `yaml:"targetType" json:"target_type"` // "app" or "cluster"
TargetName string `yaml:"targetName" json:"target_name"`
Frequency string `yaml:"frequency" json:"frequency"` // "daily", "weekly", "monthly"
Time string `yaml:"time" json:"time"` // "HH:MM" local time
DayOfWeek int `yaml:"dayOfWeek" json:"day_of_week,omitempty"` // 0=Sun..6=Sat (weekly)
DayOfMonth int `yaml:"dayOfMonth" json:"day_of_month,omitempty"` // 1-28 (monthly)
ID string `yaml:"id" json:"id"`
Name string `yaml:"name" json:"name"`
TargetType string `yaml:"targetType" json:"target_type"` // "app" or "cluster"
TargetName string `yaml:"targetName" json:"target_name"`
Frequency string `yaml:"frequency" json:"frequency"` // "daily", "weekly", "monthly"
Time string `yaml:"time" json:"time"` // "HH:MM" local time
DayOfWeek int `yaml:"dayOfWeek" json:"day_of_week,omitempty"` // 0=Sun..6=Sat (weekly)
DayOfMonth int `yaml:"dayOfMonth" json:"day_of_month,omitempty"` // 1-28 (monthly)
Retention *ScheduleRetention `yaml:"retention,omitempty" json:"retention,omitempty"`
Enabled bool `yaml:"enabled" json:"enabled"`
LastRun *time.Time `yaml:"lastRun,omitempty" json:"last_run,omitempty"`
NextRun *time.Time `yaml:"nextRun,omitempty" json:"next_run,omitempty"`
CreatedAt time.Time `yaml:"createdAt" json:"created_at"`
UpdatedAt time.Time `yaml:"updatedAt" json:"updated_at"`
Enabled bool `yaml:"enabled" json:"enabled"`
LastRun *time.Time `yaml:"lastRun,omitempty" json:"last_run,omitempty"`
NextRun *time.Time `yaml:"nextRun,omitempty" json:"next_run,omitempty"`
CreatedAt time.Time `yaml:"createdAt" json:"created_at"`
UpdatedAt time.Time `yaml:"updatedAt" json:"updated_at"`
}
// ScheduleRetention overrides the instance-level retention for a specific schedule
@@ -207,53 +208,53 @@ type ScheduleRetention struct {
// DestinationConfig configures where backups are stored
type DestinationConfig struct {
Type string `yaml:"type"` // "s3", "azure", "nfs", "local"
S3 *S3Config `yaml:"s3,omitempty"`
Azure *AzureConfig `yaml:"azure,omitempty"`
NFS *NFSConfig `yaml:"nfs,omitempty"`
Local *LocalConfig `yaml:"local,omitempty"`
Type string `yaml:"type" json:"type"` // "s3", "azure", "nfs", "local"
S3 *S3Config `yaml:"s3,omitempty" json:"s3,omitempty"`
Azure *AzureConfig `yaml:"azure,omitempty" json:"azure,omitempty"`
NFS *NFSConfig `yaml:"nfs,omitempty" json:"nfs,omitempty"`
Local *LocalConfig `yaml:"local,omitempty" json:"local,omitempty"`
}
// S3Config configures S3 backup destination
type S3Config struct {
Bucket string `yaml:"bucket"`
Region string `yaml:"region"`
Endpoint string `yaml:"endpoint,omitempty"` // For S3-compatible services
AccessKeyID string `yaml:"-"` // Loaded from secrets.yaml
SecretAccessKey string `yaml:"-"` // Loaded from secrets.yaml
Bucket string `yaml:"bucket" json:"bucket"`
Region string `yaml:"region" json:"region"`
Endpoint string `yaml:"endpoint,omitempty" json:"endpoint,omitempty"` // For S3-compatible services
AccessKeyID string `yaml:"-" json:"-"` // Loaded from secrets.yaml
SecretAccessKey string `yaml:"-" json:"-"` // Loaded from secrets.yaml
}
// AzureConfig configures Azure Blob Storage destination
type AzureConfig struct {
Container string `yaml:"container"`
StorageAccount string `yaml:"storageAccount"`
AccessKey string `yaml:"-"` // Loaded from secrets.yaml
Container string `yaml:"container" json:"container"`
StorageAccount string `yaml:"storageAccount" json:"storageAccount"`
AccessKey string `yaml:"-" json:"-"` // Loaded from secrets.yaml
}
// NFSConfig configures NFS backup destination
type NFSConfig struct {
Server string `yaml:"server"`
Path string `yaml:"path"`
MountPoint string `yaml:"mountPoint,omitempty"`
MountOptions string `yaml:"mountOptions,omitempty"`
Server string `yaml:"server" json:"server"`
Path string `yaml:"path" json:"path"`
MountPoint string `yaml:"mountPoint,omitempty" json:"mountPoint,omitempty"`
MountOptions string `yaml:"mountOptions,omitempty" json:"mountOptions,omitempty"`
}
// LocalConfig configures local filesystem backup destination
type LocalConfig struct {
Path string `yaml:"path"`
Path string `yaml:"path" json:"path"`
}
// RetentionPolicy defines how long to keep backups
type RetentionPolicy struct {
Daily int `yaml:"daily"`
Weekly int `yaml:"weekly"`
Monthly int `yaml:"monthly"`
Yearly int `yaml:"yearly"`
Daily int `yaml:"daily" json:"daily"`
Weekly int `yaml:"weekly" json:"weekly"`
Monthly int `yaml:"monthly" json:"monthly"`
Yearly int `yaml:"yearly" json:"yearly"`
}
// VerificationConfig configures backup verification
type VerificationConfig struct {
Enabled bool `yaml:"enabled"`
Schedule string `yaml:"schedule"` // Cron expression
RandomSample bool `yaml:"randomSample"` // Test random backup each time
Enabled bool `yaml:"enabled" json:"enabled"`
Schedule string `yaml:"schedule" json:"schedule"` // Cron expression
RandomSample bool `yaml:"randomSample" json:"randomSample"` // Test random backup each time
}

View File

@@ -4,7 +4,7 @@ import (
"context"
"encoding/json"
"fmt"
"log"
"log/slog"
"os"
"os/exec"
"path/filepath"
@@ -80,6 +80,8 @@ func (m *Manager) GenerateConfig(instanceName string, config *ClusterConfig) err
return nil
}
slog.Info("generating cluster config", "component", "cluster", "instance", instanceName, "cluster", config.ClusterName, "vip", config.VIP)
// Ensure generated directory exists
if err := storage.EnsureDir(generatedDir, 0755); err != nil {
return fmt.Errorf("failed to create generated directory: %w", err)
@@ -117,9 +119,12 @@ func (m *Manager) Bootstrap(instanceName, nodeName string) (string, error) {
return "", fmt.Errorf("failed to start bootstrap operation: %w", err)
}
slog.Info("starting cluster bootstrap", "component", "cluster", "instance", instanceName, "node", nodeName, "operationId", opID)
// Run bootstrap asynchronously
go func() {
if err := m.runBootstrapWithTracking(instanceName, nodeName, opID); err != nil {
slog.Error("cluster bootstrap failed", "component", "cluster", "instance", instanceName, "node", nodeName, "error", err)
_ = m.opsMgr.Update(instanceName, opID, "failed", err.Error(), 0)
}
}()
@@ -191,6 +196,7 @@ func (m *Manager) runBootstrapWithTracking(instanceName, nodeName, opID string)
}
// Mark as completed
slog.Info("cluster bootstrap completed", "component", "cluster", "instance", instanceName)
_ = m.opsMgr.Update(instanceName, opID, "completed", "Bootstrap completed successfully", 100)
return nil
}
@@ -385,7 +391,7 @@ func (m *Manager) retrieveKubeconfigFromCluster(instanceName, nodeIP string, tim
tools.WithTalosconfig(cmdKubeconfig, talosconfigPath)
if output, err := cmdKubeconfig.CombinedOutput(); err == nil {
log.Printf("Successfully retrieved kubeconfig for instance %s", instanceName)
slog.Info("kubeconfig retrieved", "component", "cluster", "instance", instanceName)
return nil
} else {
// Check if we've exceeded deadline
@@ -424,13 +430,15 @@ func (m *Manager) RegenerateKubeconfig(instanceName string) error {
return fmt.Errorf("control plane VIP not configured in cluster.nodes.control.vip")
}
log.Printf("Regenerating kubeconfig for instance %s from cluster VIP %s", instanceName, vip)
slog.Info("regenerating kubeconfig", "component", "cluster", "instance", instanceName, "vip", vip)
// Use shorter timeout for manual regeneration (cluster should already be running)
return m.retrieveKubeconfigFromCluster(instanceName, vip, 30*time.Second)
}
// ConfigureEndpoints updates talosconfig to use VIP and retrieves kubeconfig
func (m *Manager) ConfigureEndpoints(instanceName string, includeNodes bool) error {
slog.Info("configuring cluster endpoints", "component", "cluster", "instance", instanceName, "includeNodes", includeNodes)
configPath := tools.GetInstanceConfigPath(m.dataDir, instanceName)
talosconfigPath := tools.GetTalosconfigPath(m.dataDir, instanceName)
@@ -709,6 +717,8 @@ func (m *Manager) Reset(instanceName string, confirm bool) error {
return fmt.Errorf("reset requires confirmation")
}
slog.Info("resetting cluster", "component", "cluster", "instance", instanceName)
// This is a destructive operation
// Real implementation would:
// 1. Reset all nodes via talosctl reset

View File

@@ -2,7 +2,7 @@ package config
import (
"fmt"
"log"
"log/slog"
"path/filepath"
"github.com/wild-cloud/wild-central/daemon/internal/network"
@@ -41,12 +41,11 @@ func (m *Manager) EnsureGlobalConfig(dataDir string) error {
// Detect network configuration
netInfo, err := network.DetectNetworkInfo()
if err != nil {
log.Printf("Warning: Could not detect network info, using empty defaults: %v", err)
slog.Info("network detection failed, using defaults", "component", "config", "error", err)
} else {
// Set detected values
initialConfig.Cloud.Router.IP = netInfo.Gateway
log.Printf("Detected network: Gateway=%s, Interface=%s",
netInfo.Gateway, netInfo.PrimaryInterface)
slog.Info("detected network", "component", "config", "gateway", netInfo.Gateway, "interface", netInfo.PrimaryInterface)
}
// Ensure data directory exists

View File

@@ -13,11 +13,8 @@ import (
// Test: NewManager creates manager successfully
func TestNewManager(t *testing.T) {
m := NewManager()
if m == nil {
t.Fatal("NewManager returned nil")
}
if m.yq == nil {
t.Error("Manager.yq is nil")
if m == nil || m.yq == nil {
t.Fatal("NewManager returned nil or Manager.yq is nil")
}
}

View File

@@ -2,7 +2,7 @@ package data
import (
"fmt"
"log"
"log/slog"
"os"
"path/filepath"
)
@@ -42,10 +42,10 @@ func (m *Manager) Initialize() error {
} else {
dataDir = filepath.Join(cwd, "data")
}
log.Printf("Running in development mode, using data directory: %s", dataDir)
slog.Info("data directory configured", "component", "data", "mode", "development", "path", dataDir)
} else {
dataDir = "/var/lib/wild-central"
log.Printf("Running in production mode, using data directory: %s", dataDir)
slog.Info("data directory configured", "component", "data", "mode", "production", "path", dataDir)
}
m.dataDir = dataDir
@@ -60,7 +60,7 @@ func (m *Manager) Initialize() error {
}
}
log.Printf("Data directory structure initialized at: %s", dataDir)
slog.Info("data directory initialized", "component", "data", "path", dataDir)
return nil
}

View File

@@ -3,6 +3,7 @@ package discovery
import (
"encoding/json"
"fmt"
"log/slog"
"net"
"os"
"path/filepath"
@@ -111,6 +112,8 @@ func (m *Manager) StartDiscovery(instanceName string, ipList []string) error {
return err
}
slog.Info("starting node discovery", "component", "discovery", "instance", instanceName, "addresses", len(ipList))
// Start discovery in background
go m.runDiscovery(instanceName, ipList)
@@ -173,6 +176,8 @@ func (m *Manager) runDiscovery(instanceName string, ipList []string) {
_ = m.writeDiscoveryStatus(instanceName, status)
m.discoveryMu.Unlock()
}
slog.Info("node discovery completed", "component", "discovery", "instance", instanceName, "found", len(discoveredNodes))
}
// probeNode attempts to detect if a node is running Talos in maintenance mode

View File

@@ -2,7 +2,7 @@ package dnsmasq
import (
"fmt"
"log"
"log/slog"
"os"
"os/exec"
"strconv"
@@ -39,7 +39,7 @@ func (g *ConfigGenerator) Generate(cfg *config.GlobalConfig, clouds []config.Ins
// Get the Wild Central IP address
dnsIP, err := network.GetWildCentralIP()
if err != nil {
log.Printf("Warning: Failed to detect Wild Central IP: %v", err)
slog.Error("failed to detect Wild Central IP", "component", "dnsmasq", "error", err)
// Fall back to empty string if detection fails
dnsIP = ""
}
@@ -49,7 +49,7 @@ func (g *ConfigGenerator) Generate(cfg *config.GlobalConfig, clouds []config.Ins
// Point cloud domains to the cluster load balancer IP
loadBalancerIP := cloud.Cluster.LoadBalancerIp
if loadBalancerIP == "" {
log.Printf("Warning: No load balancer IP configured for instance %s, adding commented DNS config", cloud.Cluster.Name)
slog.Info("no load balancer IP configured, adding commented DNS config", "component", "dnsmasq", "instance", cloud.Cluster.Name)
// Add commented out entries for instances without load balancer
resolution_section += fmt.Sprintf("# No load balancer IP configured for instance %s\n", cloud.Cluster.Name)
resolution_section += fmt.Sprintf("# local=/%s/\n# address=/%s/<load-balancer-ip>\n", cloud.Cloud.InternalDomain, cloud.Cloud.InternalDomain)
@@ -92,7 +92,7 @@ log-dhcp
func (g *ConfigGenerator) WriteConfig(cfg *config.GlobalConfig, clouds []config.InstanceConfig, configPath string) error {
configContent := g.Generate(cfg, clouds)
log.Printf("Writing dnsmasq config to: %s", configPath)
slog.Info("writing dnsmasq config", "component", "dnsmasq", "path", configPath)
if err := os.WriteFile(configPath, []byte(configContent), 0644); err != nil {
return fmt.Errorf("writing dnsmasq config: %w", err)
@@ -109,6 +109,7 @@ func (g *ConfigGenerator) RestartService() error {
if err != nil {
return fmt.Errorf("failed to restart dnsmasq: %w (output: %s)", err, string(output))
}
slog.Info("dnsmasq service restarted", "component", "dnsmasq")
return nil
}
@@ -127,7 +128,7 @@ func (g *ConfigGenerator) GetStatus() (*ServiceStatus, error) {
// Get the Wild Central IP address
dnsIP, err := network.GetWildCentralIP()
if err != nil {
log.Printf("Warning: Failed to detect Wild Central IP: %v", err)
slog.Error("failed to detect Wild Central IP", "component", "dnsmasq", "error", err)
dnsIP = ""
}
@@ -201,7 +202,7 @@ func (g *ConfigGenerator) UpdateConfig(cfg *config.GlobalConfig, instances []con
configContent := g.Generate(cfg, instances)
// Write config
log.Printf("Writing dnsmasq config to: %s", g.configPath)
slog.Info("writing dnsmasq config", "component", "dnsmasq", "path", g.configPath)
if err := os.WriteFile(g.configPath, []byte(configContent), 0644); err != nil {
return fmt.Errorf("writing dnsmasq config: %w", err)
}
@@ -234,12 +235,12 @@ func (g *ConfigGenerator) ConfigureSystemDNS() error {
return fmt.Errorf("failed to write resolved.conf: %w", err)
}
log.Printf("Configured systemd-resolved to use DNS at %s", dnsIP)
slog.Info("configured systemd-resolved", "component", "dnsmasq", "dnsIP", dnsIP)
// Restart systemd-resolved to apply changes (via polkit)
cmd := exec.Command("systemctl", "restart", "systemd-resolved")
if output, err := cmd.CombinedOutput(); err != nil {
log.Printf("Warning: Failed to restart systemd-resolved: %v (output: %s)", err, string(output))
slog.Error("failed to restart systemd-resolved", "component", "dnsmasq", "error", err, "output", string(output))
// Don't return error - the config was written successfully
}

View File

@@ -2,7 +2,7 @@ package dnsmasq
import (
"fmt"
"log"
"log/slog"
"os"
"os/exec"
"path/filepath"
@@ -22,7 +22,7 @@ func (g *ConfigGenerator) GenerateMainConfig(cfg *config.GlobalConfig) string {
// Get the Wild Central IP address
dnsIP, err := network.GetWildCentralIP()
if err != nil {
log.Printf("Warning: Failed to detect Wild Central IP: %v", err)
slog.Error("failed to detect Wild Central IP", "component", "dnsmasq", "error", err)
// Fall back to empty string if detection fails
dnsIP = ""
}
@@ -60,25 +60,25 @@ log-dhcp
func (g *ConfigGenerator) GenerateInstanceConfig(instance config.InstanceConfig) string {
var sb strings.Builder
sb.WriteString(fmt.Sprintf("# DNS configuration for instance: %s\n", instance.Cluster.Name))
sb.WriteString(fmt.Sprintf("# Generated by Wild Cloud\n\n"))
fmt.Fprintf(&sb, "# DNS configuration for instance: %s\n", instance.Cluster.Name)
sb.WriteString("# Generated by Wild Cloud\n\n")
loadBalancerIP := instance.Cluster.LoadBalancerIp
if loadBalancerIP == "" {
sb.WriteString(fmt.Sprintf("# WARNING: No load balancer IP configured for this instance\n"))
sb.WriteString(fmt.Sprintf("# DNS entries are commented out until load balancer IP is configured\n\n"))
sb.WriteString(fmt.Sprintf("# local=/%s/\n", instance.Cloud.InternalDomain))
sb.WriteString(fmt.Sprintf("# address=/%s/<load-balancer-ip>\n\n", instance.Cloud.InternalDomain))
sb.WriteString(fmt.Sprintf("# address=/%s/<load-balancer-ip>\n", instance.Cloud.Domain))
sb.WriteString("# WARNING: No load balancer IP configured for this instance\n")
sb.WriteString("# DNS entries are commented out until load balancer IP is configured\n\n")
fmt.Fprintf(&sb, "# local=/%s/\n", instance.Cloud.InternalDomain)
fmt.Fprintf(&sb, "# address=/%s/<load-balancer-ip>\n\n", instance.Cloud.InternalDomain)
fmt.Fprintf(&sb, "# address=/%s/<load-balancer-ip>\n", instance.Cloud.Domain)
} else {
// Internal domain (.internal.cloud.example.tld) - local only, no external DNS
sb.WriteString(fmt.Sprintf("# Internal domain (LAN-only)\n"))
sb.WriteString(fmt.Sprintf("local=/%s/\n", instance.Cloud.InternalDomain))
sb.WriteString(fmt.Sprintf("address=/%s/%s\n\n", instance.Cloud.InternalDomain, loadBalancerIP))
sb.WriteString("# Internal domain (LAN-only)\n")
fmt.Fprintf(&sb, "local=/%s/\n", instance.Cloud.InternalDomain)
fmt.Fprintf(&sb, "address=/%s/%s\n\n", instance.Cloud.InternalDomain, loadBalancerIP)
// External domain (cloud.example.tld) - resolve to load balancer IP
sb.WriteString(fmt.Sprintf("# Public domain (resolved locally to avoid external DNS)\n"))
sb.WriteString(fmt.Sprintf("address=/%s/%s\n", instance.Cloud.Domain, loadBalancerIP))
sb.WriteString("# Public domain (resolved locally to avoid external DNS)\n")
fmt.Fprintf(&sb, "address=/%s/%s\n", instance.Cloud.Domain, loadBalancerIP)
}
return sb.String()
@@ -129,7 +129,7 @@ func (g *ConfigGenerator) WriteInstanceConfig(instanceName string, instance conf
return fmt.Errorf("installing instance config: %w", err)
}
log.Printf("Successfully wrote instance DNS config: %s", instanceFile)
slog.Info("wrote instance DNS config", "component", "dnsmasq", "path", instanceFile)
return nil
}
@@ -151,7 +151,9 @@ func (g *ConfigGenerator) ValidateWithInstance(instanceConfigPath string) error
tempMainConfig := filepath.Join(tempDir, "main.conf")
// Modify the conf-dir line to point to our temp instance dir
tempInstanceDir := filepath.Join(tempDir, "instances")
os.MkdirAll(tempInstanceDir, 0755)
if err := os.MkdirAll(tempInstanceDir, 0755); err != nil {
return fmt.Errorf("creating temp instance dir: %w", err)
}
modifiedContent := strings.ReplaceAll(
string(mainContent),
@@ -184,7 +186,7 @@ func (g *ConfigGenerator) RemoveInstanceConfig(instanceName string) error {
// Check if file exists
if _, err := os.Stat(instanceFile); os.IsNotExist(err) {
log.Printf("Instance DNS config does not exist: %s", instanceFile)
slog.Info("instance DNS config does not exist", "component", "dnsmasq", "path", instanceFile)
return nil // Not an error, already removed
}
@@ -193,7 +195,7 @@ func (g *ConfigGenerator) RemoveInstanceConfig(instanceName string) error {
return fmt.Errorf("removing instance config: %w", err)
}
log.Printf("Removed instance DNS config: %s", instanceFile)
slog.Info("removed instance DNS config", "component", "dnsmasq", "path", instanceFile)
return nil
}
@@ -205,16 +207,16 @@ func (g *ConfigGenerator) ReloadService() error {
_, err := cmd.CombinedOutput()
if err != nil {
// If reload fails, try restart as fallback
log.Printf("Reload failed, attempting restart: %v", err)
slog.Error("reload failed, attempting restart", "component", "dnsmasq", "error", err)
return g.RestartService()
}
log.Printf("Successfully reloaded dnsmasq service")
slog.Info("dnsmasq service reloaded", "component", "dnsmasq")
return nil
}
// UpdateToModularConfig migrates from monolithic to modular configuration
func (g *ConfigGenerator) UpdateToModularConfig(cfg *config.GlobalConfig, instanceNames []string, instances []config.InstanceConfig) error {
log.Printf("Migrating to modular dnsmasq configuration...")
slog.Info("migrating to modular configuration", "component", "dnsmasq")
// Ensure instance directory exists
if err := os.MkdirAll(instanceConfigDir, 0755); err != nil {
@@ -225,7 +227,7 @@ func (g *ConfigGenerator) UpdateToModularConfig(cfg *config.GlobalConfig, instan
for i, instance := range instances {
instanceName := instanceNames[i]
if err := g.WriteInstanceConfig(instanceName, instance); err != nil {
log.Printf("Warning: Failed to write instance config for %s: %v", instanceName, err)
slog.Error("failed to write instance config", "component", "dnsmasq", "instance", instanceName, "error", err)
// Continue with other instances
}
}
@@ -255,21 +257,21 @@ func (g *ConfigGenerator) UpdateToModularConfig(cfg *config.GlobalConfig, instan
// Install new config
if err := os.Rename(tempFile, g.configPath); err != nil {
// Try to restore backup
os.Rename(backupFile, g.configPath)
_ = os.Rename(backupFile, g.configPath)
return fmt.Errorf("installing new config: %w", err)
}
// Reload dnsmasq
if err := g.ReloadService(); err != nil {
// Try to restore backup and reload
log.Printf("Reload failed, attempting to restore backup...")
slog.Error("reload failed, restoring backup", "component", "dnsmasq")
os.Remove(g.configPath)
os.Rename(backupFile, g.configPath)
g.ReloadService()
_ = os.Rename(backupFile, g.configPath)
_ = g.ReloadService()
return fmt.Errorf("reloading with new config: %w", err)
}
log.Printf("Successfully migrated to modular dnsmasq configuration")
slog.Info("migrated to modular configuration", "component", "dnsmasq")
return nil
}
@@ -286,6 +288,6 @@ func (g *ConfigGenerator) UpdateInstanceDNS(instanceName string, instance config
return fmt.Errorf("reloading dnsmasq: %w", err)
}
log.Printf("Successfully updated DNS for instance: %s", instanceName)
slog.Info("DNS updated for instance", "component", "dnsmasq", "instance", instanceName)
return nil
}
}

View File

@@ -269,7 +269,7 @@ func ParseVersion(v string) [3]int {
v = v[:idx]
}
var parts [3]int
fmt.Sscanf(v, "%d.%d.%d", &parts[0], &parts[1], &parts[2])
_, _ = fmt.Sscanf(v, "%d.%d.%d", &parts[0], &parts[1], &parts[2])
return parts
}

View File

@@ -2,6 +2,7 @@ package instance
import (
"fmt"
"log/slog"
"os"
"path/filepath"
@@ -74,6 +75,8 @@ func (m *Manager) CreateInstance(name string) error {
return nil
}
slog.Info("creating instance", "component", "instance", "name", name)
// Acquire lock for instance creation
lockPath := tools.GetInstancesLockPath(m.dataDir)
return storage.WithLock(lockPath, func() error {
@@ -118,6 +121,8 @@ func (m *Manager) DeleteInstance(name string) error {
return fmt.Errorf("instance %s does not exist", name)
}
slog.Info("deleting instance", "component", "instance", "name", name)
// Clear context if this is the current instance
currentContext, err := m.contextMgr.GetCurrentContext()
if err == nil && currentContext == name {

View File

@@ -0,0 +1,138 @@
package logging
import (
"context"
"fmt"
"io"
"log/slog"
"slices"
"sync"
)
// ANSI escape sequences used to style console output.
const (
	dim    = "\033[2m"
	red    = "\033[31m"
	yellow = "\033[33m"
	cyan   = "\033[36m"
	reset  = "\033[0m"
)

// ConsoleHandler is a slog.Handler that formats log output for human
// readability on terminals. It produces compact, color-coded lines:
//
//	20:15:54 INF daemon started addr=:5055
//	20:15:54 ERR backup failed component=backup error="connection refused"
type ConsoleHandler struct {
	w     io.Writer
	level slog.Leveler // consulted on every Enabled call, so a *slog.LevelVar stays live
	attrs []slog.Attr  // attributes accumulated through WithAttrs/WithGroup
	mu    *sync.Mutex  // shared by all handlers derived from the same root; serializes writes to w
}

// NewConsoleHandler creates a handler that writes human-friendly colored logs
// to w. When opts or opts.Level is nil the minimum level is slog.LevelInfo.
//
// The Leveler from opts is stored as-is rather than sampled once, so callers
// that pass a *slog.LevelVar can change the level at runtime.
func NewConsoleHandler(w io.Writer, opts *slog.HandlerOptions) *ConsoleHandler {
	var level slog.Leveler = slog.LevelInfo
	if opts != nil && opts.Level != nil {
		level = opts.Level
	}
	return &ConsoleHandler{
		w:     w,
		level: level,
		mu:    &sync.Mutex{},
	}
}

// Enabled reports whether records at the given level are emitted.
func (h *ConsoleHandler) Enabled(_ context.Context, level slog.Level) bool {
	return level >= h.level.Level()
}

// Handle formats r as a single line (time, level badge, message, attributes)
// and writes it to the underlying writer. It returns the writer's error.
func (h *ConsoleHandler) Handle(_ context.Context, r slog.Record) error {
	buf := make([]byte, 0, 256)

	// Time. The slog.Handler contract says a zero time must be ignored.
	if !r.Time.IsZero() {
		buf = append(buf, dim...)
		buf = r.Time.AppendFormat(buf, "15:04:05")
		buf = append(buf, reset+" "...)
	}

	// Level badge
	buf = append(buf, levelBadge(r.Level)...)

	// Message
	buf = append(buf, r.Message...)

	// Pre-set attrs (from slog.With)
	for _, a := range h.attrs {
		buf = appendAttr(buf, a)
	}

	// Inline attrs
	r.Attrs(func(a slog.Attr) bool {
		buf = appendAttr(buf, a)
		return true
	})

	buf = append(buf, '\n')

	// The line is assembled outside the lock; only the write is serialized so
	// that concurrent records are never interleaved.
	h.mu.Lock()
	defer h.mu.Unlock()
	_, err := h.w.Write(buf)
	return err
}

// WithAttrs returns a handler that also emits attrs on every record. The
// receiver is returned unchanged when attrs is empty.
func (h *ConsoleHandler) WithAttrs(attrs []slog.Attr) slog.Handler {
	if len(attrs) == 0 {
		return h
	}
	return &ConsoleHandler{
		w:     h.w,
		level: h.level,
		attrs: append(slices.Clone(h.attrs), attrs...),
		mu:    h.mu,
	}
}

// WithGroup returns a handler that tags every record with a group=<name>
// attribute. Groups are rare in this codebase, so keys of later attributes
// are not qualified with the group name. The receiver is returned unchanged
// when name is empty.
func (h *ConsoleHandler) WithGroup(name string) slog.Handler {
	if name == "" {
		return h
	}
	return &ConsoleHandler{
		w:     h.w,
		level: h.level,
		attrs: append(slices.Clone(h.attrs), slog.String("group", name)),
		mu:    h.mu,
	}
}

// levelBadge returns the colored three-letter tag, with a trailing space, for
// a record level. Levels below Info get their own DBG tag instead of being
// presented as INF.
func levelBadge(level slog.Level) string {
	switch {
	case level >= slog.LevelError:
		return red + "ERR" + reset + " "
	case level >= slog.LevelWarn:
		return yellow + "WRN" + reset + " "
	case level >= slog.LevelInfo:
		return cyan + "INF" + reset + " "
	default:
		return dim + "DBG" + reset + " "
	}
}

// appendAttr appends " key=value" to buf and returns the extended slice. The
// zero Attr is skipped. Values are quoted when needsQuote says so.
func appendAttr(buf []byte, a slog.Attr) []byte {
	if a.Equal(slog.Attr{}) {
		return buf
	}
	v := a.Value.Resolve()
	buf = append(buf, ' ')
	buf = append(buf, dim...)
	buf = append(buf, a.Key...)
	buf = append(buf, '=')
	buf = append(buf, reset...)
	s := v.String()
	if needsQuote(s) {
		// Appendf writes the quoted form straight into buf, avoiding the
		// temporary string that Sprintf would allocate.
		return fmt.Appendf(buf, "%q", s)
	}
	return append(buf, s...)
}

// needsQuote reports whether s must be quoted to stay unambiguous on a log
// line: empty strings, and strings containing a space or control character,
// a double quote, or a backslash.
func needsQuote(s string) bool {
	if s == "" {
		return true
	}
	for _, c := range s {
		if c <= ' ' || c == '"' || c == '\\' {
			return true
		}
	}
	return false
}

// Verify interface compliance at compile time.
var _ slog.Handler = (*ConsoleHandler)(nil)

View File

@@ -3,10 +3,12 @@ package node
import (
"context"
"fmt"
"log/slog"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"time"
"github.com/wild-cloud/wild-central/daemon/internal/config"
@@ -172,6 +174,8 @@ func (m *Manager) Get(instanceName, hostname string) (*Node, error) {
// Add registers a new node in config.yaml
func (m *Manager) Add(instanceName string, node *Node) error {
slog.Info("adding node", "component", "node", "instance", instanceName, "hostname", node.Hostname, "role", node.Role)
instancePath := m.GetInstancePath(instanceName)
// Validate node data
@@ -263,6 +267,8 @@ func (m *Manager) Add(instanceName string, node *Node) error {
// Delete removes a node from config.yaml
// If skipReset is false, the node will be reset before deletion (with 30s timeout)
func (m *Manager) Delete(instanceName, nodeIdentifier string, skipReset bool) error {
slog.Info("deleting node", "component", "node", "instance", instanceName, "node", nodeIdentifier, "skipReset", skipReset)
// Get node to find hostname
node, err := m.Get(instanceName, nodeIdentifier)
if err != nil {
@@ -434,6 +440,8 @@ func (m *Manager) Apply(instanceName, nodeIdentifier string, opts ApplyOptions)
return fmt.Errorf("failed to update node status: %w", err)
}
slog.Info("applying node config", "component", "node", "instance", instanceName, "hostname", node.Hostname, "role", node.Role)
// Apply configuration to node
// Determine which IP to use and whether node is in maintenance mode
//
@@ -473,6 +481,7 @@ func (m *Manager) Apply(instanceName, nodeIdentifier string, opts ApplyOptions)
return fmt.Errorf("failed to update node status: %w", err)
}
slog.Info("node config applied", "component", "node", "instance", instanceName, "hostname", node.Hostname, "ip", node.TargetIP)
return nil
}
@@ -723,8 +732,108 @@ func (m *Manager) FetchTemplates(instanceName string) error {
return m.extractEmbeddedTemplates(destDir)
}
// NodeHealth represents the health status of a node.
// It is the result type of Manager.Health and is JSON-serializable.
type NodeHealth struct {
// Node is the hostname of the node the report is about.
Node string `json:"node"`
// Services holds the service statuses reported for the node.
Services []tools.ServiceStatus `json:"services"`
// DmesgErrors holds the errors extracted from the node's dmesg output.
DmesgErrors []tools.DmesgError `json:"dmesgErrors"`
// Healthy is the overall verdict derived from Services and DmesgErrors.
Healthy bool `json:"healthy"`
}
// Health checks node health by querying Talos service statuses and scanning
// dmesg for errors.
//
// The node must be applied, out of maintenance mode, and have a TargetIP;
// otherwise an error is returned. A failure to fetch service statuses is
// returned as an error. A failure to fetch dmesg is logged and treated as
// "no dmesg errors", so service health can still be reported.
//
// The node is reported unhealthy when any dmesg error was found, or when any
// service is not healthy and carries a health message.
func (m *Manager) Health(instanceName, nodeIdentifier string) (*NodeHealth, error) {
	node, err := m.Get(instanceName, nodeIdentifier)
	if err != nil {
		return nil, fmt.Errorf("node not found: %w", err)
	}

	if !node.Applied || node.Maintenance {
		return nil, fmt.Errorf("health check requires an applied, non-maintenance node")
	}

	ip := node.TargetIP
	if ip == "" {
		return nil, fmt.Errorf("no IP address available for node %s", node.Hostname)
	}

	// Fetch services and dmesg concurrently. Each goroutine writes only its
	// own pair of variables, and wg.Wait orders those writes before the reads
	// below.
	var (
		services         []tools.ServiceStatus
		dmesgRaw         string
		svcErr, dmesgErr error
		wg               sync.WaitGroup
	)
	wg.Add(2)
	go func() {
		defer wg.Done()
		services, svcErr = m.talosctl.GetServices(ip)
	}()
	go func() {
		defer wg.Done()
		dmesgRaw, dmesgErr = m.talosctl.GetDmesg(ip)
	}()
	wg.Wait()

	if svcErr != nil {
		return nil, fmt.Errorf("failed to get services: %w", svcErr)
	}

	// Start from an empty, non-nil slice so the JSON field encodes as []
	// rather than null when there is nothing to report.
	dmesgErrors := []tools.DmesgError{}
	if dmesgErr != nil {
		// Best effort: the kernel log check is skipped, but the failure is
		// recorded instead of being dropped silently.
		slog.Warn("failed to read dmesg, skipping kernel log check", "component", "node", "instance", instanceName, "hostname", node.Hostname, "error", dmesgErr)
	} else if parsed := tools.ParseDmesgErrors(dmesgRaw); parsed != nil {
		dmesgErrors = parsed
	}

	// Compute overall health
	healthy := len(dmesgErrors) == 0
	for _, svc := range services {
		if !svc.Healthy && svc.HealthMessage != "" {
			healthy = false
			break
		}
	}

	return &NodeHealth{
		Node:        node.Hostname,
		Services:    services,
		DmesgErrors: dmesgErrors,
		Healthy:     healthy,
	}, nil
}
// Reboot restarts a node without wiping state.
//
// The node is looked up by instance and identifier. The reboot is sent to the
// node's TargetIP, or to its CurrentIP when no TargetIP is set. An error is
// returned when the node cannot be found, has neither address, or the reboot
// request fails.
func (m *Manager) Reboot(instanceName, nodeIdentifier string) error {
	slog.Info("rebooting node", "component", "node", "instance", instanceName, "node", nodeIdentifier)

	target, err := m.Get(instanceName, nodeIdentifier)
	if err != nil {
		return fmt.Errorf("node not found: %w", err)
	}

	// Pick the address to contact: target IP first, current IP as fallback.
	var addr string
	switch {
	case target.TargetIP != "":
		addr = target.TargetIP
	case target.CurrentIP != "":
		addr = target.CurrentIP
	default:
		return fmt.Errorf("no IP address available for node %s", target.Hostname)
	}

	rebootErr := m.talosctl.Reboot(addr)
	if rebootErr != nil {
		return fmt.Errorf("failed to reboot node: %w", rebootErr)
	}

	slog.Info("node reboot initiated", "component", "node", "instance", instanceName, "hostname", target.Hostname, "ip", addr)
	return nil
}
// Reset resets a node to maintenance mode
func (m *Manager) Reset(instanceName, nodeIdentifier string) error {
slog.Info("resetting node", "component", "node", "instance", instanceName, "node", nodeIdentifier)
// Get node
node, err := m.Get(instanceName, nodeIdentifier)
if err != nil {

View File

@@ -3,6 +3,7 @@ package operations
import (
"encoding/json"
"fmt"
"log/slog"
"os"
"path/filepath"
"time"
@@ -72,8 +73,8 @@ type Operation struct {
Progress int `json:"progress"` // 0-100
Details *OperationDetails `json:"details,omitempty"` // Operation-specific details
LogFile string `json:"logFile,omitempty"` // Path to output log file
StartedAt time.Time `json:"started_at"`
EndedAt *time.Time `json:"ended_at,omitempty"`
StartedAt time.Time `json:"started_at"`
EndedAt *time.Time `json:"ended_at,omitempty"`
}
// GetOperationsDir returns the operations directory for an instance
@@ -115,6 +116,8 @@ func (m *Manager) Start(instanceName, opType, target string) (string, error) {
return "", err
}
slog.Info("operation started", "component", "operations", "id", opID, "type", opType, "target", target, "instance", instanceName)
// Broadcast SSE event if manager is available
m.broadcastOperationEvent("operation:started", op)
@@ -164,6 +167,18 @@ func (m *Manager) Update(instanceName, opID, status, message string, progress in
return err
}
// Log terminal status transitions
if oldStatus != status {
switch status {
case "completed":
slog.Info("operation completed", "component", "operations", "id", op.ID, "type", op.Type, "target", op.Target, "instance", instanceName)
case "failed":
slog.Error("operation failed", "component", "operations", "id", op.ID, "type", op.Type, "target", op.Target, "instance", instanceName, "message", message)
case "cancelled":
slog.Info("operation cancelled", "component", "operations", "id", op.ID, "type", op.Type, "target", op.Target, "instance", instanceName)
}
}
// Broadcast appropriate SSE event based on status change
if oldStatus != status {
switch status {
@@ -302,6 +317,26 @@ func (m *Manager) Delete(instanceName, opID string) error {
return os.Remove(opPath)
}
// FailOrphaned moves every operation of the given instance that is still
// "running" or "pending" to "failed". It is meant to be called on API startup
// to clean up operations that were interrupted by a restart.
//
// An error is returned only when the operations cannot be listed. A failure
// to update an individual operation is logged and the remaining operations
// are still processed.
func (m *Manager) FailOrphaned(instanceName string) error {
	ops, err := m.List(instanceName)
	if err != nil {
		return err
	}

	for _, op := range ops {
		switch op.Status {
		case "running", "pending":
			// In-flight at the time of the restart; handled below.
		default:
			continue
		}

		slog.Info("failing orphaned operation", "component", "operations", "id", op.ID, "type", op.Type, "target", op.Target, "instance", instanceName)
		updateErr := m.Update(instanceName, op.ID, "failed", "API restarted while operation was in progress", op.Progress)
		if updateErr != nil {
			slog.Warn("failed to mark orphaned operation as failed", "component", "operations", "id", op.ID, "error", updateErr)
		}
	}
	return nil
}
// Cleanup removes old completed/failed operations
func (m *Manager) Cleanup(instanceName string, olderThan time.Duration) error {
ops, err := m.List(instanceName)

View File

@@ -0,0 +1,90 @@
package operations
import (
"os"
"path/filepath"
"testing"
)
// setupTestManager builds a Manager rooted at a fresh temporary directory and
// pre-creates the operations directory for a fixed test instance. It returns
// the manager together with that instance's name.
func setupTestManager(t *testing.T) (*Manager, string) {
	t.Helper()

	const instanceName = "test-cloud"
	root := t.TempDir()

	// Layout: <root>/instances/test-cloud/operations
	opsDir := filepath.Join(root, "instances", instanceName, "operations")
	if mkErr := os.MkdirAll(opsDir, 0755); mkErr != nil {
		t.Fatalf("failed to create ops dir: %v", mkErr)
	}

	return NewManager(root), instanceName
}
// TestFailOrphaned verifies that FailOrphaned marks running and pending
// operations as failed and leaves completed and already-failed operations
// untouched.
func TestFailOrphaned(t *testing.T) {
	m, instanceName := setupTestManager(t)

	// start creates an operation and aborts the test when creation fails.
	start := func(opType, target string) string {
		t.Helper()
		id, err := m.Start(instanceName, opType, target)
		if err != nil {
			t.Fatalf("failed to start operation: %v", err)
		}
		return id
	}
	// update changes an operation's status and aborts the test on failure, so
	// a broken fixture is reported instead of silently skewing the assertions.
	update := func(id, status, message string, progress int) {
		t.Helper()
		if err := m.Update(instanceName, id, status, message, progress); err != nil {
			t.Fatalf("failed to set operation %s to %s: %v", id, status, err)
		}
	}

	// Create operations in various states. The "pending" one is left exactly
	// as Start created it; this test treats that initial state as pending.
	runningID := start("backup", "myapp")
	update(runningID, "running", "Backing up", 50)

	pendingID := start("restore", "myapp")

	completedID := start("backup", "otherapp")
	update(completedID, "completed", "Done", 100)

	failedID := start("deploy", "otherapp")
	update(failedID, "failed", "Something broke", 0)

	// Run FailOrphaned
	if err := m.FailOrphaned(instanceName); err != nil {
		t.Fatalf("FailOrphaned failed: %v", err)
	}

	// Running operation should now be failed
	op, err := m.GetByInstance(instanceName, runningID)
	if err != nil {
		t.Fatalf("failed to load running op %s: %v", runningID, err)
	}
	if op.Status != "failed" {
		t.Errorf("expected running op to be failed, got %s", op.Status)
	}
	if op.EndedAt == nil {
		t.Error("expected running op to have EndedAt set")
	}

	// Pending operation should now be failed
	op, err = m.GetByInstance(instanceName, pendingID)
	if err != nil {
		t.Fatalf("failed to load pending op %s: %v", pendingID, err)
	}
	if op.Status != "failed" {
		t.Errorf("expected pending op to be failed, got %s", op.Status)
	}

	// Completed operation should be unchanged
	op, err = m.GetByInstance(instanceName, completedID)
	if err != nil {
		t.Fatalf("failed to load completed op %s: %v", completedID, err)
	}
	if op.Status != "completed" {
		t.Errorf("expected completed op to stay completed, got %s", op.Status)
	}

	// Failed operation should be unchanged
	op, err = m.GetByInstance(instanceName, failedID)
	if err != nil {
		t.Fatalf("failed to load failed op %s: %v", failedID, err)
	}
	if op.Status != "failed" {
		t.Errorf("expected already-failed op to stay failed, got %s", op.Status)
	}
}
// TestFailOrphaned_NoOperations checks that FailOrphaned succeeds for an
// instance whose operations directory exists but contains no operations.
func TestFailOrphaned_NoOperations(t *testing.T) {
	m, instanceName := setupTestManager(t)

	// An empty operations directory must not be treated as an error.
	err := m.FailOrphaned(instanceName)
	if err != nil {
		t.Fatalf("FailOrphaned on empty dir failed: %v", err)
	}
}

View File

@@ -4,6 +4,7 @@ import (
"crypto/sha256"
"fmt"
"io"
"log/slog"
"net/http"
"os"
"path/filepath"
@@ -145,6 +146,7 @@ func (m *Manager) DownloadAsset(instanceName, assetType, version, url string) er
return fmt.Errorf("failed to move file: %w", err)
}
slog.Info("PXE asset downloaded", "component", "pxe", "instance", instanceName, "type", assetType, "version", version)
return nil
}

View File

@@ -90,11 +90,8 @@ func TestGenerateSecret_Uniqueness(t *testing.T) {
// Test: NewManager creates manager successfully
func TestNewManager(t *testing.T) {
m := NewManager()
if m == nil {
t.Fatal("NewManager returned nil")
}
if m.yq == nil {
t.Error("Manager.yq is nil")
if m == nil || m.yq == nil {
t.Fatal("NewManager returned nil or Manager.yq is nil")
}
}

View File

@@ -4,7 +4,7 @@ import (
"context"
"encoding/json"
"fmt"
"log"
"log/slog"
"sync"
"time"
@@ -42,7 +42,6 @@ type EventFilters struct {
// Manager manages all SSE connections
type Manager struct {
clients map[string]map[string]*Client // instanceName -> clientID -> Client
register chan *Client
unregister chan *Client
broadcast chan *Event
mu sync.RWMutex
@@ -53,7 +52,6 @@ type Manager struct {
func NewManager() *Manager {
m := &Manager{
clients: make(map[string]map[string]*Client),
register: make(chan *Client, 100),
unregister: make(chan *Client, 100),
broadcast: make(chan *Event, 1000),
rateLimiters: make(map[string]*rate.Limiter),
@@ -62,19 +60,10 @@ func NewManager() *Manager {
return m
}
// run processes client registration and event broadcasting
// run processes client unregistration and event broadcasting
func (m *Manager) run() {
for {
select {
case client := <-m.register:
m.mu.Lock()
if m.clients[client.InstanceName] == nil {
m.clients[client.InstanceName] = make(map[string]*Client)
}
m.clients[client.InstanceName][client.ID] = client
m.mu.Unlock()
log.Printf("SSE: Client %s registered for instance %s", client.ID, client.InstanceName)
case client := <-m.unregister:
m.mu.Lock()
if clients, ok := m.clients[client.InstanceName]; ok {
@@ -85,7 +74,7 @@ func (m *Manager) run() {
}
close(client.Channel)
m.mu.Unlock()
log.Printf("SSE: Client %s unregistered", client.ID)
slog.Info("client unregistered", "component", "sse", "client", client.ID)
case event := <-m.broadcast:
m.mu.RLock()
@@ -102,7 +91,7 @@ func (m *Manager) run() {
case client.Channel <- event:
default:
// Client channel full, skip
log.Printf("SSE: Client %s channel full, skipping event", client.ID)
slog.Info("client channel full, skipping event", "component", "sse", "client", client.ID)
}
}
}
@@ -114,7 +103,7 @@ func (m *Manager) run() {
case client.Channel <- event:
default:
// Client channel full, skip
log.Printf("SSE: Client %s channel full, skipping event", client.ID)
slog.Info("client channel full, skipping event", "component", "sse", "client", client.ID)
}
}
}
@@ -207,7 +196,14 @@ func (m *Manager) RegisterClient(instanceName string, filters EventFilters) *Cli
Cancel: cancel,
}
m.register <- client
m.mu.Lock()
if m.clients[instanceName] == nil {
m.clients[instanceName] = make(map[string]*Client)
}
m.clients[instanceName][client.ID] = client
m.mu.Unlock()
slog.Info("client registered", "component", "sse", "client", client.ID, "instance", instanceName)
return client
}
@@ -230,7 +226,7 @@ func (m *Manager) Broadcast(event *Event) {
select {
case m.broadcast <- event:
default:
log.Printf("SSE: Broadcast channel full, dropping event %s", event.ID)
slog.Error("broadcast channel full, dropping event", "component", "sse", "event", event.ID)
}
}
@@ -269,4 +265,4 @@ func generateEventID() string {
// JSON marshals the event to JSON
func (e *Event) JSON() ([]byte, error) {
return json.Marshal(e)
}
}

View File

@@ -349,4 +349,4 @@ func BenchmarkBroadcast(b *testing.B) {
for _, client := range clients {
manager.UnregisterClient(client)
}
}
}

View File

@@ -5,7 +5,7 @@ import (
"context"
"encoding/json"
"fmt"
"log"
"log/slog"
"os/exec"
"strings"
"sync"
@@ -120,7 +120,7 @@ func (w *KubectlWatcher) Start() error {
w.wg.Add(1)
go w.watchResource("services", w.parseServiceEvent)
log.Printf("SSE: Started kubectl watchers for instance %s", w.instanceName)
slog.Info("started kubectl watchers", "component", "sse", "instance", w.instanceName)
return nil
}
@@ -148,13 +148,13 @@ func (w *KubectlWatcher) watchResource(resourceType string, parser func([]byte,
stdout, err := cmd.StdoutPipe()
if err != nil {
log.Printf("SSE: Failed to create stdout pipe for %s watch: %v", resourceType, err)
slog.Error("failed to create stdout pipe", "component", "sse", "resource", resourceType, "error", err)
w.handleWatchError(resourceType)
continue
}
if err := cmd.Start(); err != nil {
log.Printf("SSE: Failed to start %s watch: %v", resourceType, err)
slog.Error("failed to start watch", "component", "sse", "resource", resourceType, "error", err)
w.handleWatchError(resourceType)
continue
}
@@ -170,14 +170,14 @@ func (w *KubectlWatcher) watchResource(resourceType string, parser func([]byte,
}
if err := scanner.Err(); err != nil {
log.Printf("SSE: %s watch scanner error: %v", resourceType, err)
slog.Error("watch scanner error", "component", "sse", "resource", resourceType, "error", err)
}
cmd.Wait()
_ = cmd.Wait()
// If context not cancelled, restart after a delay
if w.ctx.Err() == nil {
log.Printf("SSE: Restarting %s watcher for instance %s", resourceType, w.instanceName)
slog.Info("restarting watcher", "component", "sse", "resource", resourceType, "instance", w.instanceName)
time.Sleep(5 * time.Second)
}
}
@@ -186,7 +186,7 @@ func (w *KubectlWatcher) watchResource(resourceType string, parser func([]byte,
// parsePodEvent parses pod watch events
func (w *KubectlWatcher) parsePodEvent(data []byte, resourceType string) {
var event struct {
Type string `json:"type"` // ADDED, MODIFIED, DELETED
Type string `json:"type"` // ADDED, MODIFIED, DELETED
Object struct {
Metadata struct {
Name string `json:"name"`
@@ -503,7 +503,7 @@ func (w *KubectlWatcher) handleWatchError(resourceType string) {
func (w *KubectlWatcher) Stop() {
w.cancel()
w.wg.Wait()
log.Printf("SSE: Stopped kubectl watchers for instance %s", w.instanceName)
slog.Info("stopped kubectl watchers", "component", "sse", "instance", w.instanceName)
}
// TalosWatcher watches Talos events using talosctl
@@ -532,7 +532,7 @@ func NewTalosWatcher(instanceName, talosconfig, nodeIP string, manager *Manager)
// Start begins watching Talos events
func (w *TalosWatcher) Start() error {
go w.watchEvents()
log.Printf("SSE: Started talos watcher for instance %s", w.instanceName)
slog.Info("started talos watcher", "component", "sse", "instance", w.instanceName)
return nil
}
@@ -557,13 +557,13 @@ func (w *TalosWatcher) watchEvents() {
stdout, err := cmd.StdoutPipe()
if err != nil {
log.Printf("SSE: Failed to create stdout pipe for Talos events: %v", err)
slog.Error("failed to create stdout pipe for talos events", "component", "sse", "error", err)
time.Sleep(10 * time.Second)
continue
}
if err := cmd.Start(); err != nil {
log.Printf("SSE: Failed to start Talos event watch: %v", err)
slog.Error("failed to start talos event watch", "component", "sse", "error", err)
time.Sleep(10 * time.Second)
continue
}
@@ -599,11 +599,11 @@ func (w *TalosWatcher) watchEvents() {
}
}
cmd.Wait()
_ = cmd.Wait()
// If context not cancelled, restart after a delay
if w.ctx.Err() == nil {
log.Printf("SSE: Restarting talos watcher for instance %s", w.instanceName)
slog.Info("restarting talos watcher", "component", "sse", "instance", w.instanceName)
time.Sleep(10 * time.Second)
}
}
@@ -612,5 +612,5 @@ func (w *TalosWatcher) watchEvents() {
// Stop stops the watcher
func (w *TalosWatcher) Stop() {
w.cancel()
log.Printf("SSE: Stopped talos watcher for instance %s", w.instanceName)
}
slog.Info("stopped talos watcher", "component", "sse", "instance", w.instanceName)
}

View File

@@ -431,4 +431,4 @@ func BenchmarkJSONParsing(b *testing.B) {
for i := 0; i < b.N; i++ {
watcher.parsePodEvent([]byte(podJSON), "test-instance")
}
}
}

View File

@@ -25,8 +25,7 @@ func TestNewKubectl(t *testing.T) {
k := NewKubectl(tt.kubeconfigPath)
if k == nil {
t.Fatal("NewKubectl() returned nil")
}
if k.kubeconfigPath != tt.kubeconfigPath {
} else if k.kubeconfigPath != tt.kubeconfigPath {
t.Errorf("kubeconfigPath = %q, want %q", k.kubeconfigPath, tt.kubeconfigPath)
}
})
@@ -209,9 +208,8 @@ func TestKubectlGetDeployment(t *testing.T) {
if err == nil {
if depInfo == nil {
t.Fatal("GetDeployment() returned nil without error")
}
// Desired should be non-negative
if depInfo.Desired < 0 {
} else if depInfo.Desired < 0 {
// Desired should be non-negative
t.Errorf("Desired = %d, should be non-negative", depInfo.Desired)
}
}
@@ -244,19 +242,19 @@ func TestKubectlGetReplicas(t *testing.T) {
if err == nil {
if replicaInfo == nil {
t.Fatal("GetReplicas() returned nil without error")
}
// All values should be non-negative
if replicaInfo.Desired < 0 {
t.Error("Desired < 0")
}
if replicaInfo.Current < 0 {
t.Error("Current < 0")
}
if replicaInfo.Ready < 0 {
t.Error("Ready < 0")
}
if replicaInfo.Available < 0 {
t.Error("Available < 0")
} else {
if replicaInfo.Desired < 0 {
t.Error("Desired < 0")
}
if replicaInfo.Current < 0 {
t.Error("Current < 0")
}
if replicaInfo.Ready < 0 {
t.Error("Ready < 0")
}
if replicaInfo.Available < 0 {
t.Error("Available < 0")
}
}
}
})
@@ -775,4 +773,3 @@ func TestKubectlGetPodsByLabel(t *testing.T) {
})
}
}

View File

@@ -5,6 +5,7 @@ import (
"encoding/json"
"fmt"
"os/exec"
"regexp"
"runtime"
"strings"
"time"
@@ -385,6 +386,30 @@ func (t *Talosctl) Upgrade(nodeIP, image string, preserve bool) error {
return nil
}
// Reboot reboots a node. The node restarts without wiping state.
//
// The command runs `talosctl reboot --nodes <nodeIP>` with a two-minute
// timeout. A failing command whose combined output mentions
// "connection refused", "Unavailable" or "EOF" is treated as success, because
// such connection errors are expected while the node goes down. Any other
// failure is returned together with the command output.
func (t *Talosctl) Reboot(nodeIP string) error {
	args := t.buildArgs([]string{"reboot", "--nodes", nodeIP})

	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
	defer cancel()

	raw, runErr := exec.CommandContext(ctx, "talosctl", args...).CombinedOutput()
	if runErr == nil {
		return nil
	}

	outputStr := string(raw)
	// Connection errors are expected — the node is rebooting
	for _, marker := range []string{"connection refused", "Unavailable", "EOF"} {
		if strings.Contains(outputStr, marker) {
			return nil
		}
	}
	return fmt.Errorf("talosctl reboot failed: %w\nOutput: %s", runErr, outputStr)
}
// Rollback reverts a node to its previous Talos version.
// Talos uses an A/B image scheme, so rollback restores the previous boot image.
func (t *Talosctl) Rollback(nodeIP string) error {
@@ -462,3 +487,172 @@ func GetClientInfo() (*ClientInfo, error) {
Arch: arch,
}, nil
}
// ServiceStatus represents the health status of a Talos service,
// as parsed from one row of `talosctl service` output.
type ServiceStatus struct {
// ID is the service name (the SERVICE column).
ID string `json:"id"`
// State is the value of the STATE column.
State string `json:"state"`
// Healthy is set by ParseServiceOutput to true only when the HEALTH column is "OK".
Healthy bool `json:"healthy"`
// HealthMessage holds the LAST EVENT text of the row; empty when the row has none.
HealthMessage string `json:"healthMessage"`
}
// DmesgError represents a critical error found in kernel messages
type DmesgError struct {
// Severity classifies the entry; ParseDmesgErrors always sets it to "error".
Severity string `json:"severity"`
// Message is the text after the bracketed timestamp, or the whole line
// when no timestamp could be extracted.
Message string `json:"message"`
// Timestamp is the text found between "[" and "]:" on the line; empty when absent.
Timestamp string `json:"timestamp"`
}
// GetServices runs `talosctl service --nodes <nodeIP>` with a ten-second
// timeout and returns the service statuses parsed from its output. When the
// command fails, the error wraps the cause and includes the combined output.
func (t *Talosctl) GetServices(nodeIP string) ([]ServiceStatus, error) {
	args := t.buildArgs([]string{"service", "--nodes", nodeIP})

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	raw, runErr := exec.CommandContext(ctx, "talosctl", args...).CombinedOutput()
	if runErr != nil {
		return nil, fmt.Errorf("talosctl service failed: %w\nOutput: %s", runErr, string(raw))
	}
	return ParseServiceOutput(string(raw)), nil
}
// ParseServiceOutput parses the tabular output of `talosctl service` into one
// ServiceStatus per data row.
//
// Each row is split on whitespace and is expected to look like
//
//	NODE SERVICE STATE HEALTH <duration> ago <last event...>
//
// The header row, blank lines and rows with fewer than six fields are
// skipped. Because rows are examined field by field, leading indentation and
// a trailing "\r" do not affect the result. A service is reported healthy
// only when its HEALTH column is exactly "OK". The result is nil when no data
// rows are found.
func ParseServiceOutput(output string) []ServiceStatus {
	var services []ServiceStatus

	for _, line := range strings.Split(output, "\n") {
		fields := strings.Fields(line)

		// Minimum: node + service + state + health + "<duration>" + "ago" = 6 fields.
		// This also discards blank and whitespace-only lines.
		if len(fields) < 6 {
			continue
		}

		// Recognise the header by its first column instead of a raw line
		// prefix: an indented header must not be parsed as a service, and a
		// node whose name merely starts with "NODE" must not be dropped.
		if fields[0] == "NODE" {
			continue
		}

		// fields[4] and fields[5] are the last-change duration and the word
		// "ago"; the last event text, if any, is everything after them.
		var healthMessage string
		if len(fields) > 6 {
			healthMessage = strings.Join(fields[6:], " ")
		}

		services = append(services, ServiceStatus{
			ID:            fields[1],
			State:         fields[2],
			Healthy:       fields[3] == "OK",
			HealthMessage: healthMessage,
		})
	}
	return services
}
// GetDmesg runs `talosctl dmesg --nodes <nodeIP>` with a fifteen-second
// timeout and returns the command's combined output unmodified. When the
// command fails, the error wraps the cause and includes that output.
func (t *Talosctl) GetDmesg(nodeIP string) (string, error) {
	args := t.buildArgs([]string{"dmesg", "--nodes", nodeIP})

	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()

	raw, runErr := exec.CommandContext(ctx, "talosctl", args...).CombinedOutput()
	if runErr != nil {
		return "", fmt.Errorf("talosctl dmesg failed: %w\nOutput: %s", runErr, string(raw))
	}
	return string(raw), nil
}
// dmesg error patterns for hardware issues
// ParseDmesgErrors matches each entry as a case-insensitive substring of a
// dmesg line.
var dmesgErrorPatterns = []string{
"I/O error",
"Medium Error",
"failed command:",
"auto reallocate failed",
"memory error",
"machine check",
"ECC error",
}
// ataErrorRegex matches, ignoring case, lines that contain "ata" followed by
// one or more digits and, later on the same line, the word "error".
var ataErrorRegex = regexp.MustCompile(`(?i)ata\d+.*error`)
// ParseDmesgErrors scans dmesg output line by line and returns one DmesgError
// for every distinct message that matches a known hardware-error pattern
// (dmesgErrorPatterns, compared case-insensitively, or ataErrorRegex).
//
// Entries keep the order of their first occurrence. Lines whose extracted
// message was already reported are dropped, even if their timestamps differ.
// The result is nil when nothing matches.
func ParseDmesgErrors(raw string) []DmesgError {
	var found []DmesgError
	reported := make(map[string]bool)

	// isCritical reports whether a line matches any hardware-error pattern.
	isCritical := func(line string) bool {
		lowered := strings.ToLower(line)
		for _, pattern := range dmesgErrorPatterns {
			if strings.Contains(lowered, strings.ToLower(pattern)) {
				return true
			}
		}
		return ataErrorRegex.MatchString(line)
	}

	for _, line := range strings.Split(raw, "\n") {
		if line == "" || !isCritical(line) {
			continue
		}

		// Line format is "IP: facility: level: [TIMESTAMP]: message".
		ts, msg := parseDmesgLine(line)

		// Report each distinct message only once.
		if reported[msg] {
			continue
		}
		reported[msg] = true

		found = append(found, DmesgError{
			Severity:  "error",
			Message:   msg,
			Timestamp: ts,
		})
	}
	return found
}
// parseDmesgLine splits a talosctl dmesg line into its timestamp and message.
// Format: "192.168.8.32: kern: err: [2026-05-25T07:12:06.034Z]: I/O error, dev sdb..."
//
// The timestamp is the text between the first "[" and the first "]:" when
// that "]:" comes after the "["; otherwise it is empty. The message is the
// trimmed text following "]:". When no message can be extracted, the whole
// line is returned as the message.
func parseDmesgLine(line string) (timestamp, message string) {
	openIdx := strings.Index(line, "[")
	closeIdx := strings.Index(line, "]:")

	// Without a "[...]:" segment the line is passed through as the message.
	if openIdx < 0 || closeIdx <= openIdx {
		return "", line
	}

	timestamp = line[openIdx+1 : closeIdx]

	// Everything after "]:" is the message; an empty remainder falls back to
	// the full line while the timestamp is kept.
	message = strings.TrimSpace(line[closeIdx+2:])
	if message == "" {
		message = line
	}
	return timestamp, message
}

View File

@@ -11,8 +11,7 @@ func TestNewTalosctl(t *testing.T) {
tc := NewTalosctl()
if tc == nil {
t.Fatal("NewTalosctl() returned nil")
}
if tc.talosconfigPath != "" {
} else if tc.talosconfigPath != "" {
t.Error("talosconfigPath should be empty for NewTalosctl()")
}
})
@@ -22,8 +21,7 @@ func TestNewTalosctl(t *testing.T) {
tc := NewTalosconfigWithConfig(configPath)
if tc == nil {
t.Fatal("NewTalosconfigWithConfig() returned nil")
}
if tc.talosconfigPath != configPath {
} else if tc.talosconfigPath != configPath {
t.Errorf("talosconfigPath = %q, want %q", tc.talosconfigPath, configPath)
}
})
@@ -433,9 +431,9 @@ Server:
want: "v1.11.5",
},
{
name: "fallback to Talos line when no Tag present",
name: "fallback to Talos line when no Tag present",
output: `Talos v1.12.0`,
want: "v1.12.0",
want: "v1.12.0",
},
}
@@ -619,6 +617,183 @@ func TestGetClientInfo(t *testing.T) {
}
}
// TestParseServiceOutput is a table-driven test for ParseServiceOutput. Each
// case supplies raw `talosctl service` output, the number of services expected
// back, and an optional callback with further assertions on the parsed result.
func TestParseServiceOutput(t *testing.T) {
tests := []struct {
name string
output string
wantLen int
checkSvc func(t *testing.T, services []ServiceStatus)
}{
// All rows report OK health and a Running state.
{
name: "healthy node",
output: `NODE SERVICE STATE HEALTH LAST CHANGE LAST EVENT
192.168.8.33 apid Running OK 172h15m25s ago Health check successful
192.168.8.33 etcd Running OK 172h14m56s ago Health check successful
192.168.8.33 kubelet Running OK 172h15m16s ago Health check successful`,
wantLen: 3,
checkSvc: func(t *testing.T, services []ServiceStatus) {
for _, svc := range services {
if !svc.Healthy {
t.Errorf("service %s should be healthy", svc.ID)
}
if svc.State != "Running" {
t.Errorf("service %s state = %q, want Running", svc.ID, svc.State)
}
}
},
},
// A "Fail" health value must yield Healthy == false and keep the event text.
{
name: "unhealthy etcd",
output: `NODE SERVICE STATE HEALTH LAST CHANGE LAST EVENT
192.168.8.32 etcd Running Fail 42m14s ago Health check failed: context deadline exceeded
192.168.8.32 kubelet Running OK 37m42s ago Health check successful`,
wantLen: 2,
checkSvc: func(t *testing.T, services []ServiceStatus) {
for _, svc := range services {
if svc.ID == "etcd" {
if svc.Healthy {
t.Error("etcd should be unhealthy")
}
if svc.HealthMessage != "Health check failed: context deadline exceeded" {
t.Errorf("etcd health message = %q", svc.HealthMessage)
}
}
if svc.ID == "kubelet" && !svc.Healthy {
t.Error("kubelet should be healthy")
}
}
},
},
// A "?" health value is not "OK", so the service must not count as healthy.
{
name: "services with unknown health",
output: `NODE SERVICE STATE HEALTH LAST CHANGE LAST EVENT
192.168.8.32 dashboard Running ? 42m47s ago Process Process(["/sbin/dashboard"]) started with PID 2237`,
wantLen: 1,
checkSvc: func(t *testing.T, services []ServiceStatus) {
if services[0].Healthy {
t.Error("service with ? health should not be marked healthy")
}
},
},
{
name: "empty output",
output: "",
wantLen: 0,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
services := ParseServiceOutput(tt.output)
if len(services) != tt.wantLen {
t.Errorf("ParseServiceOutput() returned %d services, want %d", len(services), tt.wantLen)
// Skip the callback: it may index into a slice of the wrong length.
return
}
if tt.checkSvc != nil {
tt.checkSvc(t, services)
}
})
}
}
// TestParseDmesgErrors is a table-driven test for ParseDmesgErrors. Each case
// supplies raw dmesg text, the number of errors expected back, and an optional
// callback with further assertions on the parsed result.
func TestParseDmesgErrors(t *testing.T) {
tests := []struct {
name string
input string
wantLen int
check func(t *testing.T, errors []DmesgError)
}{
// Four lines, each hitting a different substring pattern.
{
name: "disk I/O errors",
input: `192.168.8.32: kern: err: [2026-05-25T07:12:06.034Z]: I/O error, dev sdb, sector 4873848
192.168.8.32: kern: info: [2026-05-25T07:12:06.040Z]: sd 1:0:0:0: [sdb] Sense Key : Medium Error [current]
192.168.8.32: kern: err: [2026-05-25T07:12:10.886Z]: ata1.00: failed command: READ FPDMA QUEUED
192.168.8.32: kern: info: [2026-05-25T07:12:14.072Z]: sd 1:0:0:0: Add. Sense: Unrecovered read error - auto reallocate failed`,
wantLen: 4,
check: func(t *testing.T, errors []DmesgError) {
if errors[0].Timestamp != "2026-05-25T07:12:06.034Z" {
t.Errorf("timestamp = %q", errors[0].Timestamp)
}
if errors[0].Severity != "error" {
t.Errorf("severity = %q, want error", errors[0].Severity)
}
},
},
// Matched through ataErrorRegex rather than a substring pattern.
{
name: "ata error pattern",
input: `192.168.8.32: kern: err: [2026-05-25T07:12:06.034Z]: ata1.00: error: { UNC }`,
wantLen: 1,
},
{
name: "no errors in normal output",
input: `192.168.8.32: kern: info: [2026-05-25T07:11:00.000Z]: Linux version 6.18.24-talos
192.168.8.32: kern: info: [2026-05-25T07:11:00.100Z]: Command line: init_on_alloc=1
192.168.8.32: kern: info: [2026-05-25T07:11:01.000Z]: sdb: sdb1 sdb2 sdb3 sdb4`,
wantLen: 0,
},
{
name: "empty input",
input: "",
wantLen: 0,
},
// Same message at two different timestamps must be reported once.
{
name: "deduplicates identical messages",
input: `192.168.8.32: kern: err: [2026-05-25T07:12:06.034Z]: I/O error, dev sdb, sector 4873848
192.168.8.32: kern: err: [2026-05-25T07:12:10.034Z]: I/O error, dev sdb, sector 4873848`,
wantLen: 1,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
errors := ParseDmesgErrors(tt.input)
if len(errors) != tt.wantLen {
t.Errorf("ParseDmesgErrors() returned %d errors, want %d", len(errors), tt.wantLen)
// Log what was actually returned to make a count mismatch easy to diagnose.
for _, e := range errors {
t.Logf(" error: %s", e.Message)
}
return
}
if tt.check != nil {
tt.check(t, errors)
}
})
}
}
// TestParseDmesgLine is a table-driven test for parseDmesgLine, checking the
// timestamp and message it extracts from a single dmesg line.
func TestParseDmesgLine(t *testing.T) {
tests := []struct {
name string
line string
wantTimestamp string
wantMessage string
}{
{
name: "standard talos dmesg format",
line: "192.168.8.32: kern: err: [2026-05-25T07:12:06.034Z]: I/O error, dev sdb, sector 4873848",
wantTimestamp: "2026-05-25T07:12:06.034Z",
wantMessage: "I/O error, dev sdb, sector 4873848",
},
// wantTimestamp is left at its zero value: no timestamp is expected.
{
name: "line without brackets",
line: "some plain log line",
wantMessage: "some plain log line",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
ts, msg := parseDmesgLine(tt.line)
if ts != tt.wantTimestamp {
t.Errorf("timestamp = %q, want %q", ts, tt.wantTimestamp)
}
if msg != tt.wantMessage {
t.Errorf("message = %q, want %q", msg, tt.wantMessage)
}
})
}
}
// Helper function for interface filtering test
func containsAny(s string, substrs []string) bool {
for _, substr := range substrs {

View File

@@ -12,8 +12,7 @@ func TestNewYQ(t *testing.T) {
yq := NewYQ()
if yq == nil {
t.Fatal("NewYQ() returned nil")
}
if yq.yqPath == "" {
} else if yq.yqPath == "" {
t.Error("yqPath should not be empty")
}
})

View File

@@ -2,7 +2,7 @@ package main
import (
"fmt"
"log"
"log/slog"
"net/http"
"os"
"os/signal"
@@ -16,6 +16,8 @@ import (
v1 "github.com/wild-cloud/wild-central/daemon/internal/api/v1"
"github.com/wild-cloud/wild-central/daemon/internal/backup"
"github.com/wild-cloud/wild-central/daemon/internal/instance"
"github.com/wild-cloud/wild-central/daemon/internal/logging"
"github.com/wild-cloud/wild-central/daemon/internal/operations"
)
var startTime time.Time
@@ -33,6 +35,11 @@ func splitAndTrim(s string, sep string) []string {
}
func main() {
// Initialize structured logging
slog.SetDefault(slog.New(logging.NewConsoleHandler(os.Stderr, &slog.HandlerOptions{
Level: slog.LevelInfo,
})))
// Record start time
startTime = time.Now()
@@ -45,27 +52,31 @@ func main() {
// Get apps directory from environment or use default
appsDir := os.Getenv("WILD_DIRECTORY")
if appsDir == "" {
// Default apps directory
appsDir = "/opt/wild-cloud/apps"
log.Printf("WILD_DIRECTORY not set, using default apps directory: %s", appsDir)
} else {
// If WILD_DIRECTORY is set, use it as-is for backward compatibility
// (it might point to the old directory structure with apps/ subdirectory)
log.Printf("Using WILD_DIRECTORY for apps: %s", appsDir)
}
slog.Info("configured directories", "dataDir", dataDir, "appsDir", appsDir)
// Create API handler with all dependencies
api, err := v1.NewAPI(dataDir, appsDir)
if err != nil {
log.Fatalf("Failed to initialize API: %v", err)
slog.Error("failed to initialize API", "error", err)
os.Exit(1)
}
// Fail any operations left running from a previous API process
instanceMgr := instance.NewManager(dataDir)
opsMgr := operations.NewManager(dataDir)
if instances, err := instanceMgr.ListInstances(); err == nil {
for _, name := range instances {
if err := opsMgr.FailOrphaned(name); err != nil {
slog.Warn("failed to clean orphaned operations", "instance", name, "error", err)
}
}
}
// Start central status SSE broadcaster
api.StartCentralStatusBroadcaster(startTime)
log.Println("Central status broadcaster started")
// Start backup scheduler
instanceMgr := instance.NewManager(dataDir)
slog.Info("central status broadcaster started")
scheduler := backup.NewScheduler(dataDir, instanceMgr)
scheduler.Start()
@@ -89,9 +100,8 @@ func main() {
var allowedOrigins []string
if corsOrigins := os.Getenv("WILD_CORS_ORIGINS"); corsOrigins != "" {
// Use explicitly configured origins
allowedOrigins = splitAndTrim(corsOrigins, ",")
log.Printf("CORS configured with explicit origins: %v", allowedOrigins)
slog.Info("CORS configured with explicit origins", "origins", allowedOrigins)
} else {
// Auto-detect origins based on hostname
allowedOrigins = []string{
@@ -116,7 +126,7 @@ func main() {
fmt.Sprintf("http://%s:5173", hostname),
fmt.Sprintf("http://%s:5174", hostname),
)
log.Printf("Added hostname-based CORS origins for: %s", hostname)
slog.Info("added hostname-based CORS origins", "hostname", hostname)
}
// Add development server ports
@@ -129,7 +139,7 @@ func main() {
"http://127.0.0.1:3000",
)
log.Printf("CORS configured with auto-detected origins: %v", allowedOrigins)
slog.Info("CORS configured with auto-detected origins", "count", len(allowedOrigins))
}
corsHandler := cors.New(cors.Options{
@@ -163,9 +173,7 @@ func main() {
port := 5055
addr := fmt.Sprintf("%s:%d", host, port)
log.Printf("Starting wild-central daemon on %s", addr)
log.Printf("Data directory: %s", dataDir)
log.Printf("Apps directory: %s", appsDir)
slog.Info("daemon started", "addr", addr)
// Set up signal handling for graceful shutdown
sigChan := make(chan os.Signal, 1)
@@ -174,13 +182,14 @@ func main() {
// Start HTTP server in goroutine
go func() {
if err := http.ListenAndServe(addr, handler); err != nil {
log.Fatal("Server failed to start:", err)
slog.Error("server failed to start", "error", err)
os.Exit(1)
}
}()
// Wait for shutdown signal
<-sigChan
log.Println("Shutting down gracefully...")
slog.Info("shutting down")
scheduler.Stop()
log.Println("Shutdown complete")
slog.Info("shutdown complete")
}

View File

@@ -160,6 +160,19 @@ api_put() {
rm -f "$tmpfile"
}
# Sends a PATCH request with a JSON body to ${API_URL}<path>.
#   $1 - request path appended to API_URL
#   $2 - request body (sent as application/json)
# Sets the HTTP_CODE and RESP globals from the response.
api_patch() {
    local path="$1"
    local body="$2"
    local tmpfile
    tmpfile=$(mktemp)

    # The response body goes to the temp file; only the status code is captured.
    HTTP_CODE=$(
        curl -s -w '%{http_code}' -o "$tmpfile" \
            -X PATCH \
            -H "Content-Type: application/json" \
            -d "$body" \
            "${API_URL}${path}"
    )

    RESP=$(cat "$tmpfile")
    rm -f "$tmpfile"
}
# Makes a DELETE request. Sets HTTP_CODE and RESP globals.
api_delete() {
local path="$1"

View File

@@ -0,0 +1,78 @@
#!/usr/bin/env bash
# Test: Config changes and drift detection
# Verifies: PATCH config, compilation drift detected, compile clears drift, deploy succeeds
# Idempotent: restores original config at end
# Note: Uses db.name (not storage) because PVC storage can only expand, never shrink

APP_CONFIG_PATH="/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/config"
APP_ENHANCED_PATH="/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/enhanced"
DB_INIT_FILE="${DATA_DIR}/instances/${INSTANCE}/apps/${APP_NAME}/db-init-job.yaml"

# --- Read current config and capture original db.name ---
test_start "Config: Read current config"
api_get "$APP_CONFIG_PATH"
assert_http "200" "GET app config should return 200"
ORIGINAL_DB_NAME=$(echo "$RESP" | jq -r '.db.name // empty' 2>/dev/null)
# Fall back to a fixed name when the config has no db.name to restore.
if [[ -z "$ORIGINAL_DB_NAME" ]]; then
    ORIGINAL_DB_NAME="e2e_test_app"
fi

# --- PATCH config: change db.name ---
test_start "Config: PATCH db.name to e2e_drift_test"
api_patch "$APP_CONFIG_PATH" '{"config":{"db":{"name":"e2e_drift_test"}}}'
assert_http "200" "PATCH config should return 200"

test_start "Config: Verify config changed"
api_get "$APP_CONFIG_PATH"
NEW_DB_NAME=$(echo "$RESP" | jq -r '.db.name // empty' 2>/dev/null)
assert_eq "$NEW_DB_NAME" "e2e_drift_test" "db.name should be e2e_drift_test after PATCH"

# --- Check drift: config changed but not recompiled ---
test_start "Config: Drift detected after config change"
api_get "$APP_ENHANCED_PATH"
COMP_DRIFTED=$(echo "$RESP" | jq -r '.drift.compilation.drifted // false' 2>/dev/null)
assert_eq "$COMP_DRIFTED" "true" "Compilation drift should be detected"

# --- Compile to clear compilation drift ---
test_start "Config: Compile clears compilation drift"
api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
assert_http "200" "Compile should return 200"

test_start "Config: Verify no compilation drift after compile"
api_get "$APP_ENHANCED_PATH"
COMP_DRIFTED=$(echo "$RESP" | jq -r '.drift.compilation.drifted // false' 2>/dev/null)
assert_eq "$COMP_DRIFTED" "false" "Compilation drift should be cleared after compile"

# --- Verify compiled db-init-job.yaml has new db name ---
test_start "Config: Compiled db-init-job.yaml has e2e_drift_test"
if grep -q "e2e_drift_test" "$DB_INIT_FILE" 2>/dev/null; then
    test_pass
else
    test_fail "db-init-job.yaml should contain e2e_drift_test after compile"
fi

# --- Cleanup: restore original db.name, recompile, deploy ---
echo " Restoring original db.name (${ORIGINAL_DB_NAME})..."
# Build the payload with jq so the captured name is JSON-escaped; splicing it
# into a string would produce invalid JSON if it contained quotes or backslashes.
RESTORE_BODY=$(jq -cn --arg name "$ORIGINAL_DB_NAME" '{config: {db: {name: $name}}}')
api_patch "$APP_CONFIG_PATH" "$RESTORE_BODY"
api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"

test_start "Config: Deploy with restored config"
if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
    test_pass
else
    test_fail "Deploy failed when restoring original config"
fi

test_start "Config: Pods ready after restore"
if wait_for_pods "$APP_NAME" 120; then
    test_pass
else
    test_fail "Pods not ready after config restore deploy"
fi

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env bash
# Test: Fetch from Wild Directory and redeploy
# Verifies: fetch re-copies package from source, deploy succeeds after fetch
# Idempotent: leaves app in same state as before
#
# Uses names that are not defined in this script: DATA_DIR, INSTANCE, APP_NAME,
# DEPLOY_TIMEOUT, RESP, and the test_*/assert_*/api_*/wait_* helpers.
# Presumably these come from the e2e runner that loads this file -- TODO confirm.
MANIFEST_PATH="${DATA_DIR}/instances/${INSTANCE}/apps/${APP_NAME}/manifest.yaml"
PACKAGE_DIR="${DATA_DIR}/instances/${INSTANCE}/apps/${APP_NAME}/.package"
# --- Record current version ---
# Takes the second whitespace-separated field of the first manifest line that
# starts with "version:".
test_start "Fetch: Record current version"
CURRENT_VERSION=$(grep '^version:' "$MANIFEST_PATH" 2>/dev/null | head -1 | awk '{print $2}')
assert_not_empty "$CURRENT_VERSION" "Should have a current version in manifest"
# --- Fetch from Wild Directory ---
test_start "Fetch: Re-fetch from source"
api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/fetch"
assert_http "200" "Fetch should return 200"
# --- Verify .package directory exists (source backup) ---
test_start "Fetch: .package directory exists after fetch"
if [[ -d "$PACKAGE_DIR" ]]; then
test_pass
else
test_fail ".package directory should exist after fetch"
fi
# --- Verify manifest version still present ---
# NOTE(review): this only asserts that a version is present; CURRENT_VERSION
# recorded above is not compared with AFTER_VERSION anywhere in this script.
test_start "Fetch: Version preserved after fetch"
AFTER_VERSION=$(grep '^version:' "$MANIFEST_PATH" 2>/dev/null | head -1 | awk '{print $2}')
assert_not_empty "$AFTER_VERSION" "Version should still be present after fetch"
# --- Check source drift is cleared ---
# The jq "// false" alternative makes a missing or null drift field read as "false".
test_start "Fetch: No source drift after fetch"
api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/enhanced"
SRC_DRIFTED=$(echo "$RESP" | jq -r '.drift.source.drifted // false' 2>/dev/null)
assert_eq "$SRC_DRIFTED" "false" "Source drift should be false after fresh fetch"
# --- Deploy after fetch ---
test_start "Fetch: Deploy after fetch"
if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
test_pass
else
test_fail "Deploy failed after fetch"
fi
test_start "Fetch: Pods ready after deploy"
if wait_for_pods "$APP_NAME" 120; then
test_pass
else
test_fail "Pods not ready after fetch+deploy"
fi
# --- Verify status OK ---
test_start "Fetch: Status OK after fetch+deploy"
api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/status"
assert_http "200" "Status should return 200 after fetch+deploy"

View File

@@ -0,0 +1,71 @@
#!/usr/bin/env bash
# Test: App dependencies — add with explicit requiredAppMappings, verify resolution
# Verifies: dependency mappings resolve correctly, secrets from deps are present
# Idempotent: deletes and re-adds app, leaves it deployed
#
# Uses names that are not defined in this script: DATA_DIR, INSTANCE, APP_NAME,
# KC, DELETE_TIMEOUT, DEPLOY_TIMEOUT, RESP, and the test_*/assert_*/api_*/
# start_async_*/wait_for_pods helpers. Presumably these come from the e2e
# runner that loads this file -- TODO confirm.
MANIFEST_PATH="${DATA_DIR}/instances/${INSTANCE}/apps/${APP_NAME}/manifest.yaml"

# --- Delete existing app to test fresh add with mappings ---
# A failed delete is tolerated (|| true); the add below is the actual test.
echo " Deleting ${APP_NAME} to test dependency add..."
start_async_delete_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}" "$DELETE_TIMEOUT" || true

# Wait (up to 60s, polling every 5s) for the namespace to fully terminate.
# A failing kubectl lookup is treated as "NotFound", i.e. the namespace is gone.
WAIT=0
while (( WAIT < 60 )); do
    NS_STATUS=$($KC get namespace "$APP_NAME" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
    if [[ "$NS_STATUS" == "NotFound" ]]; then
        break
    fi
    sleep 5
    WAIT=$((WAIT + 5))
done
# Surface a timeout instead of continuing silently; the add below may then
# fail for reasons unrelated to dependency handling.
if (( WAIT >= 60 )); then
    echo " WARNING: namespace ${APP_NAME} was not confirmed deleted within 60s (last phase: ${NS_STATUS}); continuing"
fi

# --- Add with explicit dependency mapping ---
test_start "Deps: Add app with requiredAppMappings"
api_post "/api/v1/instances/${INSTANCE}/apps" \
    "{\"name\":\"${APP_NAME}\",\"requiredAppMappings\":{\"postgres\":\"postgres\"}}"
assert_http_one_of "200 201" "Add with mappings should succeed"

# --- Verify manifest has installedAs ---
test_start "Deps: Manifest has installedAs for postgres"
if grep -qF "installedAs: postgres" "$MANIFEST_PATH" 2>/dev/null; then
    test_pass
else
    test_fail "manifest.yaml should have installedAs: postgres"
fi

# --- Verify config has db.host referencing postgres ---
# Only checks that db.host is non-empty; its value is not inspected.
test_start "Deps: Config has db.host"
api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/config"
DB_HOST=$(echo "$RESP" | jq -r '.db.host // empty' 2>/dev/null)
assert_not_empty "$DB_HOST" "db.host should be set from postgres dependency"

# --- Deploy ---
test_start "Deps: Deploy app with dependencies"
if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
    test_pass
else
    test_fail "Deploy failed with dependency mappings"
fi

test_start "Deps: Pods ready"
if wait_for_pods "$APP_NAME" 120; then
    test_pass
else
    test_fail "Pods not ready after dep deploy"
fi

# --- Verify K8s secret has postgres.password from dependency ---
test_start "Deps: K8s secret has postgres.password key"
SECRET_KEYS=$($KC get secret "${APP_NAME}-secrets" -n "$APP_NAME" -o jsonpath='{.data}' 2>/dev/null)
# -F matches the key name literally; as a regex the "." would match any
# character (e.g. "postgresXpassword") and could hide a wrong key name.
if echo "$SECRET_KEYS" | grep -qF "postgres.password" 2>/dev/null; then
    test_pass
else
    test_fail "K8s secret should contain postgres.password from dependency"
fi

View File

@@ -0,0 +1,71 @@
#!/usr/bin/env bash
# Test: Secrets rotation — change a secret, redeploy, verify in cluster
# Verifies: PUT secrets, redeploy applies new secret to K8s
# Idempotent: restores original secret at end
#
# Uses names that are not defined in this script: INSTANCE, APP_NAME, KC,
# DEPLOY_TIMEOUT, RESP, and the test_*/assert_*/api_*/start_async_and_wait/
# wait_for_pods helpers. Presumably these come from the e2e runner that loads
# this file -- TODO confirm.
#
# Safety: the PUT below replaces the whole secrets document. The rotation is
# therefore only attempted when the initial GET produced a document that
# actually contains this app's dbPassword; otherwise an error body could be
# written back over the real secrets.
SECRETS_PATH="/api/v1/instances/${INSTANCE}/secrets"

# --- Read current secrets ---
test_start "Secrets: Read raw secrets"
api_get "${SECRETS_PATH}?raw=true"
assert_http "200" "GET raw secrets should return 200"
ORIGINAL_SECRETS="$RESP"
# Look the app up by $APP_NAME (passed via --arg) so the key stays in sync with
# the "${APP_NAME}-secrets" K8s secret checked further down.
ORIGINAL_PASSWORD=$(echo "$RESP" | jq -r --arg app "$APP_NAME" '.apps[$app].dbPassword // empty' 2>/dev/null)

test_start "Secrets: Has dbPassword for ${APP_NAME}"
assert_not_empty "$ORIGINAL_PASSWORD" "Should have a dbPassword for ${APP_NAME}"

if [[ -z "$ORIGINAL_PASSWORD" ]]; then
    # Nothing trustworthy to rotate or restore; do not touch the stored secrets.
    echo " Skipping secret rotation: original secrets for ${APP_NAME} could not be read"
else
    # --- Generate and set new password ---
    # The epoch-seconds suffix makes the value differ between runs.
    NEW_PASSWORD="e2e-rotated-$(date +%s)"
    test_start "Secrets: Rotate dbPassword"
    # Build modified secrets document with jq
    MODIFIED_SECRETS=$(echo "$ORIGINAL_SECRETS" | jq --arg app "$APP_NAME" --arg pw "$NEW_PASSWORD" \
        '.apps[$app].dbPassword = $pw')
    api_put "${SECRETS_PATH}" "$MODIFIED_SECRETS"
    assert_http "200" "PUT secrets should return 200"

    # --- Verify secret stored ---
    test_start "Secrets: Verify new password stored"
    api_get "${SECRETS_PATH}?raw=true"
    STORED_PASSWORD=$(echo "$RESP" | jq -r --arg app "$APP_NAME" '.apps[$app].dbPassword // empty' 2>/dev/null)
    assert_eq "$STORED_PASSWORD" "$NEW_PASSWORD" "Stored password should match rotated value"

    # --- Compile and deploy to push new secret to cluster ---
    api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
    test_start "Secrets: Deploy after rotation"
    if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
        test_pass
    else
        test_fail "Deploy failed after secret rotation"
    fi
    # Best effort: the K8s secret assertion below is the real check.
    wait_for_pods "$APP_NAME" 120 || true

    # --- Verify K8s secret updated ---
    test_start "Secrets: K8s secret has rotated password"
    K8S_PASSWORD=$($KC get secret "${APP_NAME}-secrets" -n "$APP_NAME" \
        -o jsonpath='{.data.dbPassword}' 2>/dev/null | base64 -d 2>/dev/null)
    assert_eq "$K8S_PASSWORD" "$NEW_PASSWORD" "K8s secret should have the rotated password"

    # --- Cleanup: restore original secrets ---
    echo " Restoring original secrets..."
    # Check the restore explicitly: a silent failure here would leave the
    # rotated password in place for every later test.
    test_start "Secrets: Restore original secrets"
    api_put "${SECRETS_PATH}" "$ORIGINAL_SECRETS"
    assert_http "200" "PUT original secrets should return 200"
    api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
    test_start "Secrets: Deploy with restored secrets"
    if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
        test_pass
    else
        test_fail "Deploy failed when restoring original secrets"
    fi
    wait_for_pods "$APP_NAME" 120 || true
fi

View File

@@ -0,0 +1,66 @@
#!/usr/bin/env bash
# Test: Dependency config propagation — change db.name, recompile, verify in manifests
# Verifies: config change propagates to compiled templates after compile
# Idempotent: restores original config at end
#
# Uses names that are not defined in this script: DATA_DIR, INSTANCE, APP_NAME,
# DEPLOY_TIMEOUT, RESP, and the test_*/assert_*/api_*/start_async_and_wait/
# wait_for_pods helpers. Presumably these come from the e2e runner that loads
# this file -- TODO confirm.
APP_CONFIG_PATH="/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/config"
# --- Read current db.name ---
test_start "DepConfig: Read current db.name"
api_get "$APP_CONFIG_PATH"
assert_http "200" "GET app config should return 200"
ORIGINAL_DB_NAME=$(echo "$RESP" | jq -r '.db.name // empty' 2>/dev/null)
# Fall back to a fixed name when db.name could not be read, so the cleanup
# step at the end always has a value to restore.
# NOTE(review): if the GET above failed, this restores a guessed name -- confirm
# "e2e_test_app" is the app's real default.
if [[ -z "$ORIGINAL_DB_NAME" ]]; then
ORIGINAL_DB_NAME="e2e_test_app"
fi
# --- PATCH db.name to a new value ---
test_start "DepConfig: PATCH db.name"
api_patch "$APP_CONFIG_PATH" '{"config":{"db":{"name":"e2e_test_app_v2"}}}'
assert_http "200" "PATCH db.name should return 200"
test_start "DepConfig: Verify db.name changed"
api_get "$APP_CONFIG_PATH"
NEW_DB_NAME=$(echo "$RESP" | jq -r '.db.name // empty' 2>/dev/null)
assert_eq "$NEW_DB_NAME" "e2e_test_app_v2" "db.name should be e2e_test_app_v2"
# --- Compile ---
test_start "DepConfig: Compile after config change"
api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
assert_http "200" "Compile should return 200"
# --- Verify compiled db-init-job.yaml has new db name ---
# Reads the compiled file directly from the data directory rather than via the API.
test_start "DepConfig: Compiled db-init-job.yaml has e2e_test_app_v2"
DB_INIT_FILE="${DATA_DIR}/instances/${INSTANCE}/apps/${APP_NAME}/db-init-job.yaml"
if grep -q "e2e_test_app_v2" "$DB_INIT_FILE" 2>/dev/null; then
test_pass
else
test_fail "db-init-job.yaml should contain e2e_test_app_v2 after compile"
fi
# --- Check no compilation drift (we just compiled) ---
# The jq "// false" alternative makes a missing or null drift field read as "false".
test_start "DepConfig: No compilation drift after compile"
api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/enhanced"
COMP_DRIFTED=$(echo "$RESP" | jq -r '.drift.compilation.drifted // false' 2>/dev/null)
assert_eq "$COMP_DRIFTED" "false" "No compilation drift expected right after compile"
# --- Cleanup: restore original db.name, recompile, deploy ---
# The restore PATCH and compile are not asserted; only the deploy result is reported.
echo " Restoring original db.name (${ORIGINAL_DB_NAME})..."
api_patch "$APP_CONFIG_PATH" "{\"config\":{\"db\":{\"name\":\"${ORIGINAL_DB_NAME}\"}}}"
api_post "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/compile"
test_start "DepConfig: Deploy with restored config"
if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
test_pass
else
test_fail "Deploy failed when restoring original config"
fi
# Best effort: pod readiness is not reported as a test result here.
wait_for_pods "$APP_NAME" 120 || true

View File

@@ -0,0 +1,84 @@
#!/usr/bin/env bash
# Test: Delete and re-add round trip
# Verifies: full lifecycle — delete, verify gone, re-add, deploy, verify working
# Idempotent: leaves app deployed for subsequent tests
#
# Uses names that are not defined in this script: INSTANCE, APP_NAME, KC,
# DELETE_TIMEOUT, DEPLOY_TIMEOUT, RESP, HTTP_CODE, and the test_*/assert_*/
# api_*/start_async_*/wait_for_pods helpers. Presumably these come from the
# e2e runner that loads this file -- TODO confirm.

# --- Verify app is currently deployed ---
test_start "DeleteReadd: App is deployed"
api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/status"
assert_http "200" "App should be deployed before delete test"

# --- Delete ---
test_start "DeleteReadd: Delete app"
if start_async_delete_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}" "$DELETE_TIMEOUT"; then
    test_pass
else
    test_fail "Delete failed"
fi

# --- Wait for namespace gone ---
# Polls every 5s for up to 60s; a failing kubectl lookup counts as "NotFound".
echo " Waiting for namespace cleanup..."
WAIT=0
while (( WAIT < 60 )); do
    NS_STATUS=$($KC get namespace "$APP_NAME" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
    if [[ "$NS_STATUS" == "NotFound" ]]; then
        break
    fi
    sleep 5
    WAIT=$((WAIT + 5))
done
# Surface a timeout instead of continuing silently, so a later re-add failure
# can be traced back to a namespace that was still terminating.
if (( WAIT >= 60 )); then
    echo " WARNING: namespace ${APP_NAME} was not confirmed deleted within 60s (last phase: ${NS_STATUS}); continuing"
fi

# --- Verify app is gone ---
# Accepted outcomes: 404, 500, or 200 with status not-added / not-deployed.
# NOTE(review): treating 500 as "gone" can also hide an unrelated server error.
test_start "DeleteReadd: App gone after delete"
api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/status"
if [[ "$HTTP_CODE" == "404" || "$HTTP_CODE" == "500" ]]; then
    test_pass
elif [[ "$HTTP_CODE" == "200" ]]; then
    APP_STATUS=$(echo "$RESP" | jq -r '.status // empty' 2>/dev/null)
    if [[ "$APP_STATUS" == "not-added" || "$APP_STATUS" == "not-deployed" ]]; then
        test_pass
    else
        test_fail "App still appears as deployed after delete (status: ${APP_STATUS})"
    fi
else
    test_fail "Unexpected HTTP ${HTTP_CODE}"
fi

# --- Re-add ---
test_start "DeleteReadd: Re-add app"
api_post "/api/v1/instances/${INSTANCE}/apps" "{\"name\":\"${APP_NAME}\"}"
assert_http_one_of "200 201" "Re-add should succeed"

# --- Verify config written ---
test_start "DeleteReadd: Config exists after re-add"
api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/config"
APP_NAMESPACE=$(echo "$RESP" | jq -r '.namespace // empty' 2>/dev/null)
# Compare against $APP_NAME rather than a literal: this script already treats
# the namespace as "$APP_NAME" in the kubectl lookup above.
assert_eq "$APP_NAMESPACE" "$APP_NAME" "Config namespace should be set after re-add"

# --- Deploy ---
test_start "DeleteReadd: Deploy after re-add"
if start_async_and_wait "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/deploy" "" "$DEPLOY_TIMEOUT"; then
    test_pass
else
    test_fail "Deploy failed after re-add"
fi

test_start "DeleteReadd: Pods ready after re-add deploy"
if wait_for_pods "$APP_NAME" 120; then
    test_pass
else
    test_fail "Pods not ready after re-add deploy"
fi

# --- Verify status OK ---
test_start "DeleteReadd: Status OK after re-add"
api_get "/api/v1/instances/${INSTANCE}/apps/${APP_NAME}/status"
assert_http "200" "Status should return 200 after re-add"

View File

@@ -544,6 +544,110 @@ var nodeDeleteCmd = &cobra.Command{
},
}
// nodeHealthCmd fetches the health report for a single node from the API and
// prints it in a human-readable form: an overall HEALTHY/UNHEALTHY line, then
// a table of services and a list of dmesg errors when the response has them.
var nodeHealthCmd = &cobra.Command{
	Use:   "health <hostname>",
	Short: "Check node health",
	Long: `Check the health of a node by querying Talos service statuses
and scanning kernel messages for hardware errors.
Examples:
wild node health control-1
wild node health worker-2`,
	Args: cobra.ExactArgs(1),
	RunE: func(cmd *cobra.Command, args []string) error {
		hostname := args[0]

		instance, err := getInstanceName()
		if err != nil {
			return err
		}
		report, err := apiClient.Get(fmt.Sprintf("/api/v1/instances/%s/nodes/%s/health", instance, hostname))
		if err != nil {
			return err
		}

		// Overall verdict. A missing or non-boolean "healthy" field reads as
		// false and is therefore reported as UNHEALTHY.
		if nodeOK, _ := report.Data["healthy"].(bool); !nodeOK {
			fmt.Printf("Node: %s — UNHEALTHY\n", hostname)
		} else {
			fmt.Printf("Node: %s — HEALTHY\n", hostname)
		}

		// Service table. A failed assertion leaves the slice nil, so the
		// section is skipped just as it is for an empty list.
		if svcList, _ := report.Data["services"].([]interface{}); len(svcList) > 0 {
			fmt.Println("\nServices:")
			for _, raw := range svcList {
				fields, isMap := raw.(map[string]interface{})
				if !isMap {
					continue
				}
				id, _ := fields["id"].(string)
				state, _ := fields["state"].(string)
				svcOK, _ := fields["healthy"].(bool)
				note, _ := fields["healthMessage"].(string)

				// OK when healthy; FAIL when unhealthy with a message;
				// "?" when unhealthy without one.
				status := "OK"
				switch {
				case svcOK:
				case note != "":
					status = "FAIL"
				default:
					status = "?"
				}

				if note == "" {
					fmt.Printf(" %-14s %-10s %s\n", id, state, status)
					continue
				}
				fmt.Printf(" %-14s %-10s %-6s %s\n", id, state, status, note)
			}
		}

		// Kernel message errors, each prefixed by its timestamp when present.
		if dmesgEntries, _ := report.Data["dmesgErrors"].([]interface{}); len(dmesgEntries) > 0 {
			fmt.Printf("\nDmesg Errors (%d):\n", len(dmesgEntries))
			for _, raw := range dmesgEntries {
				fields, isMap := raw.(map[string]interface{})
				if !isMap {
					continue
				}
				stamp, _ := fields["timestamp"].(string)
				text, _ := fields["message"].(string)
				if stamp == "" {
					fmt.Printf(" %s\n", text)
					continue
				}
				fmt.Printf(" [%s] %s\n", stamp, text)
			}
		}
		return nil
	},
}
// nodeRebootCmd POSTs to the node's reboot endpoint for the current instance
// and prints a confirmation once the request has been accepted.
var nodeRebootCmd = &cobra.Command{
	Use:   "reboot <hostname>",
	Short: "Reboot a node",
	Long: `Reboot a node without wiping state. The node will restart and
rejoin the cluster automatically. Running workloads on this node will be interrupted.
Examples:
wild node reboot control-1
wild node reboot worker-2`,
	Args: cobra.ExactArgs(1),
	RunE: func(cmd *cobra.Command, args []string) error {
		hostname := args[0]

		instance, err := getInstanceName()
		if err != nil {
			return err
		}

		// The response body is not used; only the error matters here.
		endpoint := fmt.Sprintf("/api/v1/instances/%s/nodes/%s/reboot", instance, hostname)
		if _, err := apiClient.Post(endpoint, nil); err != nil {
			return err
		}

		fmt.Printf("Reboot initiated for %s\n", hostname)
		return nil
	},
}
var nodeUpgradeCmd = &cobra.Command{
Use: "upgrade <hostname> <version>",
Short: "Upgrade a node to a new Talos version",
@@ -626,6 +730,8 @@ func init() {
nodeCmd.AddCommand(nodeUpdateCmd)
nodeCmd.AddCommand(nodeFetchTemplatesCmd)
nodeCmd.AddCommand(nodeDeleteCmd)
nodeCmd.AddCommand(nodeHealthCmd)
nodeCmd.AddCommand(nodeRebootCmd)
nodeCmd.AddCommand(nodeUpgradeCmd)
nodeCmd.AddCommand(nodeRollbackCmd)

View File

@@ -4,33 +4,101 @@ Verifying every item on this list confirms the full networking stack is function
## Node Layer
1. **All nodes Ready** — no cordons, no taints (e.g. `maintenance:NoExecute`)
1. **All nodes Ready** — no cordons, no taints (e.g., `maintenance:NoExecute`)
```bash
kubectl get nodes
wild node list
```
2. **Flannel pods running on every node** — stale VXLAN tunnels break cross-node pod traffic
```bash
kubectl get pods -n kube-system -l app=flannel -o wide
```
3. **Cross-node pod connectivity** — pods on each worker can reach pods on every other node
## Service Routing
4. **kube-proxy pods running on every node** — nftables rules route ClusterIP traffic to pod endpoints
```bash
kubectl get pods -n kube-system -l k8s-app=kube-proxy -o wide
```
5. **CoreDNS pods running and resolving** — both cluster-internal names (`*.svc.cluster.local`) and external names
```bash
kubectl get pods -n kube-system -l k8s-app=kube-dns
```
6. **CoreDNS upstream reachability** — Talos DNS proxy at `169.254.116.108` responding from all nodes
## Load Balancing
7. **MetalLB speakers running on all nodes** — L2 ARP announcements for LoadBalancer IPs
```bash
kubectl get pods -n metallb-system -l component=speaker -o wide
```
8. **MetalLB ServiceL2Status resources valid** — `status.node` matches actual pod placement (stale entries block announcements)
```bash
kubectl get servicel2statuses.metallb.io -n metallb-system
```
9. **LoadBalancer IPs reachable** — Traefik LB IP responds from LAN
```bash
kubectl get svc -n traefik
curl -k https://<traefik-lb-ip>
```
## Ingress & Security
10. **Traefik ingress routing** — forwards to backend services, TLS termination working
```bash
kubectl get pods -n traefik
kubectl logs -n traefik -l app=traefik | tail -20
```
11. **CrowdSec LAPI running** — can reach `api.crowdsec.net` (depends on CoreDNS external resolution)
```bash
kubectl get pods -n crowdsec
```
12. **CrowdSec bouncer registered with LAPI** — unregistered bouncer blocks all forwardAuth requests
```bash
wild service logs crowdsec | grep bouncer
```
## Storage
13. **Longhorn managers running on all workers** — enables volume replica scheduling and rebuilds
```bash
kubectl get pods -n longhorn-system -l app=longhorn-manager -o wide
```
14. **Longhorn volume replicas healthy** — all volumes at target replica count across nodes
```bash
kubectl get volumes.longhorn.io -n longhorn-system
```
## External DNS & Certificates
15. **ExternalDNS pod running** — creating and updating DNS records at Cloudflare
```bash
kubectl get pods -n externaldns
```
16. **cert-manager pods running** — issuing and renewing TLS certificates
```bash
kubectl get pods -n cert-manager
kubectl get certificates -n cert-manager
```
## LAN DNS
15. **dnsmasq on Wild Central** — resolves LAN-local domains to correct LoadBalancer IPs (hairpin NAT)
17. **dnsmasq on Wild Central** — resolves LAN-local domains to correct LoadBalancer IPs (hairpin NAT)
```bash
wild dns status
```
## Quick Full Check
Run `wild cluster health` for an automated check of the most critical items. For a comprehensive check, walk through each item above.

View File

@@ -0,0 +1,246 @@
# Disaster Recovery
This guide covers recovering a Wild Cloud cluster after catastrophic failure — hardware death, corrupted storage, or any scenario where you need to rebuild from scratch.
## What You Need
To rebuild a cluster you need two things:
1. **Cluster config backup** — The tar.gz archive from Wild Cloud's cluster config backup feature, containing kubeconfig, talosconfig, config.yaml, secrets.yaml, and Talos node configs.
2. **App backups** — The per-app backup archives (database dumps, PVC snapshots, config files) stored at your backup destination (S3, NFS, or local).
If your instance data directory was a git repository (recommended), you also have the full history of compiled manifests and config.yaml in git. The git repo alone is enough to redeploy apps — but without secrets.yaml and kubeconfig, you can't authenticate to the cluster or decrypt app secrets.
## Recovery Scenarios
### Scenario 1: Wild Central Device Failure (Cluster Intact)
The Raspberry Pi or server running Wild Central died, but the Kubernetes cluster nodes are still running.
**Steps:**
1. **Set up a new Wild Central device**:
```bash
sudo dpkg -i wild-cloud-central_*.deb
sudo systemctl enable wild-cloud-central
```
2. **Restore your data directory** from git (for manifests and config) plus your cluster config backup (for secrets and credentials):
```bash
# Clone instance data from git
git clone https://your-git-server/wild-cloud-data.git /var/lib/wild-central
# Extract cluster config backup over the top
# This restores kubeconfig, secrets.yaml, talosconfig, etc.
tar -xzf cluster-config-backup.tar.gz -C /var/lib/wild-central/instances/your-instance/
```
3. **Start Wild Central**:
```bash
sudo systemctl start wild-cloud-central
```
4. **Verify connectivity**:
```bash
wild instance use your-instance
wild cluster status
```
The cluster is still running — your apps are live. Wild Central is just the management plane.
### Scenario 2: Single Node Failure (Cluster Degraded)
One or more nodes died, but the cluster is still operational: either the control plane still has quorum (for example, at least 2 of 3 control plane nodes are healthy), or only replaceable worker nodes were lost.
**Steps:**
1. **Check cluster health** from Wild Central:
```bash
talosctl --talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
health --nodes <surviving-node-ip>
```
2. **Remove the dead node** from the cluster:
```bash
# Remove from Kubernetes
kubectl --kubeconfig /var/lib/wild-central/instances/your-instance/kubeconfig \
delete node <dead-node-name>
# Remove from etcd (if control plane node)
talosctl --talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
etcd remove-member <dead-node-name> --nodes <surviving-node-ip>
```
3. **PXE boot a replacement node** using Wild Central's PXE service, or manually install Talos Linux on the new hardware.
4. **Add the new node** through the Wild Cloud web UI or CLI:
```bash
wild node add --role worker --ip <new-node-ip>
```
5. **Verify workloads reschedule** to the new node:
```bash
kubectl get pods --all-namespaces -o wide
```
### Scenario 3: Total Cluster Loss (Rebuild from Scratch)
All nodes are gone. You need to rebuild everything.
**Prerequisites:**
- New hardware (or repaired existing hardware) with network boot capability or Talos Linux installed
- Your cluster config backup (tar.gz with kubeconfig, talosconfig, secrets.yaml, Talos configs)
- Access to your backup destination (S3 bucket, NFS share, etc.)
- Your instance data git repo (if available — contains compiled manifests)
**Steps:**
1. **Set up Wild Central** on a fresh device:
```bash
sudo dpkg -i wild-cloud-central_*.deb
```
2. **Restore your data directory**:
```bash
# If you have a git repo:
git clone https://your-git-server/wild-cloud-data.git /var/lib/wild-central
# Extract cluster config over the top:
tar -xzf cluster-config-backup.tar.gz -C /var/lib/wild-central/instances/your-instance/
```
If you don't have a git repo, just extract the cluster config backup into a fresh instance directory. You'll re-add apps from the Wild Directory.
3. **Bootstrap new Talos nodes** using the restored Talos configs:
```bash
# Apply control plane config to the first node
talosctl apply-config \
--talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
--nodes <node-ip> \
--file /var/lib/wild-central/instances/your-instance/talos/generated/controlplane.yaml \
--insecure
```
The restored `controlplane.yaml` and `worker.yaml` contain your cluster's identity (cluster name, secrets, certificates). Using them ensures the new cluster has the same identity as the old one.
4. **Bootstrap the cluster**:
```bash
talosctl bootstrap \
--talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
--nodes <first-control-plane-ip>
```
5. **Wait for the cluster to be healthy**:
```bash
talosctl health \
--talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
--nodes <first-control-plane-ip>
```
6. **Update kubeconfig** (the new cluster may issue a fresh kubeconfig):
```bash
talosctl kubeconfig \
--talosconfig /var/lib/wild-central/instances/your-instance/talos/generated/talosconfig \
--nodes <first-control-plane-ip> \
/var/lib/wild-central/instances/your-instance/kubeconfig
```
7. **Deploy infrastructure services first** (order matters):
```bash
wild instance use your-instance
wild service install metallb
wild service install traefik
wild service install cert-manager
wild service install external-dns
wild service install longhorn # If using Longhorn for PVCs
```
8. **Deploy apps** (dependencies first, then apps):
```bash
# Deploy database services first
wild app deploy pg
wild app deploy redis
# Then deploy apps
wild app deploy gitea
wild app deploy immich
# ... etc
```
If your git repo has compiled manifests, these deploys apply the exact same manifests that were running before. If not, you'll need to re-add apps from the Wild Directory first:
```bash
wild app add gitea
wild app deploy gitea
```
9. **Restore app data from backups**:
```bash
# Restore each app's data (database + PVC) from the backup destination
# Use the Web UI: navigate to Backups > [app] > Restore
# Or via CLI:
wild restore gitea --auto
wild restore immich --auto
```
The `--auto` flag runs the full blue-green restore cycle: restore to standby, switch traffic, then clean up the old namespace. For more control, run each phase separately — see [Restoring Backups](restoring-backups.md).
10. **Verify everything is working**:
```bash
wild app status gitea
wild app status immich
kubectl get pods --all-namespaces
```
## Cluster Config Backup
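The cluster config backup feature archives the credentials, secrets, and configuration files needed to access and rebuild the cluster. Most of these (such as `secrets.yaml` and `kubeconfig`) are NOT tracked in git, so this archive is the only copy outside the Wild Central device.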
### What Gets Backed Up
| File | Purpose |
|------|---------|
| `kubeconfig` | Kubernetes API credentials |
| `config.yaml` | Full instance configuration |
| `secrets.yaml` | App secrets (database passwords, API keys) |
| `talos/generated/talosconfig` | Talos API credentials |
| `talos/generated/controlplane.yaml` | Control plane node config |
| `talos/generated/worker.yaml` | Worker node config |
| `talos/generated/secrets.yaml` | Talos bootstrap secrets (cluster identity) |
### Creating Cluster Config Backups
**Web UI:** Navigate to Backups, click "Backup" on the "Cluster Config" row.
**Command line (via the API):**
```bash
# Via API
curl -X POST http://localhost:5055/api/v1/instances/your-instance/backup/cluster
```
**Scheduled:** Create a backup schedule with target type "cluster" to automatically back up cluster config on a recurring basis. See [Making Backups](making-backups.md) for scheduling details.
### Downloading a Cluster Config Backup
Cluster config backups are stored at your configured backup destination under the key `cluster-config/{instance}/{timestamp}.tar.gz`. To retrieve one:
- **S3/Azure:** Download from the bucket/container using your cloud provider's CLI
- **NFS:** Navigate to the NFS mount point and find the archive
- **Local:** Find it at `{data-dir}/instances/{instance}/backups/cluster-config/...`
Store a copy of the latest cluster config backup in a secure offsite location (encrypted USB drive, password manager, separate cloud storage). If your primary backup destination is on the cluster itself, a total cluster loss takes the backups with it.
## Prevention Checklist
- [ ] **Cluster config backups** are scheduled and running
- [ ] **App backups** are scheduled for all critical apps
- [ ] **Backup destination** is offsite or on separate infrastructure from the cluster
- [ ] **Instance data directory** is pushed to a git remote (excludes secrets.yaml)
- [ ] **Cluster config backup archive** is stored in a second location (not just on the cluster)
- [ ] **Test a restore** periodically — backups are worthless if restore doesn't work
## Related Guides
- [Making Backups](making-backups.md) — Setting up backup destinations and schedules
- [Restoring Backups](restoring-backups.md) — Blue-green restore process in detail
- [Upgrade Talos](upgrade-talos.md) — Talos node upgrade and rollback
- [Troubleshoot Cluster](troubleshoot-cluster.md) — Diagnosing cluster issues after recovery

View File

@@ -1,265 +1,250 @@
# Making Backups
This guide covers how to create backups of your wild-cloud infrastructure using the integrated backup system.
This guide covers how to create backups of your Wild Cloud applications and cluster configuration.
## Overview
The wild-cloud backup system creates encrypted, deduplicated snapshots using restic. It backs up three main components:
Wild Cloud's backup system creates backups using native tools for each data type:
- **Applications**: Database dumps and persistent volume data
- **Cluster**: Kubernetes resources and etcd state
- **Configuration**: Wild-cloud repository and settings
- **PostgreSQL databases**: `pg_dump` in custom compressed format
- **MySQL databases**: `mysqldump` with gzip compression
- **Persistent volumes**: Longhorn native backup API
- **Configuration**: tar.gz archives of manifests, config, and secrets
Backups are stored at a configured destination (S3, Azure Blob, NFS, or local filesystem) and tracked via recovery plans that coordinate the full backup-restore lifecycle.
## Prerequisites
Before making backups, ensure you have:
1. **Environment configured**: Run `source env.sh` to load backup configuration
2. **Restic repository**: Backup repository configured in `config.yaml`
3. **Backup password**: Set in wild-cloud secrets
4. **Staging directory**: Configured path for temporary backup files
1. **A backup destination configured** — S3 bucket, Azure container, NFS share, or local path
2. **Longhorn backup target** configured if backing up persistent volumes
3. **kubectl access** to your cluster
## Backup Components
## Configuring Backup Destination
### Applications (`wild-app-backup`)
### Web UI
Backs up individual applications including:
- **Database dumps**: PostgreSQL/MySQL databases in compressed custom format
- **PVC data**: Application files streamed directly for restic deduplication
- **Auto-discovery**: Finds databases and PVCs based on app manifest.yaml
Navigate to **Backups** and click **Settings** to configure your backup destination and retention policy.
### Cluster Resources (`wild-backup --cluster-only`)
### CLI
Backs up cluster-wide resources:
- **Kubernetes resources**: All pods, services, deployments, secrets, configmaps
- **Storage definitions**: PersistentVolumes, PVCs, StorageClasses
- **etcd snapshot**: Complete cluster state for disaster recovery
Backup configuration is stored in your instance's `config.yaml` under the `backup:` section. Credentials are stored in `secrets.yaml`.
### Configuration (`wild-backup --home-only`)
Example configuration:
Backs up wild-cloud configuration:
- **Repository contents**: All app definitions, manifests, configurations
- **Settings**: Wild-cloud configuration files and customizations
```yaml
# config.yaml
backup:
  destination:
    type: "s3" # "s3", "azure", "nfs", or "local"
    s3:
      bucket: "my-backups"
      region: "us-east-1"
      endpoint: "minio.example.com" # Optional, for S3-compatible services
  retention:
    daily: 7
    weekly: 4
    monthly: 6
    yearly: 1
```
```yaml
# secrets.yaml
backup:
  s3:
    accessKeyId: "..."
    secretAccessKey: "..."
```
### Supported Destinations
| Destination | Config Fields | Notes |
|-------------|--------------|-------|
| **local** | `path` | Default: `instances/{instance}/backups` |
| **s3** | `bucket`, `region`, `endpoint`, `accessKeyId`, `secretAccessKey` | Supports S3-compatible services like MinIO |
| **azure** | `container`, `storageAccount`, `accessKey` | Azure Blob Storage |
| **nfs** | `server`, `path`, `mountPoint`, `mountOptions` | Auto-recovers stale mounts |
## Making Backups
### Full System Backup (Recommended)
### Single App Backup
Create a complete backup of everything:
**Web UI:** Navigate to **Backups > [app]** and click **Backup Now**.
**CLI:**
```bash
# Backup a single app
wild backup start gitea
# Shorthand
wild backup gitea
```
### All Apps Backup
**CLI:**
```bash
# Backup all deployed apps
wild backup all
```
### Cluster Config Backup
Cluster config backups archive the credentials and secrets not tracked in git — kubeconfig, talosconfig, config.yaml, secrets.yaml, and Talos generated configs.
**Web UI:** Navigate to **Backups** and click **Backup** on the Cluster Config row.
**CLI / API:**
```bash
curl -X POST http://localhost:5055/api/v1/instances/{instance}/backup/cluster
```
## What Gets Backed Up
### Application Backups
The backup system auto-discovers what to back up based on each app's manifest:
| Component | Tool | Format | Storage Key |
|-----------|------|--------|-------------|
| PostgreSQL database | `pg_dump` | Custom binary (compression level 9) + globals SQL | `postgres/{instance}/{app}/{timestamp}.dump` |
| MySQL database | `mysqldump` | Gzip-compressed SQL | `mysql/{instance}/{app}/{timestamp}.sql.gz` |
| Persistent volumes | Longhorn native API | Longhorn backup format | Stored in Longhorn backup target |
| App config & manifests | tar + gzip | tar.gz archive | `config/{instance}/{app}/{timestamp}.tar.gz` |
Cache volumes (names containing `-cache` or `-tmp`) and cache databases (Redis, Memcached) are automatically excluded.
### Cluster Config Backups
| File | Purpose |
|------|---------|
| `kubeconfig` | Kubernetes API credentials |
| `config.yaml` | Full instance configuration |
| `secrets.yaml` | App secrets (database passwords, API keys) |
| `talos/generated/talosconfig` | Talos API credentials |
| `talos/generated/controlplane.yaml` | Control plane node config |
| `talos/generated/worker.yaml` | Worker node config |
| `talos/generated/secrets.yaml` | Talos bootstrap secrets (cluster identity) |
**Storage key:** `cluster-config/{instance}/{timestamp}.tar.gz`
## Discovering Backup Resources
Before backing up for the first time, you can discover what persistent data an app has:
```bash
# Backup all components (apps + cluster + config)
wild-backup
wild backup discover gitea
```
This is equivalent to:
This analyzes the app's manifest and kustomize resources to find databases and PVCs, showing what will be backed up and what will be skipped.
## Scheduled Backups
### Creating a Schedule
**Web UI:** Navigate to **Backups > [app]** and click **Schedule**.
**CLI:**
```bash
wild-backup --home --apps --cluster
# Daily backup at 2 AM
wild backup schedule create gitea --frequency daily --time 02:00
# Weekly backup on Sunday at 3 AM
wild backup schedule create gitea --frequency weekly --time 03:00 --day-of-week 0
# Monthly backup on the 1st at midnight
wild backup schedule create gitea --frequency monthly --time 00:00 --day-of-month 1
```
### Selective Backups
#### Applications Only
```bash
# All applications
wild-backup --apps-only
# Single application
wild-app-backup discourse
# Multiple applications
wild-app-backup discourse gitea immich
```
#### Cluster Only
```bash
# Kubernetes resources + etcd
wild-backup --cluster-only
```
#### Configuration Only
```bash
# Wild-cloud repository
wild-backup --home-only
```
### Excluding Components
Skip specific components:
### Managing Schedules
```bash
# Skip config, backup apps + cluster
wild-backup --no-home
# List all schedules
wild backup schedule list
# Skip applications, backup config + cluster
wild-backup --no-apps
# Enable/disable a schedule
wild backup schedule enable <schedule-id>
wild backup schedule disable <schedule-id>
# Skip cluster resources, backup config + apps
wild-backup --no-cluster
# Manually trigger a schedule
wild backup schedule run <schedule-id>
# Delete a schedule
wild backup schedule delete <schedule-id>
```
## Backup Process Details
Retention is enforced automatically after each scheduled backup completes.
### Application Backup Process
1. **Discovery**: Parses `manifest.yaml` to find database and PVC dependencies
2. **Database backup**: Creates compressed custom-format dumps
3. **PVC backup**: Streams files directly to staging for restic deduplication
4. **Staging**: Organizes files in clean directory structure
5. **Upload**: Creates individual restic snapshots per application
### Cluster Backup Process
1. **Resource export**: Exports all Kubernetes resources to YAML
2. **etcd snapshot**: Creates point-in-time etcd backup via talosctl
3. **Upload**: Creates single restic snapshot for cluster state
### Restic Snapshots
Each backup creates tagged restic snapshots:
## Listing and Verifying Backups
```bash
# View all snapshots
restic snapshots
# List backups for an app
wild backup list gitea
# Filter by component
restic snapshots --tag discourse # Specific app
restic snapshots --tag cluster # Cluster resources
restic snapshots --tag wc-home # Wild-cloud config
# Verify a backup can be restored
wild backup verify gitea
# Verify a specific backup
wild backup verify gitea 20250314T021530Z
```
## Where Backup Files Are Staged
Before uploading to your restic repository, backup files are organized in a staging directory. This temporary area lets you see exactly what's being backed up and helps with deduplication.
Here's what the staging area looks like:
```
backup-staging/
├── apps/
│ ├── discourse/
│ │ ├── database_20250816T120000Z.dump
│ │ ├── globals_20250816T120000Z.sql
│ │ └── discourse/
│ │ └── data/ # All the actual files
│ ├── gitea/
│ │ ├── database_20250816T120000Z.dump
│ │ └── gitea-data/
│ │ └── data/ # Git repositories, etc.
│ └── immich/
│ ├── database_20250816T120000Z.dump
│ └── immich-data/
│ └── upload/ # Photos and videos
└── cluster/
├── all-resources.yaml # All running services
├── secrets.yaml # Passwords and certificates
├── configmaps.yaml # Configuration data
└── etcd-snapshot.db # Complete cluster state
```
This staging approach means you can examine backup contents before they're uploaded, and restic can efficiently deduplicate files that haven't changed.
## Advanced Usage
### Custom Backup Scripts
Applications can provide custom backup logic:
## Deleting Backups
```bash
# Create apps/myapp/backup.sh for custom behavior
chmod +x apps/myapp/backup.sh
# Delete a specific backup
wild backup delete gitea 20250314T021530Z
# wild-app-backup will use custom script if present
wild-app-backup myapp
# Skip confirmation
wild backup delete gitea 20250314T021530Z --yes
```
### Monitoring Backup Status
## Backup Health
Check the overall health of your backup system:
**Web UI:** The **Backups** page shows a health summary across all apps — backup count, last backup time, scheduled status, and total size.
**API:**
```bash
# Check recent snapshots
restic snapshots | head -20
# Check specific app backups
restic snapshots --tag discourse
# Verify backup integrity
restic check
curl http://localhost:5055/api/v1/instances/{instance}/backup/health
```
### Backup Automation
## Recovery Plans
Set up automated backups with cron:
Each backup creates a recovery plan (`recovery-plan.yaml`) that tracks the backup's contents and coordinates restore operations. The plan records what strategies were used, where data is stored, and the current lifecycle status.
```bash
# Daily full backup at 2 AM
0 2 * * * cd /data/repos/payne-cloud && source env.sh && wild-backup
# Hourly app backups during business hours
0 9-17 * * * cd /data/repos/payne-cloud && source env.sh && wild-backup --apps-only
```
## Performance Considerations
### Large PVCs (like Immich photos)
The streaming backup approach provides:
- **First backup**: Full transfer time (all files processed)
- **Subsequent backups**: Only changed files processed (dramatically faster)
- **Storage efficiency**: Restic deduplication reduces storage usage
### Network Usage
- **Database dumps**: Compressed at source, efficient transfer
- **PVC data**: Uncompressed transfer, but restic handles deduplication
- **etcd snapshots**: Small files, minimal impact
Plan statuses progress through `backing_up` -> `backed_up`, and then through the restore-phase statuses if the backup is later restored.
## Troubleshooting
### Common Issues
### "No databases or PVCs found"
- The app has no database dependencies in its `manifest.yaml`
- No PVCs with matching labels exist in the app namespace
- Run `wild backup discover <app>` to see what's detected
**"No databases or PVCs found"**
- App has no `manifest.yaml` with database dependencies
- No PVCs with matching labels in app namespace
- Create custom `backup.sh` script for special cases
### Longhorn backup fails
- Verify Longhorn backup target is configured (`kubectl get settings -n longhorn-system backup-target`)
- Check Longhorn manager pods are running on all worker nodes
- Ensure sufficient storage at the backup target
**"kubectl not found"**
- Ensure kubectl is installed and configured
- Check cluster connectivity with `kubectl get nodes`
### Database dump fails
- Verify the database pod is running: `kubectl get pods -n postgres`
- Check that the database name in `config.yaml` matches the actual database
**"Staging directory not set"**
- Configure `cloud.backup.staging` in `config.yaml`
- Ensure directory exists and is writable
**"Could not create etcd backup"**
- Ensure `talosctl` is installed for Talos clusters
- Check control plane node connectivity
- Verify etcd pods are accessible in kube-system namespace
### Backup Verification
Always verify backups periodically:
```bash
# Check restic repository integrity
restic check
# List recent snapshots
restic snapshots --compact
# Test restore to different directory
restic restore latest --target /tmp/restore-test
```
### Scheduled backups not running
- Verify the schedule is enabled: `wild backup schedule list`
- Check the Wild Central API is running: `wild daemon status`
## Security Notes
- **Encryption**: All backups are encrypted with your backup password
- **Secrets**: Kubernetes secrets are included in cluster backups
- **Access control**: Secure your backup repository and passwords
- **Network**: Consider bandwidth usage for large initial backups
- **Encryption**: S3 and Azure destinations support server-side encryption. Configure bucket/container encryption policies at your cloud provider.
- **Secrets**: Database credentials and API keys are included in cluster config backups. Store these backups securely.
- **Access control**: Restrict access to your backup destination. Cluster config backups contain everything needed to access your cluster.
## Next Steps
- [Restoring Backups](restoring-backups.md) - Learn how to restore from backups
- Configure automated backup schedules
- Set up backup monitoring and alerting
- Test disaster recovery procedures
- [Restoring Backups](restoring-backups.md) — Learn how to restore from backups using blue-green deployment
- [Disaster Recovery](disaster-recovery.md) — Full cluster rebuild procedures
- Set up scheduled backups for all critical apps
- Store cluster config backups in a second location (not on the cluster itself)

View File

@@ -1,50 +1,209 @@
# System Health Monitoring
## Basic Monitoring
This guide covers how to monitor the health of your Wild Cloud cluster, nodes, and applications.
Check system health with:
## Dashboard Overview
The Wild Cloud web app dashboard provides an at-a-glance view of your cluster:
- Cluster health status with individual health checks
- Node count and status (control plane and worker)
- Kubernetes and Talos versions
- Running operations summary
- Active app count
Navigate to your instance's **Dashboard** page for this overview.
## Cluster Health
### Web UI
The **Dashboard** page runs automated health checks covering:
- Control plane readiness
- Worker node readiness
- etcd health
- Networking health
- Storage health
Each check shows pass/fail status with detailed messages.
### CLI
```bash
# Quick cluster health check
wild cluster health
# Cluster status overview
wild cluster status
# Check overall system health
wild health
```
### API
```bash
# Detailed health checks
curl http://localhost:5055/api/v1/instances/{instance}/cluster/health
# Cluster status
curl http://localhost:5055/api/v1/instances/{instance}/cluster/status
```
## Node Monitoring
### Web UI
The **Cluster** page shows all nodes with:
- Status indicators (Ready, NotReady, maintenance)
- Role (control plane / worker)
- Hardware info (CPU, memory, storage)
- Talos version
- Current and target IP addresses
### CLI
```bash
# List all nodes with status
wild node list
# Detailed node info
wild node show <hostname>
```
### kubectl
```bash
# Node resource usage
kubectl top nodes
# Pod resource usage
kubectl top pods -A
# Persistent volume claims
kubectl get pvc -A
# Node status and conditions
kubectl get nodes -o wide
kubectl describe node <node-name>
```
## Advanced Monitoring (Future Implementation)
## Application Monitoring
Consider implementing:
### Web UI
1. **Prometheus + Grafana** for comprehensive monitoring:
```bash
# Placeholder for future implementation
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm install prometheus prometheus-community/kube-prometheus-stack --namespace monitoring --create-namespace
```
The **Apps > Installed** page shows all deployed apps with real-time status (running, unhealthy, no-pods, error). Click an app for detailed information including pod status, resource usage, and logs.
2. **Loki** for log aggregation:
```bash
# Placeholder for future implementation
helm repo add grafana https://grafana.github.io/helm-charts
helm install loki grafana/loki-stack --namespace logging --create-namespace
```
### CLI
## Additional Resources
```bash
# List deployed apps with status
wild app list-deployed
This document will be expanded in the future with:
# Detailed app status
wild app status <app>
- Detailed backup and restore procedures
- Monitoring setup instructions
- Comprehensive security hardening guide
- Automated maintenance scripts
# View app logs
wild service logs <app> --follow
```
For now, refer to the following external resources:
### kubectl
- [K3s Documentation](https://docs.k3s.io/)
- [Kubernetes Troubleshooting Guide](https://kubernetes.io/docs/tasks/debug/)
- [Velero Backup Documentation](https://velero.io/docs/latest/)
- [Kubernetes Security Best Practices](https://kubernetes.io/docs/concepts/security/)
```bash
# Pod resource usage across all namespaces
kubectl top pods -A
# Pods not in Running/Completed state
kubectl get pods -A | grep -v "Running\|Completed"
# Events for a specific app
kubectl get events -n <app-namespace> --sort-by='.lastTimestamp'
```
## Backup Health
### Web UI
The **Backups** page shows a health summary across all apps:
- Total backup count and size
- Last backup time for each app
- Whether scheduled backups are configured
- Failed backup indicators
### CLI / API
```bash
# Backup health overview
curl http://localhost:5055/api/v1/instances/{instance}/backup/health
```
## Operations Monitoring
Long-running operations (deployments, backups, restores, node upgrades) are tracked by the operations system.
### Web UI
The **Operations** page shows all operations with filtering by status (running, completed, failed) and real-time progress updates.
### CLI
```bash
# List recent operations
wild operation list
# Check a specific operation
wild operation get <operation-id>
```
## Storage Monitoring
### Persistent Volumes
```bash
# Check PVC status and usage across all namespaces
kubectl get pvc -A
# Longhorn volume health
kubectl get volumes.longhorn.io -n longhorn-system
```
### Longhorn Dashboard
If Headlamp is installed, you can use it as a Kubernetes dashboard to view detailed Longhorn volume information. Alternatively, access the Longhorn UI directly if its ingress is configured.
## DNS Health
```bash
# Check dnsmasq status on Wild Central
wild dns status
# View current DNS configuration
wild dns config
# Test internal DNS resolution
kubectl run -i --tty --rm debug --image=busybox --restart=Never -- \
nslookup kubernetes.default.svc.cluster.local
```
## Key Health Indicators
| Component | Healthy Sign | Warning Sign |
|-----------|-------------|--------------|
| Nodes | All Ready, no unexpected taints | NotReady, cordoned, or unexpectedly tainted |
| Pods | Running/Completed | CrashLoopBackOff, Pending, Evicted |
| PVCs | Bound | Pending, Lost |
| Longhorn volumes | Healthy, target replica count | Degraded, faulted, rebuilding |
| Backups | Recent, scheduled | No recent backup, failed |
| etcd | Healthy cluster members | Member unreachable, high latency |
| MetalLB | All speakers running | Missing speakers, stale L2 status |
## Setting Up Alerts
Wild Cloud does not currently include a built-in alerting system. For production environments, consider:
1. **Backup scheduling** with verification to catch backup failures early
2. **Periodic health checks** via `wild cluster health` in a cron job
3. **External monitoring** pointing at your app URLs for uptime checks
## Next Steps
- [Cluster Networking Health](cluster-networking-health.md) — Detailed networking health checklist
- [Troubleshoot Cluster](troubleshoot-cluster.md) — When health checks fail
- [Making Backups](making-backups.md) — Set up backup schedules

View File

@@ -1,294 +1,277 @@
# Restoring Backups
This guide will walk you through restoring your applications and cluster from wild-cloud backups. Hopefully you'll never need this, but when you do, it's critical that the process works smoothly.
This guide covers how to restore applications from Wild Cloud backups. The restore system uses a blue-green deployment model — data is restored to a standby namespace so you can verify it before switching traffic.
## Understanding Restore Types
## Understanding the Blue-Green Restore
Your wild-cloud backup system can restore different types of data depending on what you need to recover:
Wild Cloud restores follow a three-phase process:
**Application restores** bring back individual applications by restoring their database contents and file storage. This is what you'll use most often - maybe you accidentally deleted something in Discourse, or Gitea got corrupted, or you want to roll back Immich to before a bad update.
```
Restore → Switch → Cleanup
```
**Cluster restores** are for disaster recovery scenarios where you need to rebuild your entire Kubernetes cluster from scratch. This includes restoring all the cluster's configuration and even its internal state.
1. **Restore**: Creates a standby namespace with restored data. Your active app keeps running untouched.
2. **Switch**: Redirects traffic from the active deployment to the standby. The standby becomes the new active.
3. **Cleanup**: Removes the previous active deployment and resources.
**Configuration restores** bring back your wild-cloud repository and settings, which contain all the "recipes" for how your infrastructure should be set up.
This means restores are safe — if something goes wrong, your active app is still running.
## Before You Start Restoring
## Before You Start
Make sure you have everything needed to perform restores. You need to be in your wild-cloud directory with the environment loaded (`source env.sh`). Your backup repository and password should be configured and working - you can test this by running `restic snapshots` to see your available backups.
Make sure you have:
- kubectl access to your cluster
- Backup destination accessible (same destination where backups were stored)
- The app deployed (or at least added) to your instance
Most importantly, make sure you have kubectl access to your cluster, since restores involve creating temporary pods and manipulating storage.
List available backups first:
```bash
wild backup list gitea
```
## Restoring Applications
### Basic Application Restore
### Full Restore (Automatic)
The most common restore scenario is bringing back a single application. To restore the latest backup of an app:
The simplest approach runs all three phases automatically:
```bash
wild-app-restore discourse
wild restore gitea --auto
```
This restores both the database and all file storage for the discourse app. The restore system automatically figures out what the app needs based on its manifest file and what was backed up.
This restores the latest backup, switches traffic, and cleans up the old deployment in one operation.
If you want to restore from a specific backup instead of the latest:
### Full Restore from Specific Backup
```bash
wild-app-restore discourse abc123
wild restore gitea 20250314T021530Z --auto
```
Where `abc123` is the snapshot ID from `restic snapshots --tag discourse`.
### Step-by-Step Restore (Recommended for Critical Apps)
### Partial Restores
For production apps, run each phase separately so you can verify between steps:
Sometimes you only need to restore part of an application. Maybe the database is fine but the files got corrupted, or vice versa.
To restore only the database:
```bash
wild-app-restore discourse --db-only
```
To restore only the file storage:
```bash
wild-app-restore discourse --pvc-only
```
To restore without database roles and permissions (if they're causing conflicts):
```bash
wild-app-restore discourse --skip-globals
```
### Finding Available Backups
To see what backups are available for an app:
```bash
wild-app-restore discourse --list
```
This shows recent snapshots with their IDs, timestamps, and what was included.
## How Application Restores Work
Understanding what happens during a restore can help when things don't go as expected.
### Database Restoration
When restoring a database, the system first downloads the backup files from your restic repository. It then prepares the database by creating any needed roles, disconnecting existing users, and dropping/recreating the database to ensure a clean restore.
For PostgreSQL databases, it uses `pg_restore` with parallel processing to speed up large database imports. For MySQL, it uses standard mysql import commands. The system also handles database ownership and permissions automatically.
### File Storage Restoration
File storage (PVC) restoration is more complex because it involves safely replacing files that might be actively used by running applications.
First, the system creates a safety snapshot using Longhorn. This means if something goes wrong during the restore, you can get back to where you started. Then it scales your application down to zero replicas so no pods are using the storage.
Next, it creates a temporary utility pod with the PVC mounted and copies all the backup files into place, preserving file permissions and structure. Once the data is restored and verified, it removes the utility pod and scales your application back up.
If everything worked correctly, the safety snapshot is automatically deleted. If something went wrong, the safety snapshot is preserved so you can recover manually.
## Cluster Disaster Recovery
Cluster restoration is much less common but critical when you need to rebuild your entire infrastructure.
### Restoring Kubernetes Resources
To restore all cluster resources from a backup:
**Step 1: Restore to standby**
```bash
# Download cluster backup
restic restore --tag cluster latest --target ./restore/
# Apply all resources
kubectl apply -f restore/cluster/all-resources.yaml
wild restore gitea
```
You can also restore specific types of resources:
```bash
kubectl apply -f restore/cluster/secrets.yaml
kubectl apply -f restore/cluster/configmaps.yaml
```
This creates a standby namespace (e.g., `gitea-green`) with the restored database and files. Your active app continues running in its current namespace.
### Restoring etcd State
**Step 2: Verify the standby**
**Warning: This is extremely dangerous and will affect your entire cluster.**
etcd restoration should only be done when rebuilding a cluster from scratch. For Talos clusters:
Check that the restored app is working:
```bash
talosctl --nodes <control-plane-ip> etcd restore --from ./restore/cluster/etcd-snapshot.db
# Check pods in the standby namespace
kubectl get pods -n gitea-green
# Check logs
kubectl logs -n gitea-green deploy/gitea
# View the recovery plan
wild restore plan gitea
```
This command stops etcd, replaces its data with the backup, and restarts the cluster. Expect significant downtime while the cluster rebuilds itself.
**Step 3: Switch traffic**
## Common Disaster Recovery Scenarios
```bash
wild restore switch gitea
```
This updates the active deployment color in config.yaml and redirects traffic to the standby namespace.
**Step 4: Clean up**
```bash
wild restore cleanup gitea
```
This removes the previous active namespace and resources.
### Web UI
Navigate to **Backups > [app]**, select a backup, and click **Restore**. The UI tracks recovery plan progress through each phase.
## Partial Restores
Restore only specific components:
```bash
# Database only
wild restore gitea --components postgres
# Persistent volumes only
wild restore gitea --components pvc
# Config/manifests only (skip data)
wild restore gitea --skip-data
# Multiple specific components
wild restore gitea --components postgres,pvc
```
## How Each Component Is Restored
### PostgreSQL Databases
The restore creates a standby database named `{dbName}_{standbyColor}` (e.g., `gitea_green`):
1. Downloads the `.dump` file from the backup destination
2. Creates the standby database and user
3. Runs `pg_restore` with the dump file
4. Deploys the app to the standby namespace with kustomize patches that rewrite database connection strings to point to the standby database
### MySQL Databases
Similar to PostgreSQL — creates a standby database, imports the gzip-compressed SQL dump, and patches connection strings.
### Persistent Volumes (Longhorn)
1. Triggers a Longhorn restore from the native backup, creating new volumes with standby naming
2. Generates kustomize patches that bind standby PVCs to the restored volumes via `spec.volumeName`
3. Cache/temp volumes (names containing `-cache` or `-tmp`) are skipped
### Configuration
Extracts the tar.gz archive containing manifests, kustomization, and app-specific config/secrets to the standby app directory.
## Viewing Recovery Plans
Each restore operation creates a recovery plan that tracks progress across all phases:
```bash
wild restore plan gitea
```
The plan shows:
- Current status (restoring, restored, switching, switched, cleaning_up, cleaned_up, or failed)
- Which strategies ran (postgres, longhorn-native, config)
- Per-strategy status and details
- Timestamps for each phase
## Common Restore Scenarios
### Rolling Back After a Bad Update
```bash
# List available backups
wild backup list gitea
# Restore from before the problematic update
wild restore gitea 20250310T020000Z --auto
```
### Complete Application Loss
When an entire application is gone (namespace deleted, pods corrupted, etc.):
If an app's namespace was deleted or corrupted:
```bash
# Make sure the namespace exists
kubectl create namespace discourse --dry-run=client -o yaml | kubectl apply -f -
# Make sure the app is added to the instance
wild app add gitea
# Apply the application manifests if needed
kubectl apply -f apps/discourse/
# Deploy the app (creates namespace and base resources)
wild app deploy gitea
# Restore the application data
wild-app-restore discourse
# Restore data from backup
wild restore gitea --auto
```
### Complete Cluster Rebuild
### Database-Only Recovery
When rebuilding a cluster from scratch:
If the app is running but the database is corrupted:
First, build your new cluster infrastructure and install wild-cloud components. Then configure backup access so you can reach your backup repository.
Restore cluster state:
```bash
restic restore --tag cluster latest --target ./restore/
# Apply etcd snapshot using appropriate method for your cluster type
# Restore only the database to standby
wild restore gitea --components postgres
# Verify the restored database
kubectl exec -n postgres deploy/postgres -- \
psql -U postgres -d gitea_green -c "SELECT count(*) FROM repository;"
# Switch to the restored database
wild restore switch gitea
# Clean up
wild restore cleanup gitea
```
Finally, restore all applications:
```bash
# See what applications are backed up
wild-app-restore --list
### Cross-Cluster Migration
# Restore each application individually
wild-app-restore discourse
wild-app-restore gitea
wild-app-restore immich
On the source cluster:
```bash
wild backup gitea
```
### Rolling Back After Bad Changes
Sometimes you need to undo recent changes to an application:
On the target cluster:
```bash
# See available snapshots
wild-app-restore discourse --list
# Restore from before the problematic changes
wild-app-restore discourse abc123
```
## Cross-Cluster Migration
You can use backups to move applications between clusters:
On the source cluster, create a fresh backup:
```bash
wild-app-backup discourse
```
On the target cluster, deploy the application manifests:
```bash
kubectl apply -f apps/discourse/
```
Then restore the data:
```bash
wild-app-restore discourse
```
## Verifying Successful Restores
After any restore, verify that everything is working correctly.
For databases, check that you can connect and see expected data:
```bash
kubectl exec -n postgres deploy/postgres-deployment -- \
psql -U postgres -d discourse -c "SELECT count(*) FROM posts;"
```
For file storage, check that files exist and applications can start:
```bash
kubectl get pods -n discourse
kubectl logs -n discourse deployment/discourse
```
For web applications, test that you can access them:
```bash
curl -f https://discourse.example.com/latest.json
wild app add gitea
wild app deploy gitea
wild restore gitea --auto
```
## When Things Go Wrong
### No Snapshots Found
### Restore Fails Mid-Way
If the restore system can't find backups for an application, check that snapshots exist:
```bash
restic snapshots --tag discourse
```
If the restore phase fails, your active app is untouched. The standby namespace may contain partial data. You can:
- Fix the issue and retry: `wild restore gitea`
- Check what went wrong: `wild restore plan gitea`
- Clean up the failed standby manually: `kubectl delete namespace gitea-green`
Make sure you're using the correct app name and that backups were actually created successfully.
### Switch Fails
### Database Restore Failures
If the switch phase fails, the standby is fully populated and ready. You can:
- Retry the switch: `wild restore switch gitea`
- Inspect both namespaces and manually update config if needed
Database restores can fail if the target database isn't accessible or if there are permission issues. Check that your postgres or mysql pods are running and that you can connect to them manually.
### App Won't Start After Restore
Review the restore error messages carefully - they usually indicate whether the problem is with the backup file, database connectivity, or permissions.
### PVC Restore Failures
If PVC restoration fails, check that you have sufficient disk space and that the PVC isn't being used by other pods. The error messages will usually indicate what went wrong.
Most importantly, remember that safety snapshots are preserved when PVC restores fail. You can see them with:
```bash
kubectl get snapshot.longhorn.io -n longhorn-system -l app=wild-app-restore
```
These snapshots let you recover to the pre-restore state if needed.
### Application Won't Start After Restore
If pods fail to start after restoration, check file permissions and ownership. Sometimes the restoration process doesn't perfectly preserve the exact permissions that the application expects.
You can also try scaling the application to zero and back to one, which sometimes resolves transient issues:
```bash
kubectl scale deployment/discourse -n discourse --replicas=0
kubectl scale deployment/discourse -n discourse --replicas=1
```
## Manual Recovery
When automated restore fails, you can always fall back to manual extraction and restoration:
Check file permissions and ownership in the restored PVCs. Try scaling to zero and back:
```bash
# Extract backup files to local directory
restic restore --tag discourse latest --target ./manual-restore/
# Manually copy database dump to postgres pod
kubectl cp ./manual-restore/discourse/database_*.dump \
postgres/postgres-deployment-xxx:/tmp/
# Manually restore database
kubectl exec -n postgres deploy/postgres-deployment -- \
pg_restore -U postgres -d discourse /tmp/database_*.dump
kubectl scale deployment/gitea -n gitea-green --replicas=0
kubectl scale deployment/gitea -n gitea-green --replicas=1
```
For file restoration, you'd need to create a utility pod and manually copy files into the PVC.
### No Backups Found
```bash
# List all backups for the app
wild backup list gitea
# Check backup destination is configured
wild config get backup.destination
```
## Verifying a Successful Restore
After any restore, verify:
```bash
# Check pods are running
kubectl get pods -n gitea
# Check logs for errors
kubectl logs -n gitea deploy/gitea
# Test database connectivity
kubectl exec -n postgres deploy/postgres -- \
psql -U postgres -d gitea -c "SELECT 1;"
# Test web access
curl -f https://gitea.example.com/
```
## Best Practices
Test your restore procedures regularly in a non-production environment. It's much better to discover issues with your backup system during a planned test than during an actual emergency.
- **Test restores regularly** in a test environment. Backups are worthless if restore doesn't work.
- **Use step-by-step restore** for production apps so you can verify before switching traffic.
- **Monitor after restore** — watch the app more closely than usual for a few days.
- **Communicate with users** before performing restores that involve downtime.
Always communicate with users before performing restores, especially if they involve downtime. Document any manual steps you had to take so you can improve the automated process.
## Next Steps
After any significant restore, monitor your applications more closely than usual for a few days. Sometimes problems don't surface immediately.
## Security and Access Control
Restore operations are powerful and can be destructive. Make sure only trusted administrators can perform restores, and consider requiring approval or coordination before major restoration operations.
Be aware that cluster restores include all secrets, so they potentially expose passwords, API keys, and certificates. Ensure your backup repository is properly secured.
Remember that Longhorn safety snapshots are preserved when things go wrong. These snapshots may contain sensitive data, so clean them up appropriately once you've resolved any issues.
## What's Next
The best way to get comfortable with restore operations is to practice them in a safe environment. Set up a test cluster and practice restoring applications and data.
Consider creating runbooks for your most likely disaster scenarios, including the specific commands and verification steps for your infrastructure.
Read the [Making Backups](making-backups.md) guide to ensure you're creating the backups you'll need for successful recovery.
- [Making Backups](making-backups.md) — Set up backup schedules and destinations
- [Disaster Recovery](disaster-recovery.md) — Full cluster rebuild from backups

View File

@@ -1,19 +1,136 @@
# Troubleshoot Wild Cloud Cluster issues
# Troubleshoot Wild Cloud Cluster Issues
## Quick Health Check
```bash
# Wild Cloud cluster health (runs multiple checks)
wild cluster health
# Cluster status overview
wild cluster status
```
The web app **Dashboard** also shows health check results with pass/fail details.
## General Troubleshooting Steps
1. **Check Node Status**:
```bash
kubectl get nodes
kubectl describe node <node-name>
```
### 1. Check Node Status
1. **Check Component Status**:
```bash
# Check all pods across all namespaces
kubectl get pods -A
# Look for pods that aren't Running or Ready
kubectl get pods -A | grep -v "Running\|Completed"
```
```bash
# Kubernetes node status
kubectl get nodes -o wide
# Detailed node info (look for conditions, taints, capacity)
kubectl describe node <node-name>
# Talos node health (from Wild Central)
talosctl --talosconfig <talosconfig-path> health --nodes <node-ip>
```
### 2. Check Pod Status
```bash
# All pods across all namespaces
kubectl get pods -A
# Pods that aren't Running or Completed
kubectl get pods -A | grep -v "Running\|Completed"
# Recent events (often reveals scheduling or resource issues)
kubectl get events -A --sort-by='.lastTimestamp' | head -30
```
### 3. Check Control Plane Components
On Talos clusters, control plane components run as static pods:
```bash
# Check control plane pods
kubectl get pods -n kube-system
# Check etcd health
talosctl --talosconfig <talosconfig-path> etcd status --nodes <control-plane-ip>
# Check Talos services
talosctl --talosconfig <talosconfig-path> services --nodes <node-ip>
```
### 4. Check Resource Pressure
```bash
# Node resource usage
kubectl top nodes
# Pod resource usage
kubectl top pods -A --sort-by=memory
```
## Common Issues
### Node Not Ready
```bash
# Check node conditions
kubectl describe node <node-name> | grep -A5 "Conditions:"
# Check Talos logs for the node
talosctl --talosconfig <talosconfig-path> logs kubelet --nodes <node-ip> | tail -50
```
Common causes: network connectivity loss, disk pressure, memory pressure, kubelet crash.
### Pods Stuck in Pending
```bash
# Check why the pod can't be scheduled
kubectl describe pod <pod-name> -n <namespace>
```
Common causes: insufficient resources, node affinity/taint mismatch, PVC not bound.
### Pods in CrashLoopBackOff
```bash
# Check container logs
kubectl logs <pod-name> -n <namespace> --previous
# Check events for the pod
kubectl describe pod <pod-name> -n <namespace>
```
Common causes: missing config/secrets, database not reachable, permission errors.
### etcd Issues
```bash
# Check etcd members
talosctl --talosconfig <talosconfig-path> etcd members --nodes <control-plane-ip>
# Check etcd health
talosctl --talosconfig <talosconfig-path> etcd status --nodes <control-plane-ip>
# If etcd has a stale member (node was replaced)
talosctl --talosconfig <talosconfig-path> etcd remove-member <stale-node-name> --nodes <healthy-node-ip>
```
### Lost Connectivity to Cluster
If `kubectl` and `talosctl` can't reach the cluster:
```bash
# Check if the VIP is responding
ping <control-plane-vip>
# Try reaching individual node IPs directly
talosctl --talosconfig <talosconfig-path> version --nodes <node-ip>
# Regenerate kubeconfig if needed
wild cluster kubeconfig --generate
```
## Related Guides
- [Cluster Networking Health](cluster-networking-health.md) — Full networking stack checklist
- [Troubleshoot DNS](troubleshoot-dns.md) — DNS resolution issues
- [Troubleshoot Service Connectivity](troubleshoot-service-connectivity.md) — Inter-service communication
- [Disaster Recovery](disaster-recovery.md) — Rebuilding from scratch

View File

@@ -1,20 +1,98 @@
# Troubleshoot DNS
If DNS resolution isn't working properly:
Wild Cloud uses two DNS layers: **CoreDNS** inside the cluster for service discovery, and **dnsmasq** on Wild Central for LAN-local domain resolution.
1. Check CoreDNS status:
```bash
kubectl get pods -n kube-system -l k8s-app=kube-dns
kubectl logs -l k8s-app=kube-dns -n kube-system
```
## Cluster DNS (CoreDNS)
2. Verify CoreDNS configuration:
```bash
kubectl get configmap -n kube-system coredns -o yaml
```
If pods can't resolve service names or external domains:
3. Test DNS resolution from inside the cluster:
```bash
kubectl run -i --tty --rm debug --image=busybox --restart=Never -- nslookup kubernetes.default
```
### 1. Check CoreDNS Status
```bash
kubectl get pods -n kube-system -l k8s-app=kube-dns
kubectl logs -l k8s-app=kube-dns -n kube-system
```
### 2. Verify CoreDNS Configuration
```bash
kubectl get configmap -n kube-system coredns -o yaml
```
### 3. Test DNS Resolution from Inside the Cluster
```bash
# Test cluster-internal DNS
kubectl run -i --tty --rm debug --image=busybox --restart=Never -- \
nslookup kubernetes.default.svc.cluster.local
# Test external DNS resolution
kubectl run -i --tty --rm debug --image=busybox --restart=Never -- \
nslookup google.com
```
### 4. Check Upstream DNS Reachability
Talos uses a DNS proxy at `169.254.116.108`. If CoreDNS can't resolve external names:
```bash
# Check if the Talos DNS proxy is responding
talosctl --talosconfig <talosconfig-path> get resolvers --nodes <node-ip>
```
## LAN DNS (dnsmasq on Wild Central)
If devices on your LAN can't resolve Wild Cloud domains (e.g., `gitea.cloud.example.com`):
### 1. Check dnsmasq Status
```bash
wild dns status
```
### 2. View Current Configuration
```bash
wild dns config
```
### 3. Test LAN DNS Resolution
From a device on the LAN:
```bash
# Query Wild Central directly
nslookup gitea.cloud.example.com <wild-central-ip>
# Compare with public DNS
nslookup gitea.cloud.example.com 8.8.8.8
```
### 4. Restart dnsmasq
```bash
wild dns restart
```
### 5. Regenerate Configuration
If DNS entries are missing or stale:
```bash
# Preview changes
wild dns update --dry-run
# Apply
wild dns update
```
## Common Issues
**LAN devices can't resolve Wild Cloud domains**: Ensure your router is configured to use Wild Central's IP as its DNS server.
**Pods can resolve cluster services but not external domains**: Check CoreDNS upstream forwarder configuration and Talos DNS proxy health.
**DNS works but only after a long delay**: Check for timeouts in the CoreDNS forwarder chain, and verify that the external resolver configured in `cluster.internalDns.externalResolver` is reachable and responsive.
## Related Guides
- [Troubleshoot Service Visibility](troubleshoot-visibility.md) — Full external access troubleshooting
- [Cluster Networking Health](cluster-networking-health.md) — DNS is covered by items #5–6 on the checklist

View File

@@ -1,18 +1,67 @@
# Troubleshoot Service Connectivity
If services can't communicate:
If services within the cluster can't communicate with each other:
1. Check network policies:
```bash
kubectl get networkpolicies -A
```
## 1. Check Network Policies
2. Verify service endpoints:
```bash
kubectl get endpoints -n <namespace>
```
```bash
kubectl get networkpolicies -A
```
3. Test connectivity from within the cluster:
```bash
kubectl run -i --tty --rm debug --image=busybox --restart=Never -- wget -O- <service-name>.<namespace>
```
Wild Cloud doesn't create restrictive network policies by default, but CrowdSec or custom policies may be blocking traffic.
## 2. Verify Service Endpoints
```bash
# Check that the service has endpoints
kubectl get endpoints -n <namespace>
# A service with no endpoints means no pods match its selector
kubectl describe svc <service-name> -n <namespace>
```
## 3. Test Connectivity from Within the Cluster
```bash
# Start a debug pod
kubectl run -i --tty --rm debug --image=busybox --restart=Never -- sh
# Inside the pod:
# Test DNS resolution
nslookup <service-name>.<namespace>.svc.cluster.local
# Test HTTP connectivity
wget -O- http://<service-name>.<namespace>.svc.cluster.local:<port>
```
## 4. Check Cross-Node Connectivity
If services on different nodes can't communicate:
```bash
# Verify Flannel (CNI) pods are running on every node
kubectl get pods -n kube-system -l app=flannel -o wide
# Check for stale VXLAN tunnels
talosctl --talosconfig <talosconfig-path> get links --nodes <node-ip> | grep flannel
```
## 5. Check kube-proxy
```bash
# Verify kube-proxy is running on all nodes
kubectl get pods -n kube-system -l k8s-app=kube-proxy -o wide
```
## Common Issues
**App can't reach its database**: Check that the database pod is running, the service name matches what the app expects, and the database namespace is correct.
**Intermittent connectivity failures**: Often caused by a Flannel pod crash or stale routing. Restart the Flannel pod on the affected node.
**CrowdSec blocking legitimate traffic**: Check CrowdSec decisions and bouncer status. See the CrowdSec service logs: `wild service logs crowdsec`.
## Related Guides
- [Cluster Networking Health](cluster-networking-health.md) — Full networking stack checklist
- [Troubleshoot DNS](troubleshoot-dns.md) — If the issue is DNS-related

View File

@@ -1,24 +1,96 @@
# Troubleshoot TLS Certificates
If services show invalid certificates:
Wild Cloud uses cert-manager with Let's Encrypt for TLS certificates. Two shared wildcard certificates are issued and copied to app namespaces during deployment:
1. Check certificate status:
```bash
kubectl get certificates -A
```
- `wildcard-wild-cloud-tls` — public domain (e.g., `*.cloud.example.com`)
- `wildcard-internal-wild-cloud-tls` — internal domain (e.g., `*.internal.cloud.example.com`)
2. Examine certificate details:
```bash
kubectl describe certificate <cert-name> -n <namespace>
```
## 1. Check Certificate Status
3. Check for cert-manager issues:
```bash
kubectl get pods -n cert-manager
kubectl logs -l app=cert-manager -n cert-manager
```
```bash
kubectl get certificates -A
```
4. Verify the Cloudflare API token is correctly set up:
```bash
kubectl get secret cloudflare-api-token -n internal
```
Look for `Ready: True`. If `False`, describe the certificate for details:
```bash
kubectl describe certificate <cert-name> -n cert-manager
```
## 2. Check cert-manager Pods
```bash
kubectl get pods -n cert-manager
kubectl logs -l app=cert-manager -n cert-manager | tail -50
```
## 3. Check Certificate Orders and Challenges
```bash
# Check pending orders
kubectl get orders -A
# Check active challenges
kubectl get challenges -A
# Describe a failing challenge for details
kubectl describe challenge <challenge-name> -n cert-manager
```
## 4. Verify the Cloudflare API Token
cert-manager uses DNS-01 validation via Cloudflare. Verify the token is present:
```bash
kubectl get secret cloudflare-api-token -n cert-manager
```
If the token is missing or invalid, check your secrets and redeploy cert-manager:
```bash
wild service install cert-manager
```
## 5. Check the ClusterIssuer
```bash
kubectl get clusterissuers
kubectl describe clusterissuer letsencrypt-prod
```
Look for `Status: True` on the Ready condition. If using staging for testing:
```bash
kubectl describe clusterissuer letsencrypt-staging
```
## 6. Force Certificate Renewal
If a certificate is stuck, delete it and let cert-manager re-issue:
```bash
kubectl delete certificate <cert-name> -n cert-manager
```
cert-manager will automatically create a new certificate request.
## 7. Repair Certificates Script
cert-manager includes a repair script for bulk certificate issues:
```bash
# If cert-manager was installed via Wild Cloud
kubectl exec -n cert-manager deploy/cert-manager -- /scripts/repair-certificates.sh
```
## Common Issues
**Challenge fails with NXDOMAIN**: ExternalDNS hasn't created the DNS record yet, or Cloudflare zone ID is wrong. Check `cluster.certManager.cloudflare.zoneID` in config.
**Rate limited by Let's Encrypt**: Production rate limits are 50 certificates per domain per week. Switch to `letsencrypt-staging` for testing.
**Certificate exists but app shows invalid cert**: The wildcard secret may not have been copied to the app namespace. Redeploy the app: `wild app deploy <app>`.
## Related Guides
- [Troubleshoot Service Visibility](troubleshoot-visibility.md) — TLS is one layer of the visibility stack

View File

@@ -6,10 +6,10 @@ This guide covers common issues with accessing services from outside the cluster
External access to your services might fail for several reasons:
1. **DNS Resolution Issues** - Domain names not resolving to the correct IP address
2. **Network Connectivity Issues** - Traffic can't reach the cluster's external IP
3. **TLS Certificate Issues** - Invalid or missing certificates
4. **Ingress/Service Configuration Issues** - Incorrectly configured routing
1. **DNS Resolution Issues** — Domain names not resolving to the correct IP address
2. **Network Connectivity Issues** — Traffic can't reach the cluster's external IP
3. **TLS Certificate Issues** — Invalid or missing certificates
4. **Ingress/Service Configuration Issues** — Incorrectly configured routing
## Diagnostic Steps
@@ -18,8 +18,8 @@ External access to your services might fail for several reasons:
**Symptoms:**
- Browser shows "site cannot be reached" or "server IP address could not be found"
- `ping` or `nslookup` commands fail for your domain
- Your service DNS records don't appear in CloudFlare or your DNS provider
- `nslookup` fails for your domain
- DNS records don't appear in Cloudflare
**Checks:**
@@ -31,35 +31,34 @@ nslookup yourservice.yourdomain.com
kubectl get pods -n externaldns
# Check ExternalDNS logs for errors
kubectl logs -n externaldns -l app=external-dns < /dev/null | grep -i error
kubectl logs -n externaldns -l app=external-dns | grep -i error
kubectl logs -n externaldns -l app=external-dns | grep -i "your-service-name"
# Check if CloudFlare API token is configured correctly
# Verify the Cloudflare API token secret exists
kubectl get secret cloudflare-api-token -n externaldns
```
**Common Issues:**
a) **ExternalDNS Not Running**: The ExternalDNS pod is not running or has errors.
a) **ExternalDNS Not Running**: The pod is not running or has errors.
b) **Cloudflare API Token Issues**: The API token is invalid, expired, or doesn't have the right permissions.
b) **Cloudflare API Token Issues**: Token is invalid, expired, or lacks permissions.
c) **Domain Filter Mismatch**: ExternalDNS is configured with a `--domain-filter` that doesn't match your domain.
c) **Domain Filter Mismatch**: ExternalDNS `--domain-filter` doesn't match your domain.
d) **Annotations Missing**: Service or Ingress is missing the required ExternalDNS annotations.
d) **Annotations Missing**: Ingress is missing the required ExternalDNS annotations.
**Solutions:**
```bash
# 1. Recreate CloudFlare API token secret
# 1. Recreate Cloudflare API token secret
kubectl create secret generic cloudflare-api-token \
--namespace externaldns \
--from-literal=api-token="your-api-token" \
--dry-run=client -o yaml | kubectl apply -f -
# 2. Check and set proper annotations on your Ingress:
kubectl annotate ingress your-ingress -n your-namespace \
external-dns.alpha.kubernetes.io/hostname=your-service.your-domain.com
# 2. Verify ingress annotations
kubectl get ingress -n <app-namespace> -o yaml | grep external-dns
# 3. Restart ExternalDNS
kubectl rollout restart deployment -n externaldns external-dns
@@ -82,26 +81,31 @@ kubectl get pods -n metallb-system
# Check MetalLB IP address pool
kubectl get ipaddresspools.metallb.io -n metallb-system
# Verify the service has an external IP
kubectl get svc -n your-namespace your-service
# Verify the Traefik service has an external IP
kubectl get svc -n traefik
```
**Common Issues:**
a) **MetalLB Configuration**: The IP pool doesn't match your network or is exhausted.
a) **MetalLB Configuration**: IP pool doesn't match your network or is exhausted.
b) **Firewall Issues**: Firewall is blocking traffic to your cluster's external IP.
b) **MetalLB L2 Announcements**: Stale ServiceL2Status entries blocking ARP announcements.
c) **Router Configuration**: NAT or port forwarding issues if using a router.
c) **Firewall Issues**: Firewall blocking traffic to the cluster's load balancer IP.
d) **Router Configuration**: NAT or port forwarding issues.
**Solutions:**
```bash
# 1. Check and update MetalLB configuration
kubectl apply -f infrastructure_setup/metallb/metallb-pool.yaml
# Check MetalLB L2 advertisement status
kubectl get servicel2statuses.metallb.io -n metallb-system
# 2. Check service external IP assignment
kubectl describe svc -n your-namespace your-service
# Verify MetalLB speaker pods are running on all nodes
kubectl get pods -n metallb-system -l component=speaker -o wide
# Reinstall MetalLB if configuration is wrong
wild service install metallb
```
### 3. Check TLS Certificates
@@ -110,40 +114,42 @@ kubectl describe svc -n your-namespace your-service
- Browser shows certificate errors
- "Your connection is not private" warnings
- Cert-manager logs show errors
- cert-manager logs show errors
**Checks:**
```bash
# Check certificate status
kubectl get certificates -A
kubectl get certificates -n cert-manager
# Check cert-manager logs
kubectl logs -n cert-manager -l app=cert-manager
kubectl logs -n cert-manager -l app=cert-manager | tail -30
# Check if your ingress is using the correct certificate
kubectl get ingress -n your-namespace your-ingress -o yaml
# Check if the ingress is using the correct TLS secret
kubectl get ingress -n <app-namespace> -o yaml | grep secretName
```
**Common Issues:**
a) **Certificate Issuance Failures**: DNS validation or HTTP validation failing.
a) **Certificate Issuance Failures**: DNS-01 validation failing (Cloudflare token or zone ID wrong).
b) **Wrong Secret Referenced**: Ingress is referencing a non-existent certificate secret.
b) **Wrong Secret Referenced**: Ingress referencing a non-existent secret.
c) **Expired Certificate**: Certificate has expired and wasn't renewed.
c) **Secret Not Copied**: Wildcard TLS secret not copied to the app namespace during deploy.
**Solutions:**
```bash
# 1. Check and recreate certificates
kubectl apply -f infrastructure_setup/cert-manager/wildcard-certificate.yaml
# Force re-issue certificates
kubectl delete certificate wildcard-wild-cloud-tls -n cert-manager
# cert-manager will automatically re-create it
# 2. Update ingress to use correct secret
kubectl patch ingress your-ingress -n your-namespace --type=json \
-p='[{"op": "replace", "path": "/spec/tls/0/secretName", "value": "correct-secret-name"}]'
# Redeploy the app to copy TLS secrets to its namespace
wild app deploy <app>
```
See [Troubleshoot TLS Certificates](troubleshoot-tls-certificates.md) for detailed cert debugging.
### 4. Check Ingress Configuration
**Symptoms:**
@@ -156,18 +162,18 @@ kubectl patch ingress your-ingress -n your-namespace --type=json \
```bash
# Check ingress status
kubectl get ingress -n your-namespace
kubectl get ingress -n <app-namespace>
# Check Traefik logs
kubectl logs -n kube-system -l app.kubernetes.io/name=traefik
kubectl logs -n traefik -l app=traefik | tail -30
# Check ingress configuration
kubectl describe ingress -n your-namespace your-ingress
# Check ingress details
kubectl describe ingress -n <app-namespace> <ingress-name>
```
**Common Issues:**
a) **Incorrect Service Targeting**: Ingress is pointing to wrong service or port.
a) **Incorrect Service Targeting**: Ingress pointing to wrong service or port.
b) **Traefik Configuration**: IngressClass or middleware issues.
@@ -176,71 +182,86 @@ c) **Path Configuration**: Incorrect path prefixes or regex.
**Solutions:**
```bash
# 1. Verify ingress configuration
kubectl edit ingress -n your-namespace your-ingress
# Verify the referenced service exists and has endpoints
kubectl get svc -n <app-namespace>
kubectl get endpoints -n <app-namespace>
# 2. Check that the referenced service exists
kubectl get svc -n your-namespace
# 3. Restart Traefik if needed
kubectl rollout restart deployment -n kube-system traefik
# Restart Traefik
kubectl rollout restart deployment -n traefik traefik
```
## Advanced Diagnostics
For more complex issues, you can use port-forwarding to test services directly:
### Port-Forward to Test Directly
Bypass ingress and test the service directly:
```bash
# Port-forward the service directly
kubectl port-forward -n your-namespace svc/your-service 8080:80
# Port-forward the service
kubectl port-forward -n <app-namespace> svc/<service-name> 8080:80
# Then test locally
# Test locally
curl http://localhost:8080
```
You can also deploy a debug pod to test connectivity from inside the cluster:
### Debug Pod for In-Cluster Testing
```bash
# Start a debug pod
kubectl run -i --tty --rm debug --image=busybox --restart=Never -- sh
# Inside the pod, test DNS and connectivity
nslookup your-service.your-namespace.svc.cluster.local
wget -O- http://your-service.your-namespace.svc.cluster.local
nslookup <service-name>.<namespace>.svc.cluster.local
wget -O- http://<service-name>.<namespace>.svc.cluster.local
```
### LAN DNS (dnsmasq)
If services are reachable from outside the LAN but not from within:
```bash
# Check dnsmasq status on Wild Central
wild dns status
# Verify dnsmasq resolves your domain correctly
nslookup yourservice.yourdomain.com <wild-central-ip>
# Regenerate dnsmasq config if entries are stale
wild dns update
```
This is typically a hairpin NAT issue — dnsmasq on Wild Central resolves LAN-local domains to the cluster's load balancer IP so internal devices don't need to go through the router.
## ExternalDNS Specifics
ExternalDNS can be particularly troublesome. Here are specific debugging steps:
1. **Check Log Level**: Set `--log-level=debug` for more detailed logs
1. **Check Log Level**: Set `--log-level=debug` for detailed logs
2. **Check Domain Filter**: Ensure `--domain-filter` includes your domain
3. **Check Provider**: Ensure `--provider=cloudflare` (or your DNS provider)
4. **Verify API Permissions**: CloudFlare token needs Zone.Zone and Zone.DNS permissions
3. **Check Provider**: Ensure `--provider=cloudflare`
4. **Verify API Permissions**: Cloudflare token needs Zone.Zone (Read) and Zone.DNS (Edit) permissions
5. **Check TXT Records**: ExternalDNS uses TXT records for ownership tracking
```bash
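# Enable verbose logging: add (or change) --log-level=debug in the container args.
# Note: --log-level is a command-line argument, not an environment variable, so
# `kubectl set env` cannot set it. Saving the edit triggers a rollout.
kubectl edit deployment external-dns -n externaldns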
# Check for specific domain errors
kubectl logs -n externaldns -l app=external-dns | grep -i yourservice.yourdomain.com
```
## CloudFlare Specific Issues
## Cloudflare Specific Issues
When using Cloudflare, additional issues may arise:
1. **API Rate Limiting**: Cloudflare may rate limit frequent API calls
2. **DNS Propagation**: Changes may take time to propagate through Cloudflare's network
3. **Proxied Records**: The `external-dns.alpha.kubernetes.io/cloudflare-proxied` annotation controls whether Cloudflare proxies traffic
4. **API Token Permissions**: Token must have Zone:Zone:Read and Zone:DNS:Edit permissions
5. **Zone Detection**: If using subdomains, ensure the parent domain is in the domain filter
1. **API Rate Limiting**: CloudFlare may rate limit frequent API calls
2. **DNS Propagation**: Changes may take time to propagate through CloudFlare's CDN
3. **Proxied Records**: The `external-dns.alpha.kubernetes.io/cloudflare-proxied` annotation controls whether CloudFlare proxies traffic
4. **Access Restrictions**: CloudFlare Access or Page Rules may restrict access
5. **API Token Permissions**: The token must have Zone:Zone:Read and Zone:DNS:Edit permissions
6. **Zone Detection**: If using subdomains, ensure the parent domain is included in the domain filter
Check CloudFlare dashboard for:
Check the Cloudflare dashboard for:
- DNS record existence
- API access logs
- DNS settings including proxy status
- Any error messages or rate limit warnings
- Rate limit warnings
## Related Guides
- [Troubleshoot DNS](troubleshoot-dns.md) — Cluster and LAN DNS issues
- [Troubleshoot TLS Certificates](troubleshoot-tls-certificates.md) — Certificate-specific debugging
- [Cluster Networking Health](cluster-networking-health.md) — Full networking stack checklist

View File

@@ -1,3 +1,173 @@
# Upgrade Applications
TBD
This guide covers upgrading Wild Cloud applications to newer versions from the Wild Directory.
## Check for Available Updates
### Web UI
The **Apps > Installed** page shows update indicators when a newer version is available in the Wild Directory. Click an app to see the current and available versions.
### CLI
```bash
# Show the upgrade plan for a specific app
wild app upgrade-plan gitea
```
The upgrade plan shows:
- Current installed version
- Target version in the Wild Directory
- Whether the upgrade is direct or requires waypoints
- Whether a backup is recommended or required
## Simple Upgrade (No Breaking Changes)
Most app updates are straightforward — a new container image tag with compatible config:
```bash
# Update the app from the Wild Directory
wild app update gitea
```
This will:
1. Fetch the latest version from the Wild Directory
2. Merge any new defaultConfig fields with your existing config
3. Recompile templates
4. Deploy the updated manifests
### Step-by-Step (Review Before Deploy)
For more control:
```bash
# Check what will change
wild app upgrade-plan gitea
# Fetch updated files without deploying
wild app update gitea --no-deploy
# Review the changes in your instance data directory
# (e.g., diff the compiled manifests)
# Deploy when ready
wild app deploy gitea
```
## Waypoint Upgrades (Breaking Changes)
Some apps require stepping through intermediate versions due to database schema changes or incompatible config formats. The upgrade system handles this automatically.
### How Waypoints Work
If an app defines upgrade routing rules (in `app.yaml`), the system computes a multi-step upgrade path:
```
Current: 1.5.0 → Waypoint: 2.0.0 (slot "2") → Target: 3.0.0 (slot "3")
```
Each step may include:
- **Pre-deploy migrations**: Database schema changes needed before the new version starts
- **Post-deploy migrations**: Data backfills or cleanup after the new version is running
- **Config migrations**: Automatic renaming of config keys
### Running a Waypoint Upgrade
```bash
# View the full upgrade plan
wild app upgrade-plan discourse
# Run the upgrade (handles all steps automatically)
wild app update discourse
```
The system processes each waypoint in order, running migrations at each step.
## Backup Before Upgrading
The upgrade plan will indicate when backups are recommended or required:
- **Required**: The upgrade won't proceed without a backup. Create one first:
```bash
wild backup gitea
```
- **Recommended**: The upgrade will proceed but warns you. Create a backup for safety:
```bash
wild backup gitea
```
## Rolling Back After a Bad Upgrade
If an upgrade causes problems, restore from your pre-upgrade backup:
```bash
# List available backups
wild backup list gitea
# Restore the pre-upgrade backup
wild restore gitea <pre-upgrade-timestamp> --auto
```
If you didn't take a backup, you can try reverting to the previous version:
```bash
# Re-add the old version (if the Wild Directory still has it as a waypoint)
wild app add gitea --version <old-version>
wild app deploy gitea
```
## Infrastructure Service Upgrades
Infrastructure services (MetalLB, Traefik, cert-manager, etc.) follow a similar update process, using the `wild service` commands:
```bash
# Check status
wild service status traefik
# Update
wild service update traefik
```
Or reinstall from the Wild Directory:
```bash
wild service install traefik --fetch
```
## Troubleshooting
### App won't start after upgrade
```bash
# Check pod status and logs
kubectl get pods -n <app>
kubectl logs -n <app> deploy/<app>
# Check events for scheduling or resource issues
kubectl get events -n <app> --sort-by='.lastTimestamp'
```
### Database migration failed
Check the migration job:
```bash
kubectl get jobs -n <app>
kubectl logs job/<migration-job-name> -n <app>
```
Migration jobs are designed to be idempotent — you can re-run the upgrade after fixing the issue.
### Config key errors after upgrade
If templates reference old config keys, the upgrade may have included `configMigrations` that didn't run. Check the app's manifest for renamed keys and update your config manually:
```bash
wild config show | grep <app>
```
## Related Guides
- [Making Backups](making-backups.md) — Always back up before upgrading
- [Restoring Backups](restoring-backups.md) — Rolling back after a bad upgrade

View File

@@ -1,3 +1,66 @@
# Upgrade Kubernetes
TBD
In Wild Cloud, Kubernetes is bundled with Talos Linux. Upgrading Kubernetes means upgrading Talos to a version that includes the desired Kubernetes release.
## How It Works
Each Talos version ships with a specific Kubernetes version. When you upgrade a node's Talos version, the Kubernetes components on that node are upgraded automatically.
Check which Kubernetes version is bundled with a Talos release in the [Talos release notes](https://www.talos.dev/latest/introduction/what-is-new/).
## Check Current Versions
```bash
# Current Kubernetes version
wild cluster status
# Current Talos and Kubernetes versions per node
wild node list
# Client and server versions as reported by kubectl
kubectl version
```
## Upgrade Process
Since Kubernetes upgrades are part of Talos upgrades, follow the [Upgrade Talos](upgrade-talos.md) guide. The key points:
1. **Identify the target Talos version** that includes the Kubernetes version you want
2. **Upgrade worker nodes first**, one at a time
3. **Upgrade control plane nodes last**, one at a time, verifying cluster health after each one
4. **Verify** the cluster is healthy after all nodes are upgraded
```bash
# After upgrading all nodes, verify the Kubernetes version
wild cluster status
kubectl version
```
## Kubernetes Version Skew
Talos enforces Kubernetes version compatibility automatically. Within a multi-node cluster during a rolling upgrade:
- Control plane components can differ by at most 1 minor version
- kubelet can be up to 2 minor versions behind the API server
Upgrade nodes one at a time to stay within these bounds.
## Troubleshooting
### Pods stuck after Kubernetes upgrade
Some workloads may need to be restarted after a Kubernetes upgrade:
```bash
# Restart all deployments in a namespace
kubectl rollout restart deployment -n <namespace>
```
### API incompatibilities
If you skip multiple Kubernetes minor versions, deprecated APIs may break manifests. Check the [Kubernetes deprecation guide](https://kubernetes.io/docs/reference/using-api/deprecation-guide/) for removed APIs.
## Related Guides
- [Upgrade Talos](upgrade-talos.md) — The actual upgrade procedure
- [Troubleshoot Cluster](troubleshoot-cluster.md) — Post-upgrade issues

View File

@@ -1,3 +1,147 @@
# Upgrade Talos
TBD
This guide covers upgrading Talos Linux on your cluster nodes. Talos upgrades update the OS and core components on each node individually.
## Prerequisites
- Cluster is healthy: `wild cluster health`
- You know your current Talos version: `wild node list`
- You know the target version: `wild talos versions`
## Check Available Versions
```bash
# List stable Talos versions
wild talos versions
# Include pre-release versions
wild talos versions --all
```
## Validate Schematic Compatibility
Before upgrading, verify your node schematic (system extensions like NVIDIA drivers, NFS, etc.) is compatible with the target version:
```bash
wild talos validate <schematic-id> <target-version>
```
If extensions are missing for the target version, you may need to create a new schematic at [Image Factory](https://factory.talos.dev).
## Upgrade a Node
```bash
wild node upgrade <hostname> <target-version>
```
This will:
1. Validate the schematic-version compatibility
2. Check that the local `talosctl` is compatible (max 1 minor version gap)
3. Cordon the node (prevent new workloads from scheduling)
4. Drain the node (evict running workloads)
5. Upgrade Talos using the Image Factory installer
6. Reboot the node
7. Update the node's version in config.yaml
The operation runs asynchronously. Monitor progress with:
```bash
wild operation list
```
## Upgrade Order
For multi-node clusters, upgrade nodes one at a time:
1. **Worker nodes first** — least disruptive
2. **Control plane nodes last** — one at a time, verifying etcd health after each upgrade
```bash
# Upgrade worker nodes
wild node upgrade worker-1 v1.11.5
# Wait for worker-1 to be Ready
wild node upgrade worker-2 v1.11.5
# Then control plane nodes (one at a time)
wild node upgrade control-1 v1.11.5
# Verify etcd and cluster health
wild cluster health
wild node upgrade control-2 v1.11.5
wild cluster health
wild node upgrade control-3 v1.11.5
```
## Rollback
Talos uses an A/B image scheme — the previous version is always available. If an upgrade causes problems:
```bash
wild node rollback <hostname>
```
This reverts the node to its previous Talos version and reboots.
## Upgrade talosctl
If the target Talos version requires a newer talosctl (the client must be within 1 minor version of the node), upgrade talosctl on Wild Central first:
```bash
# Check current talosctl version
wild talos client
# Upgrade talosctl
wild talos client upgrade <version>
```
## Update the Instance Schematic
To change the default Talos schematic and version for your instance (used when adding new nodes):
```bash
wild config set cluster.nodes.talos.version v1.11.5
wild config set cluster.nodes.talos.schematicId <new-schematic-id>
```
Or via the API:
```bash
curl -X PUT http://localhost:5055/api/v1/instances/{instance}/schematic \
-H "Content-Type: application/json" \
-d '{"schematicId": "<id>", "version": "v1.11.5"}'
```
## Troubleshooting
### Node stuck after upgrade
```bash
# Check Talos services
talosctl --talosconfig <talosconfig-path> services --nodes <node-ip>
# Check Talos logs
talosctl --talosconfig <talosconfig-path> logs kubelet --nodes <node-ip>
# If the node won't come back, rollback
wild node rollback <hostname>
```
### talosctl version mismatch
```
Error: talosctl version too old for target version
```
Upgrade talosctl first: `wild talos client upgrade <version>`
### Schematic not available for target version
Create a new schematic at [factory.talos.dev](https://factory.talos.dev) with the extensions you need for the new version, then use the new schematic ID:
```bash
wild node upgrade <hostname> <version> --schematic-id <new-id>
```
## Related Guides
- [Upgrade Kubernetes](upgrade-kubernetes.md) — Kubernetes version upgrades
- [Troubleshoot Cluster](troubleshoot-cluster.md) — When upgrades cause issues

View File

@@ -1,3 +1,70 @@
# Upgrade Wild Cloud
TBD
This guide covers upgrading Wild Cloud Central itself — the API, CLI, and web app that run on your Wild Central device.
## Check Current Version
```bash
wild version
```
This shows the CLI version and, if connected, the API version.
## Upgrade via apt
If Wild Cloud Central was installed via the `.deb` package:
```bash
# Download the latest .deb package from the releases page
# https://git.civilsociety.dev/wild-cloud/wild-cloud/releases
# Install the update
sudo dpkg -i wild-cloud-central_<version>_<arch>.deb
sudo apt-get install -f # Fix any dependency issues
# Restart the service
sudo systemctl restart wild-cloud-central
```
## Verify the Upgrade
```bash
# Check the service is running
sudo systemctl status wild-cloud-central
# Check the version
wild version
# Verify API is accessible
wild daemon status
```
## What Gets Upgraded
The Wild Cloud Central package includes:
- **Wild API** — the daemon that manages your instances
- **Wild CLI** — the `wild` command-line tool
- **Wild Web App** — the browser-based management interface
All three components share the same version number.
## Data Compatibility
Wild Cloud upgrades are backward-compatible with your instance data. Your `config.yaml`, `secrets.yaml`, compiled manifests, and Kubernetes state are not modified by the upgrade.
If a new version introduces new configuration fields, they will use defaults until you configure them.
## Downgrading
To downgrade, install the older `.deb` package:
```bash
sudo dpkg -i wild-cloud-central_<older-version>_<arch>.deb
sudo systemctl restart wild-cloud-central
```
## Related Guides
- [Upgrade Talos](upgrade-talos.md) — Upgrading the OS on cluster nodes
- [Upgrade Applications](upgrade-applications.md) — Upgrading deployed apps

View File

@@ -1,4 +1,4 @@
import { useState, useEffect } from 'react';
import { useState } from 'react';
import {
Dialog,
DialogContent,
@@ -9,16 +9,7 @@ import {
} from './ui/dialog';
import { Button } from './ui/button';
import { Label } from './ui/label';
import {
Select,
SelectContent,
SelectItem,
SelectTrigger,
SelectValue,
} from './ui/select';
import { Loader2, AlertCircle, Clock, HardDrive, CheckCircle, Package } from 'lucide-react';
import { useDeployedApps } from '../hooks/useApps';
import { useAppBackups } from '../hooks/useBackups';
import { Loader2, AlertCircle, Clock, HardDrive, CheckCircle } from 'lucide-react';
interface Backup {
timestamp: string;
@@ -29,11 +20,11 @@ interface BackupRestoreModalProps {
isOpen: boolean;
onClose: () => void;
mode: 'backup' | 'restore';
appName?: string;
appName: string;
instanceName?: string;
backups?: Backup[];
isLoading?: boolean;
onConfirm: (backupId?: string, appName?: string) => void;
onConfirm: (backupId?: string) => void;
isPending?: boolean;
}
@@ -41,56 +32,25 @@ export function BackupRestoreModal({
isOpen,
onClose,
mode,
appName: initialAppName,
instanceName,
appName,
backups = [],
isLoading = false,
onConfirm,
isPending = false,
}: BackupRestoreModalProps) {
const [selectedBackupTimestamp, setSelectedBackupTimestamp] = useState<string | null>(null);
const [selectedApp, setSelectedApp] = useState<string>(initialAppName || '');
// For restore mode when no app is pre-selected
const { apps: deployedApps, isLoading: isLoadingApps } = useDeployedApps(
mode === 'restore' && !initialAppName ? instanceName : null
);
// Get backups for selected app
const { backups: appBackups, isLoading: isLoadingBackups } = useAppBackups(
mode === 'restore' && selectedApp ? instanceName : null,
selectedApp || null
);
// Update selected app when prop changes
useEffect(() => {
if (initialAppName) {
setSelectedApp(initialAppName);
}
}, [initialAppName]);
// Use provided backups or fetch them
const backupsToShow = initialAppName ? backups : (
appBackups?.filter(b => b.status === 'backed_up').map(b => ({
timestamp: b.timestamp,
size: undefined, // Size computed at call site
})) || []
);
const isLoadingData = isLoading || isLoadingApps || isLoadingBackups;
const handleConfirm = () => {
if (mode === 'backup') {
onConfirm();
} else if (mode === 'restore' && selectedBackupTimestamp && selectedApp) {
onConfirm(selectedBackupTimestamp, selectedApp);
} else if (mode === 'restore' && selectedBackupTimestamp) {
onConfirm(selectedBackupTimestamp);
}
onClose();
};
const formatTimestamp = (timestamp: string) => {
try {
// Handle format: 20260301T090145Z -> 2026-03-01T09:01:45Z
if (timestamp.match(/^\d{8}T\d{6}Z$/)) {
const formatted = timestamp.replace(
/^(\d{4})(\d{2})(\d{2})T(\d{2})(\d{2})(\d{2})Z$/,
@@ -98,14 +58,12 @@ export function BackupRestoreModal({
);
return new Date(formatted).toLocaleString();
}
// Try standard parsing for other formats
return new Date(timestamp).toLocaleString();
} catch {
return timestamp;
}
};
// Get relative time
const getRelativeTime = (timestamp: string) => {
try {
let date;
@@ -144,10 +102,8 @@ export function BackupRestoreModal({
</DialogTitle>
<DialogDescription>
{mode === 'backup'
? `Create a backup of the ${initialAppName} application data.`
: initialAppName
? `Select a backup to restore for ${initialAppName}.`
: 'Select an application and backup to restore.'}
? `Create a backup of the ${appName} application data.`
: `Select a backup to restore for ${appName}.`}
</DialogDescription>
</DialogHeader>
@@ -160,92 +116,54 @@ export function BackupRestoreModal({
</p>
</div>
) : (
<div className="space-y-4">
{/* App Selector (only when no app pre-selected) */}
{!initialAppName && (
<div className="space-y-2">
<Label htmlFor="app-select">Application</Label>
<Select value={selectedApp} onValueChange={setSelectedApp}>
<SelectTrigger id="app-select">
<SelectValue placeholder="Select an application" />
</SelectTrigger>
<SelectContent>
{isLoadingApps ? (
<div className="flex items-center justify-center p-2">
<Loader2 className="h-4 w-4 animate-spin" />
<span className="ml-2 text-sm">Loading apps...</span>
</div>
) : deployedApps?.length === 0 ? (
<div className="p-2 text-sm text-muted-foreground text-center">
No apps with backups
</div>
) : (
deployedApps?.filter((app: any) => app.status === 'deployed').map((app: any) => (
<SelectItem key={app.name} value={app.name}>
<div className="flex items-center gap-2">
<Package className="h-4 w-4" />
{app.name}
</div>
</SelectItem>
))
)}
</SelectContent>
</Select>
<div className="space-y-2">
<Label>Select Backup</Label>
{isLoading ? (
<div className="flex items-center justify-center py-8">
<Loader2 className="h-8 w-8 animate-spin text-muted-foreground" />
</div>
)}
{/* Backup List */}
{(selectedApp || initialAppName) && (
<div className="space-y-2">
<Label>Select Backup</Label>
{isLoadingData ? (
<div className="flex items-center justify-center py-8">
<Loader2 className="h-8 w-8 animate-spin text-muted-foreground" />
</div>
) : backupsToShow.length === 0 ? (
<div className="text-center py-8 bg-muted rounded-lg">
<AlertCircle className="h-12 w-12 text-muted-foreground mx-auto mb-4" />
<p className="text-sm font-medium">No backups available</p>
<p className="text-xs text-muted-foreground mt-1">
Create a backup first before you can restore
</p>
</div>
) : (
<div className="space-y-2 max-h-72 overflow-y-auto pr-2">
{backupsToShow.map((backup) => (
<button
key={backup.timestamp}
onClick={() => setSelectedBackupTimestamp(backup.timestamp)}
className={`w-full p-3 rounded-lg border text-left transition-all hover:shadow-md ${
selectedBackupTimestamp === backup.timestamp
? 'border-primary bg-primary/10 ring-2 ring-primary/20'
: 'border-border hover:bg-accent/50'
}`}
>
<div className="flex items-center justify-between mb-1">
<div className="flex items-center gap-2">
<Clock className="h-4 w-4 text-muted-foreground" />
<span className="text-sm font-medium">
{getRelativeTime(backup.timestamp)}
</span>
</div>
{selectedBackupTimestamp === backup.timestamp && (
<CheckCircle className="h-4 w-4 text-primary" />
)}
</div>
<div className="flex items-center gap-3 text-xs text-muted-foreground">
{backup.size && (
<span className="flex items-center gap-1">
<HardDrive className="h-3 w-3" />
{backup.size}
</span>
)}
<span>{formatTimestamp(backup.timestamp)}</span>
</div>
</button>
))}
</div>
)}
) : backups.length === 0 ? (
<div className="text-center py-8 bg-muted rounded-lg">
<AlertCircle className="h-12 w-12 text-muted-foreground mx-auto mb-4" />
<p className="text-sm font-medium">No backups available</p>
<p className="text-xs text-muted-foreground mt-1">
Create a backup first before you can restore
</p>
</div>
) : (
<div className="space-y-2 max-h-72 overflow-y-auto pr-2">
{backups.map((backup) => (
<button
key={backup.timestamp}
onClick={() => setSelectedBackupTimestamp(backup.timestamp)}
className={`w-full p-3 rounded-lg border text-left transition-all hover:shadow-md ${
selectedBackupTimestamp === backup.timestamp
? 'border-primary bg-primary/10 ring-2 ring-primary/20'
: 'border-border hover:bg-accent/50'
}`}
>
<div className="flex items-center justify-between mb-1">
<div className="flex items-center gap-2">
<Clock className="h-4 w-4 text-muted-foreground" />
<span className="text-sm font-medium">
{getRelativeTime(backup.timestamp)}
</span>
</div>
{selectedBackupTimestamp === backup.timestamp && (
<CheckCircle className="h-4 w-4 text-primary" />
)}
</div>
<div className="flex items-center gap-3 text-xs text-muted-foreground">
{backup.size && (
<span className="flex items-center gap-1">
<HardDrive className="h-3 w-3" />
{backup.size}
</span>
)}
<span>{formatTimestamp(backup.timestamp)}</span>
</div>
</button>
))}
</div>
)}
</div>
@@ -260,7 +178,7 @@ export function BackupRestoreModal({
onClick={handleConfirm}
disabled={
isPending ||
(mode === 'restore' && (!selectedBackupTimestamp || !selectedApp || backupsToShow.length === 0))
(mode === 'restore' && (!selectedBackupTimestamp || backups.length === 0))
}
>
{isPending ? (
@@ -278,4 +196,4 @@ export function BackupRestoreModal({
</DialogContent>
</Dialog>
);
}
}

View File

@@ -12,7 +12,7 @@ import {
DialogHeader,
DialogTitle,
} from './ui/dialog';
import { HardDrive, Settings, Clock, CheckCircle, BookOpen, ExternalLink, Loader2, AlertCircle, Database, FolderTree, Mail, Router, Edit2, Check, X, XCircle, Play, RotateCw, Copy, ChevronDown, ChevronUp, Edit, ArrowUpCircle, Terminal } from 'lucide-react';
import { HardDrive, Settings, CheckCircle, BookOpen, ExternalLink, Loader2, AlertCircle, FolderTree, Mail, Router, Edit2, Check, X, XCircle, Play, RotateCw, Copy, ChevronDown, ChevronUp, Edit, ArrowUpCircle, Terminal } from 'lucide-react';
import { Badge } from './ui/badge';
import { useCentralStatus } from '../hooks/useCentralStatus';
import { useInstanceConfig, useInstanceContext, useConfig } from '../hooks';
@@ -280,23 +280,6 @@ export function CentralComponent() {
),
});
const formatUptime = (seconds?: number) => {
if (!seconds) return 'Unknown';
const days = Math.floor(seconds / 86400);
const hours = Math.floor((seconds % 86400) / 3600);
const minutes = Math.floor((seconds % 3600) / 60);
const secs = Math.floor(seconds % 60);
const parts = [];
if (days > 0) parts.push(`${days}d`);
if (hours > 0) parts.push(`${hours}h`);
if (minutes > 0) parts.push(`${minutes}m`);
if (secs > 0 || parts.length === 0) parts.push(`${secs}s`);
return parts.join(' ');
};
// Show error state
if (statusError) {
return (

View File

@@ -43,6 +43,7 @@ export function ClusterNodesComponent() {
updateNode,
applyNode,
isApplying,
rebootNode,
refetch
} = useNodes(currentInstance);
@@ -90,6 +91,7 @@ export function ClusterNodesComponent() {
const [detectError, setDetectError] = useState<string | null>(null);
const [discoverSuccess, setDiscoverSuccess] = useState<string | null>(null);
const [deleteNodeTarget, setDeleteNodeTarget] = useState<string | null>(null);
const [rebootNodeTarget, setRebootNodeTarget] = useState<string | null>(null);
const [showBootstrapModal, setShowBootstrapModal] = useState(false);
const [bootstrapNode, setBootstrapNode] = useState<{ name: string; ip: string } | null>(null);
const [drawerState, setDrawerState] = useState<{
@@ -285,6 +287,18 @@ export function ClusterNodesComponent() {
await deleteNode(hostname);
};
const handleRebootNode = (hostname: string) => {
setRebootNodeTarget(hostname);
};
const confirmRebootNode = async () => {
if (!rebootNodeTarget) return;
const hostname = rebootNodeTarget;
setRebootNodeTarget(null);
closeDrawer();
rebootNode(hostname);
};
const handleDiscover = () => {
setDiscoverError(null);
setDiscoverSuccess(null);
@@ -595,10 +609,30 @@ export function ClusterNodesComponent() {
onDelete={drawerState.mode === 'configure' && drawerState.node ? async () => {
handleDeleteNode(drawerState.node!.hostname);
} : undefined}
onReboot={drawerState.mode === 'configure' && drawerState.node ? () => {
handleRebootNode(drawerState.node!.hostname);
} : undefined}
instanceName={currentInstance || ''}
/>
)}
<AlertDialog open={!!rebootNodeTarget} onOpenChange={(open) => { if (!open) setRebootNodeTarget(null); }}>
<AlertDialogContent>
<AlertDialogHeader>
<AlertDialogTitle>Reboot node</AlertDialogTitle>
<AlertDialogDescription>
This will reboot node {rebootNodeTarget}. The node will restart and rejoin the cluster automatically. Running workloads on this node will be interrupted.
</AlertDialogDescription>
</AlertDialogHeader>
<AlertDialogFooter>
<AlertDialogCancel>Cancel</AlertDialogCancel>
<AlertDialogAction onClick={confirmRebootNode}>
Reboot
</AlertDialogAction>
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
<AlertDialog open={!!deleteNodeTarget} onOpenChange={(open) => { if (!open) setDeleteNodeTarget(null); }}>
<AlertDialogContent>
<AlertDialogHeader>

View File

@@ -36,6 +36,7 @@ import { useAppEnhanced, useAppReadme, useAppEvents, useAppLogs, useAppManifests
import { apiClient } from '@/services/api/client';
import { appsApi } from '@/services/api/apps';
import { operationsApi } from '@/services/api';
import type { BackupResourceInfo } from '@/services/api/backups';
interface AppDetailPanelProps {
instanceName: string;
@@ -79,7 +80,7 @@ export function AppDetailPanel({
pod?: string;
container?: string;
}>({ tail: 100 });
const [backupResources, setBackupResources] = useState<any[]>([]);
const [backupResources, setBackupResources] = useState<BackupResourceInfo[]>([]);
const [loadingBackupResources, setLoadingBackupResources] = useState(false);
const [hasLoadedBackupResources, setHasLoadedBackupResources] = useState(false);
const [activeTab, setActiveTab] = useState('overview');
@@ -213,7 +214,7 @@ export function AppDetailPanel({
setLoadingBackupResources(true);
apiClient.get(`/api/v1/instances/${instanceName}/apps/${appName}/backup/discover`)
.then((response) => {
const data = response as { data?: { resources?: any[] } };
const data = response as { data?: { resources?: BackupResourceInfo[] } };
// Handle both empty array and actual resources
const resources = data.data?.resources || [];
setBackupResources(resources);

View File

@@ -26,7 +26,7 @@ interface BackupDetailsModalProps {
backup: RecoveryPlan | null;
isOpen: boolean;
onClose: () => void;
onRestore: (backup: RecoveryPlan) => void;
onRestore?: (backup: RecoveryPlan) => void;
}
export function BackupDetailsModal({
@@ -222,7 +222,7 @@ export function BackupDetailsModal({
</div>
<DialogFooter className="gap-2">
{backup.status === 'backed_up' && (
{onRestore && backup.status === 'backed_up' && (
<Button
onClick={() => {
onRestore(backup);

Some files were not shown because too many files have changed in this diff Show More