Restore strategies.

This commit is contained in:
2026-05-25 23:09:39 +00:00
parent a533082388
commit c1ddf46f44
5 changed files with 592 additions and 50 deletions

View File

@@ -14,6 +14,11 @@ type ConfigItem struct {
Value interface{} `json:"value" yaml:"value"`
}
// ManifestBackupConfig declares backup behavior for an app
type ManifestBackupConfig struct {
RestoreMode string `json:"restoreMode,omitempty" yaml:"restoreMode,omitempty"` // "in-place" or "" (default: standby blue/green)
}
// AppManifest represents the complete app manifest from manifest.yaml
type AppManifest struct {
Name string `json:"name" yaml:"name"`
@@ -32,6 +37,7 @@ type AppManifest struct {
Scripts []Script `json:"scripts,omitempty" yaml:"scripts,omitempty"`
Deploy *DeployConfig `json:"deploy,omitempty" yaml:"deploy,omitempty"`
Upgrade *UpgradeConfig `json:"upgrade,omitempty" yaml:"upgrade,omitempty"`
Backup *ManifestBackupConfig `json:"backup,omitempty" yaml:"backup,omitempty"`
}
// DeployConfig declares deployment behavior in the manifest, replacing install.sh scripts

View File

@@ -188,12 +188,18 @@ func (m *Manager) BackupApp(instanceName, appName string) (*RecoveryPlan, error)
activeNamespace = btypes.ColoredName(appName, activeColor)
}
restoreMode := ""
if manifest.Backup != nil && manifest.Backup.RestoreMode == "in-place" {
restoreMode = "in-place"
}
plan := &RecoveryPlan{
App: appName,
Instance: instanceName,
Timestamp: timestamp,
Version: manifest.Version,
Status: "backing_up",
App: appName,
Instance: instanceName,
Timestamp: timestamp,
Version: manifest.Version,
Status: "backing_up",
RestoreMode: restoreMode,
Source: btypes.RecoverySource{
ActiveColor: activeColor,
Namespace: activeNamespace,
@@ -280,18 +286,26 @@ func (m *Manager) RestoreApp(instanceName, appName string, opts RestoreOptions)
// Compute standby targets
plan.Status = "restoring"
standbyNamespace := btypes.ColoredName(appName, plan.StandbyColor)
standbyAppDir := filepath.Join("instances", instanceName, "apps", standbyNamespace)
plan.Standby = btypes.RecoveryStandby{
Namespace: standbyNamespace,
AppDir: standbyAppDir,
if plan.RestoreMode == "in-place" {
// In-place: restore data to original namespace, no colored standby
plan.Standby = btypes.RecoveryStandby{
Namespace: plan.Source.Namespace,
AppDir: plan.Source.AppDir,
}
} else {
standbyNamespace := btypes.ColoredName(appName, plan.StandbyColor)
standbyAppDir := filepath.Join("instances", instanceName, "apps", standbyNamespace)
plan.Standby = btypes.RecoveryStandby{
Namespace: standbyNamespace,
AppDir: standbyAppDir,
}
}
now := time.Now()
plan.Phases["restore"] = PhaseTime{StartedAt: &now}
m.reportProgress(40, fmt.Sprintf("Restoring to %s namespace", standbyNamespace))
m.reportProgress(40, fmt.Sprintf("Restoring to %s namespace", plan.Standby.Namespace))
progressStart := 40
progressEnd := 80
@@ -325,13 +339,16 @@ func (m *Manager) RestoreApp(instanceName, appName string, opts RestoreOptions)
}
}
// Deploy standby namespace
m.reportProgress(85, "Deploying app to standby namespace")
if err := m.deployToStandbyNamespace(instanceName, appName, plan); err != nil {
plan.Status = "failed"
plan.Error = fmt.Sprintf("deploy to standby failed: %v", err)
_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
return plan, fmt.Errorf("failed to deploy to standby namespace: %w", err)
// For standby mode: deploy app to the colored standby namespace.
// For in-place mode: skip — the Longhorn Switch phase handles the data swap.
if plan.RestoreMode != "in-place" {
m.reportProgress(85, "Deploying app to standby namespace")
if err := m.deployToStandbyNamespace(instanceName, appName, plan); err != nil {
plan.Status = "failed"
plan.Error = fmt.Sprintf("deploy to standby failed: %v", err)
_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
return plan, fmt.Errorf("failed to deploy to standby namespace: %w", err)
}
}
plan.Status = "restored"
@@ -435,37 +452,41 @@ func (m *Manager) CleanupApp(instanceName, appName string) (*RecoveryPlan, error
}
}
// Remove previous active namespace
m.reportProgress(80, "Removing previous namespace")
previousNamespace := plan.Source.Namespace
kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
if previousNamespace != "" && previousNamespace != appName {
// Delete colored namespaces entirely
deleteCmd := exec.Command("kubectl", "delete", "namespace", previousNamespace, "--ignore-not-found", "--timeout=30s")
tools.WithKubeconfig(deleteCmd, kubeconfigPath)
if output, err := deleteCmd.CombinedOutput(); err != nil {
slog.Error("failed to delete previous namespace", "component", "backup", "namespace", previousNamespace, "error", err, "output", string(output))
// For standby mode: remove the previous active namespace and app directory.
// For in-place mode: the Longhorn Switch phase already cleaned up the old PVC/PV/volume;
// the service's namespace is the same throughout, so nothing to delete here.
if plan.RestoreMode != "in-place" {
m.reportProgress(80, "Removing previous namespace")
previousNamespace := plan.Source.Namespace
kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
if previousNamespace != "" && previousNamespace != appName {
// Delete colored namespaces entirely
deleteCmd := exec.Command("kubectl", "delete", "namespace", previousNamespace, "--ignore-not-found", "--timeout=30s")
tools.WithKubeconfig(deleteCmd, kubeconfigPath)
if output, err := deleteCmd.CombinedOutput(); err != nil {
slog.Error("failed to delete previous namespace", "component", "backup", "namespace", previousNamespace, "error", err, "output", string(output))
}
} else if previousNamespace == appName {
// For the bare namespace (first restore), scale deployments to zero
// instead of deleting — keeps the namespace for future non-restore deploys
scaleCmd := exec.Command("kubectl", "scale", "deployment", "--all", "--replicas=0", "-n", previousNamespace)
tools.WithKubeconfig(scaleCmd, kubeconfigPath)
if output, err := scaleCmd.CombinedOutput(); err != nil {
slog.Error("failed to scale down previous deployments", "component", "backup", "namespace", previousNamespace, "error", err, "output", string(output))
}
}
} else if previousNamespace == appName {
// For the bare namespace (first restore), scale deployments to zero
// instead of deleting — keeps the namespace for future non-restore deploys
scaleCmd := exec.Command("kubectl", "scale", "deployment", "--all", "--replicas=0", "-n", previousNamespace)
tools.WithKubeconfig(scaleCmd, kubeconfigPath)
if output, err := scaleCmd.CombinedOutput(); err != nil {
slog.Error("failed to scale down previous deployments", "component", "backup", "namespace", previousNamespace, "error", err, "output", string(output))
}
}
// Remove previous active app directory
previousAppDir := plan.Source.AppDir
if previousAppDir != "" {
absPath := previousAppDir
if !filepath.IsAbs(absPath) {
absPath = filepath.Join(m.dataDir, absPath)
}
// Only remove if it's a colored directory (not the bare app dir)
if strings.Contains(filepath.Base(absPath), "-") {
os.RemoveAll(absPath)
// Remove previous active app directory
previousAppDir := plan.Source.AppDir
if previousAppDir != "" {
absPath := previousAppDir
if !filepath.IsAbs(absPath) {
absPath = filepath.Join(m.dataDir, absPath)
}
// Only remove if it's a colored directory (not the bare app dir)
if strings.Contains(filepath.Base(absPath), "-") {
os.RemoveAll(absPath)
}
}
}

View File

@@ -808,6 +808,253 @@ func TestUpdatePVCVolumeBindingsNoLonghornStrategy(t *testing.T) {
assert.NoError(t, err)
}
func TestBackupAppInPlaceRestoreMode(t *testing.T) {
t.Run("sets RestoreMode to in-place when manifest declares it", func(t *testing.T) {
tempDir := t.TempDir()
instanceName := "test-instance"
appName := "postgres"
instanceDir := filepath.Join(tempDir, "instances", instanceName)
appsDir := filepath.Join(instanceDir, "apps", appName)
backupsDir := filepath.Join(instanceDir, "backups")
require.NoError(t, os.MkdirAll(appsDir, 0755))
require.NoError(t, os.MkdirAll(backupsDir, 0755))
manifestContent := `
name: postgres
description: PostgreSQL database
version: 1.0.0-2
backup:
restoreMode: in-place
defaultConfig:
namespace: postgres
`
require.NoError(t, os.WriteFile(filepath.Join(appsDir, "manifest.yaml"), []byte(manifestContent), 0644))
configContent := `
backup:
destination:
type: local
local:
path: ` + backupsDir + `
`
require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte(configContent), 0644))
mgr := NewManager(tempDir)
mgr.strategies = map[string]Strategy{
"config": &MockStrategy{Name_: "config"},
}
plan, err := mgr.BackupApp(instanceName, appName)
require.NoError(t, err)
assert.Equal(t, "in-place", plan.RestoreMode)
})
t.Run("leaves RestoreMode empty when manifest has no backup block", func(t *testing.T) {
tempDir := t.TempDir()
instanceName := "test-instance"
appName := "gitea"
instanceDir := filepath.Join(tempDir, "instances", instanceName)
appsDir := filepath.Join(instanceDir, "apps", appName)
backupsDir := filepath.Join(instanceDir, "backups")
require.NoError(t, os.MkdirAll(appsDir, 0755))
require.NoError(t, os.MkdirAll(backupsDir, 0755))
manifestContent := `
name: gitea
description: Gitea
version: 1.0.0
defaultConfig:
namespace: gitea
`
require.NoError(t, os.WriteFile(filepath.Join(appsDir, "manifest.yaml"), []byte(manifestContent), 0644))
configContent := `
backup:
destination:
type: local
local:
path: ` + backupsDir + `
`
require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte(configContent), 0644))
mgr := NewManager(tempDir)
mgr.strategies = map[string]Strategy{
"config": &MockStrategy{Name_: "config"},
}
plan, err := mgr.BackupApp(instanceName, appName)
require.NoError(t, err)
assert.Equal(t, "", plan.RestoreMode)
})
}
func TestRestoreAppInPlace(t *testing.T) {
tempDir := t.TempDir()
instanceName := "test-instance"
appName := "postgres"
timestamp := "20240101T120000Z"
instanceDir := filepath.Join(tempDir, "instances", instanceName)
backupDir := filepath.Join(instanceDir, "backups", appName, timestamp)
require.NoError(t, os.MkdirAll(backupDir, 0755))
// Config with backup destination
configContent := `
backup:
destination:
type: local
local:
path: ` + filepath.Join(instanceDir, "backups") + `
`
require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte(configContent), 0644))
// Saved plan with in-place mode and backed_up status
plan := &btypes.RecoveryPlan{
App: appName,
Instance: instanceName,
Timestamp: timestamp,
Status: "backed_up",
RestoreMode: "in-place",
Source: btypes.RecoverySource{
ActiveColor: "blue",
Namespace: appName,
AppDir: filepath.Join("instances", instanceName, "apps", appName),
},
StandbyColor: "green",
Strategies: []btypes.StrategyEntry{
{Name: "config", Status: "backed_up"},
},
Phases: map[string]btypes.PhaseTime{},
}
data, _ := yaml.Marshal(plan)
require.NoError(t, os.WriteFile(filepath.Join(backupDir, "recovery-plan.yaml"), data, 0600))
mgr := NewManager(tempDir)
mgr.strategies = map[string]Strategy{
"config": &MockStrategy{Name_: "config"},
}
restored, err := mgr.RestoreApp(instanceName, appName, RestoreOptions{})
require.NoError(t, err)
assert.Equal(t, "restored", restored.Status)
// In-place: standby namespace is the source namespace (not a colored standby)
assert.Equal(t, appName, restored.Standby.Namespace)
assert.Equal(t, plan.Source.AppDir, restored.Standby.AppDir)
}
func TestSwitchAppInPlaceUpdatesActiveDeployment(t *testing.T) {
tempDir := t.TempDir()
instanceName := "test-instance"
appName := "postgres"
timestamp := "20240101T120000Z"
instanceDir := filepath.Join(tempDir, "instances", instanceName)
backupDir := filepath.Join(instanceDir, "backups", appName, timestamp)
require.NoError(t, os.MkdirAll(backupDir, 0755))
// Config with backup destination and existing app config
configContent := `
apps:
postgres:
namespace: postgres
backup:
destination:
type: local
local:
path: ` + filepath.Join(instanceDir, "backups") + `
`
require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte(configContent), 0644))
// Saved plan in restored state with in-place mode
plan := &btypes.RecoveryPlan{
App: appName,
Instance: instanceName,
Timestamp: timestamp,
Status: "restored",
RestoreMode: "in-place",
StandbyColor: "green",
Source: btypes.RecoverySource{
ActiveColor: "blue",
Namespace: appName,
},
Standby: btypes.RecoveryStandby{
Namespace: appName,
},
Strategies: []btypes.StrategyEntry{
{Name: "config", Status: "restored"},
},
Phases: map[string]btypes.PhaseTime{},
}
data, _ := yaml.Marshal(plan)
require.NoError(t, os.WriteFile(filepath.Join(backupDir, "recovery-plan.yaml"), data, 0600))
mgr := NewManager(tempDir)
mgr.strategies = map[string]Strategy{
"config": &MockStrategy{Name_: "config"},
}
switched, err := mgr.SwitchApp(instanceName, appName)
require.NoError(t, err)
assert.Equal(t, "switched", switched.Status)
// activeDeployment should be updated even for in-place (tracks color for next restore)
color := mgr.getActiveDeployment(instanceName, appName)
assert.Equal(t, "green", color)
}
func TestCleanupAppInPlaceSkipsNamespaceDeletion(t *testing.T) {
tempDir := t.TempDir()
instanceName := "test-instance"
appName := "postgres"
timestamp := "20240101T120000Z"
instanceDir := filepath.Join(tempDir, "instances", instanceName)
backupDir := filepath.Join(instanceDir, "backups", appName, timestamp)
require.NoError(t, os.MkdirAll(backupDir, 0755))
configContent := `
backup:
destination:
type: local
local:
path: ` + filepath.Join(instanceDir, "backups") + `
`
require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte(configContent), 0644))
// Saved plan in switched state with in-place mode
plan := &btypes.RecoveryPlan{
App: appName,
Instance: instanceName,
Timestamp: timestamp,
Status: "switched",
RestoreMode: "in-place",
StandbyColor: "green",
Source: btypes.RecoverySource{
ActiveColor: "blue",
Namespace: appName,
},
Strategies: []btypes.StrategyEntry{
{Name: "config", Status: "switched"},
},
Phases: map[string]btypes.PhaseTime{},
}
data, _ := yaml.Marshal(plan)
require.NoError(t, os.WriteFile(filepath.Join(backupDir, "recovery-plan.yaml"), data, 0600))
mgr := NewManager(tempDir)
mgr.strategies = map[string]Strategy{
"config": &MockStrategy{Name_: "config"},
}
// Should succeed without trying to run kubectl (no cluster access needed)
cleaned, err := mgr.CleanupApp(instanceName, appName)
require.NoError(t, err)
assert.Equal(t, "cleaned_up", cleaned.Status)
}
func TestIsDbNameEnvVar(t *testing.T) {
tests := []struct {
envName string

View File

@@ -203,7 +203,8 @@ func (l *LonghornNativeStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.
return nil
}
// Switch records previous active volume names
// Switch either records previous active volume names (standby mode) or performs
// an in-place PVC swap: scale down → delete old PVC/PV/volume → create new PV+PVC → scale up.
func (l *LonghornNativeStrategy) Switch(plan *btypes.RecoveryPlan) error {
entry := plan.GetStrategyEntry("longhorn-native")
if entry == nil {
@@ -211,6 +212,15 @@ func (l *LonghornNativeStrategy) Switch(plan *btypes.RecoveryPlan) error {
}
entry.Status = "switching"
if plan.RestoreMode == "in-place" {
if err := l.switchInPlace(plan, entry); err != nil {
return err
}
entry.Status = "switched"
return nil
}
// Standby mode: record previous active volume names for Cleanup phase
previousVolumes := []map[string]any{}
if volumeParams, ok := entry.Params["volumes"].([]any); ok {
for _, vp := range volumeParams {
@@ -230,7 +240,136 @@ func (l *LonghornNativeStrategy) Switch(plan *btypes.RecoveryPlan) error {
return nil
}
// Cleanup deletes the previous active-color Longhorn volumes
// switchInPlace performs a PVC swap in the original namespace:
// scale down → delete old PVC/PV/Longhorn volume → create new PV+PVC → scale up.
func (l *LonghornNativeStrategy) switchInPlace(plan *btypes.RecoveryPlan, entry *btypes.StrategyEntry) error {
kubeconfigPath := tools.GetKubeconfigPath(l.dataDir, plan.Instance)
namespace := plan.Source.Namespace
restoreVolumes, ok := entry.Restore["volumes"].([]any)
if !ok || len(restoreVolumes) == 0 {
// No PVCs to swap (e.g., memcached)
slog.Info("no volumes to swap for in-place restore, skipping PVC swap", "component", "longhorn", "namespace", namespace)
return l.bounceNamespace(kubeconfigPath, namespace)
}
// Build a map of pvcName → restored volume name for quick lookup
restoredVolumeByPVC := map[string]string{}
for _, rv := range restoreVolumes {
if rvMap, ok := rv.(map[string]any); ok {
pvcName, _ := rvMap["pvcName"].(string)
volumeName, _ := rvMap["volumeName"].(string)
if pvcName != "" && volumeName != "" {
restoredVolumeByPVC[pvcName] = volumeName
}
}
}
// Collect PVC metadata from Params for size/accessMode
pvcParams := map[string]map[string]any{}
if volumeParams, ok := entry.Params["volumes"].([]any); ok {
for _, vp := range volumeParams {
if vpMap, ok := vp.(map[string]any); ok {
if pvcName, ok := vpMap["pvcName"].(string); ok {
pvcParams[pvcName] = vpMap
}
}
}
}
// Step 1: Scale down all workloads in the namespace
slog.Info("scaling down workloads for in-place restore", "component", "longhorn", "namespace", namespace)
if err := l.scaleNamespace(kubeconfigPath, namespace, 0); err != nil {
return fmt.Errorf("failed to scale down namespace %s: %w", namespace, err)
}
if err := l.waitForPodsGone(kubeconfigPath, namespace); err != nil {
return fmt.Errorf("timed out waiting for pods to terminate in %s: %w", namespace, err)
}
// Step 2: For each PVC, record old PV/volume then swap
previousItems := []map[string]any{}
for pvcName, restoredVolumeName := range restoredVolumeByPVC {
// Get current PV name and Longhorn volume handle
pvName, longhornVolume, err := l.getPVInfo(kubeconfigPath, namespace, pvcName)
if err != nil {
slog.Error("failed to get PV info, skipping PVC", "component", "longhorn", "pvc", pvcName, "error", err)
}
previousItems = append(previousItems, map[string]any{
"pvcName": pvcName,
"pvName": pvName,
"longhornVolume": longhornVolume,
})
// Delete the PVC (may cascade to PV deletion if Delete reclaimPolicy)
slog.Info("deleting PVC for in-place swap", "component", "longhorn", "pvc", pvcName, "namespace", namespace)
deleteCmd := exec.Command("kubectl", "delete", "pvc", pvcName, "-n", namespace, "--ignore-not-found", "--timeout=60s")
tools.WithKubeconfig(deleteCmd, kubeconfigPath)
if output, err := deleteCmd.CombinedOutput(); err != nil {
return fmt.Errorf("failed to delete PVC %s: %w, output: %s", pvcName, err, output)
}
// Clean up old PV if it still exists (Retain policy from a prior in-place restore)
if pvName != "" {
if err := l.deletePVIfExists(kubeconfigPath, pvName); err != nil {
slog.Error("failed to delete old PV", "component", "longhorn", "pv", pvName, "error", err)
}
}
// Clean up old Longhorn volume if it still exists and was a restore volume
// (colored name pattern: pvcName-blue or pvcName-green)
if longhornVolume != "" && (strings.HasSuffix(longhornVolume, "-blue") || strings.HasSuffix(longhornVolume, "-green")) {
if err := l.deleteLonghornVolumeIfExists(kubeconfigPath, longhornVolume); err != nil {
slog.Error("failed to delete old Longhorn volume", "component", "longhorn", "volume", longhornVolume, "error", err)
}
}
// Get size and access mode for the new PVC
size := "10Gi"
accessMode := "ReadWriteOnce"
if params, ok := pvcParams[pvcName]; ok {
if s, ok := params["size"].(string); ok && s != "" {
size = s
}
if m, ok := params["accessMode"].(string); ok && m != "" {
accessMode = m
}
}
// Create new PV pointing to the restored Longhorn volume
slog.Info("creating new PV for restored volume", "component", "longhorn", "volume", restoredVolumeName, "pvc", pvcName)
if err := l.createPVForVolume(kubeconfigPath, restoredVolumeName, size, accessMode, namespace, pvcName); err != nil {
return fmt.Errorf("failed to create PV for volume %s: %w", restoredVolumeName, err)
}
// Create new PVC bound to the new PV
slog.Info("creating new PVC bound to restored volume", "component", "longhorn", "pvc", pvcName, "volume", restoredVolumeName)
if err := l.createPVC(kubeconfigPath, pvcName, namespace, restoredVolumeName, size, accessMode); err != nil {
return fmt.Errorf("failed to create PVC %s: %w", pvcName, err)
}
// Wait for PVC to be Bound
if err := l.waitForPVCBound(kubeconfigPath, namespace, pvcName); err != nil {
return fmt.Errorf("PVC %s did not reach Bound state: %w", pvcName, err)
}
}
entry.Switch = map[string]any{
"previousItems": previousItems,
}
// Step 3: Scale back up
slog.Info("scaling workloads back up after in-place restore", "component", "longhorn", "namespace", namespace)
if err := l.scaleNamespace(kubeconfigPath, namespace, 1); err != nil {
return fmt.Errorf("failed to scale up namespace %s: %w", namespace, err)
}
return nil
}
// Cleanup deletes the previous active-color Longhorn volumes (standby mode).
// For in-place mode this is a no-op — the old PVC/PV/volume were cleaned up during Switch.
func (l *LonghornNativeStrategy) Cleanup(plan *btypes.RecoveryPlan) error {
entry := plan.GetStrategyEntry("longhorn-native")
if entry == nil {
@@ -238,7 +377,8 @@ func (l *LonghornNativeStrategy) Cleanup(plan *btypes.RecoveryPlan) error {
}
entry.Status = "cleaning_up"
// Skip automatic volume cleanup — volumes may still be referenced by PVCs.
// In-place: nothing to clean up here (handled during Switch)
// Standby: skip automatic volume cleanup — volumes may still be referenced by PVCs.
// Manual cleanup or a separate garbage collection process is safer.
entry.Status = "cleaned_up"
@@ -610,3 +750,130 @@ func (l *LonghornNativeStrategy) waitForVolume(kubeconfigPath, volumeName string
func (l *LonghornNativeStrategy) cleanupOldBackups(_, _, _ string) error {
return nil
}
// bounceNamespace scales a namespace down then back up (used for stateless in-place restores like memcached).
func (l *LonghornNativeStrategy) bounceNamespace(kubeconfigPath, namespace string) error {
if err := l.scaleNamespace(kubeconfigPath, namespace, 0); err != nil {
return fmt.Errorf("failed to scale down namespace %s: %w", namespace, err)
}
if err := l.waitForPodsGone(kubeconfigPath, namespace); err != nil {
return fmt.Errorf("timed out waiting for pods to terminate in %s: %w", namespace, err)
}
return l.scaleNamespace(kubeconfigPath, namespace, 1)
}
// scaleNamespace sets replicas on all Deployments and StatefulSets in a namespace.
func (l *LonghornNativeStrategy) scaleNamespace(kubeconfigPath, namespace string, replicas int) error {
for _, kind := range []string{"deployment", "statefulset"} {
cmd := exec.Command("kubectl", "scale", kind, "--all", "-n", namespace,
fmt.Sprintf("--replicas=%d", replicas))
tools.WithKubeconfig(cmd, kubeconfigPath)
if output, err := cmd.CombinedOutput(); err != nil {
if !strings.Contains(string(output), "no resources found") {
return fmt.Errorf("failed to scale %s in %s: %w, output: %s", kind, namespace, err, output)
}
}
}
return nil
}
// waitForPodsGone waits until all pods in a namespace have terminated.
func (l *LonghornNativeStrategy) waitForPodsGone(kubeconfigPath, namespace string) error {
cmd := exec.Command("kubectl", "wait", "--for=delete", "pod", "--all",
"-n", namespace, "--timeout=120s")
tools.WithKubeconfig(cmd, kubeconfigPath)
if output, err := cmd.CombinedOutput(); err != nil {
if strings.Contains(string(output), "no matching resources found") {
return nil
}
return fmt.Errorf("waiting for pods to terminate in %s: %w, output: %s", namespace, err, output)
}
return nil
}
// getPVInfo returns the PV name and Longhorn volume handle for a PVC.
func (l *LonghornNativeStrategy) getPVInfo(kubeconfigPath, namespace, pvcName string) (string, string, error) {
pvNameCmd := exec.Command("kubectl", "get", "pvc", pvcName, "-n", namespace,
"-o", "jsonpath={.spec.volumeName}")
tools.WithKubeconfig(pvNameCmd, kubeconfigPath)
pvNameOutput, err := pvNameCmd.Output()
if err != nil {
return "", "", fmt.Errorf("failed to get PV name for PVC %s: %w", pvcName, err)
}
pvName := strings.TrimSpace(string(pvNameOutput))
if pvName == "" {
return "", "", nil
}
handleCmd := exec.Command("kubectl", "get", "pv", pvName,
"-o", "jsonpath={.spec.csi.volumeHandle}")
tools.WithKubeconfig(handleCmd, kubeconfigPath)
handleOutput, err := handleCmd.Output()
if err != nil {
return pvName, "", fmt.Errorf("failed to get volume handle for PV %s: %w", pvName, err)
}
return pvName, strings.TrimSpace(string(handleOutput)), nil
}
// deletePVIfExists deletes a PersistentVolume if it exists (handles Retain reclaim policy cleanup).
func (l *LonghornNativeStrategy) deletePVIfExists(kubeconfigPath, pvName string) error {
cmd := exec.Command("kubectl", "delete", "pv", pvName, "--ignore-not-found", "--timeout=60s")
tools.WithKubeconfig(cmd, kubeconfigPath)
if output, err := cmd.CombinedOutput(); err != nil {
return fmt.Errorf("failed to delete PV %s: %w, output: %s", pvName, err, output)
}
return nil
}
// deleteLonghornVolumeIfExists deletes a Longhorn Volume CR if it exists.
func (l *LonghornNativeStrategy) deleteLonghornVolumeIfExists(kubeconfigPath, volumeName string) error {
cmd := exec.Command("kubectl", "delete", "volumes.longhorn.io", volumeName,
"-n", "longhorn-system", "--ignore-not-found", "--timeout=60s")
tools.WithKubeconfig(cmd, kubeconfigPath)
if output, err := cmd.CombinedOutput(); err != nil {
return fmt.Errorf("failed to delete Longhorn volume %s: %w, output: %s", volumeName, err, output)
}
return nil
}
// createPVC creates a PVC pre-bound to a specific PV by name.
func (l *LonghornNativeStrategy) createPVC(kubeconfigPath, pvcName, namespace, pvName, size, accessMode string) error {
pvcYAML := fmt.Sprintf(`apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: %s
namespace: %s
spec:
accessModes:
- %s
resources:
requests:
storage: %s
storageClassName: longhorn
volumeName: %s
`, pvcName, namespace, accessMode, size, pvName)
cmd := exec.Command("kubectl", "apply", "-f", "-")
tools.WithKubeconfig(cmd, kubeconfigPath)
cmd.Stdin = strings.NewReader(pvcYAML)
if output, err := cmd.CombinedOutput(); err != nil {
return fmt.Errorf("failed to create PVC %s: %w, output: %s", pvcName, err, output)
}
return nil
}
// waitForPVCBound polls until the PVC reaches Bound status.
func (l *LonghornNativeStrategy) waitForPVCBound(kubeconfigPath, namespace, pvcName string) error {
for range 60 {
cmd := exec.Command("kubectl", "get", "pvc", pvcName, "-n", namespace,
"-o", "jsonpath={.status.phase}")
tools.WithKubeconfig(cmd, kubeconfigPath)
if output, err := cmd.Output(); err == nil && strings.TrimSpace(string(output)) == "Bound" {
return nil
}
time.Sleep(2 * time.Second)
}
return fmt.Errorf("timeout waiting for PVC %s in %s to be Bound", pvcName, namespace)
}

View File

@@ -36,6 +36,7 @@ type RecoveryPlan struct {
Version string `yaml:"version" json:"version,omitempty"`
Status string `yaml:"status" json:"status"` // backing_up, backed_up, restoring, restored, switching, switched, cleaning_up, cleaned_up, failed
Error string `yaml:"error" json:"error,omitempty"`
RestoreMode string `yaml:"restoreMode,omitempty" json:"restoreMode,omitempty"` // "in-place" or "" (default: standby blue/green)
Source RecoverySource `yaml:"source" json:"source"`
StandbyColor string `yaml:"standbyColor" json:"standbyColor"`
Standby RecoveryStandby `yaml:"standby" json:"standby,omitempty"`