Restore strategies.
This commit is contained in:
@@ -14,6 +14,11 @@ type ConfigItem struct {
|
||||
Value interface{} `json:"value" yaml:"value"`
|
||||
}
|
||||
|
||||
// ManifestBackupConfig declares backup behavior for an app
|
||||
type ManifestBackupConfig struct {
|
||||
RestoreMode string `json:"restoreMode,omitempty" yaml:"restoreMode,omitempty"` // "in-place" or "" (default: standby blue/green)
|
||||
}
|
||||
|
||||
// AppManifest represents the complete app manifest from manifest.yaml
|
||||
type AppManifest struct {
|
||||
Name string `json:"name" yaml:"name"`
|
||||
@@ -32,6 +37,7 @@ type AppManifest struct {
|
||||
Scripts []Script `json:"scripts,omitempty" yaml:"scripts,omitempty"`
|
||||
Deploy *DeployConfig `json:"deploy,omitempty" yaml:"deploy,omitempty"`
|
||||
Upgrade *UpgradeConfig `json:"upgrade,omitempty" yaml:"upgrade,omitempty"`
|
||||
Backup *ManifestBackupConfig `json:"backup,omitempty" yaml:"backup,omitempty"`
|
||||
}
|
||||
|
||||
// DeployConfig declares deployment behavior in the manifest, replacing install.sh scripts
|
||||
|
||||
@@ -188,12 +188,18 @@ func (m *Manager) BackupApp(instanceName, appName string) (*RecoveryPlan, error)
|
||||
activeNamespace = btypes.ColoredName(appName, activeColor)
|
||||
}
|
||||
|
||||
restoreMode := ""
|
||||
if manifest.Backup != nil && manifest.Backup.RestoreMode == "in-place" {
|
||||
restoreMode = "in-place"
|
||||
}
|
||||
|
||||
plan := &RecoveryPlan{
|
||||
App: appName,
|
||||
Instance: instanceName,
|
||||
Timestamp: timestamp,
|
||||
Version: manifest.Version,
|
||||
Status: "backing_up",
|
||||
App: appName,
|
||||
Instance: instanceName,
|
||||
Timestamp: timestamp,
|
||||
Version: manifest.Version,
|
||||
Status: "backing_up",
|
||||
RestoreMode: restoreMode,
|
||||
Source: btypes.RecoverySource{
|
||||
ActiveColor: activeColor,
|
||||
Namespace: activeNamespace,
|
||||
@@ -280,18 +286,26 @@ func (m *Manager) RestoreApp(instanceName, appName string, opts RestoreOptions)
|
||||
|
||||
// Compute standby targets
|
||||
plan.Status = "restoring"
|
||||
standbyNamespace := btypes.ColoredName(appName, plan.StandbyColor)
|
||||
standbyAppDir := filepath.Join("instances", instanceName, "apps", standbyNamespace)
|
||||
|
||||
plan.Standby = btypes.RecoveryStandby{
|
||||
Namespace: standbyNamespace,
|
||||
AppDir: standbyAppDir,
|
||||
if plan.RestoreMode == "in-place" {
|
||||
// In-place: restore data to original namespace, no colored standby
|
||||
plan.Standby = btypes.RecoveryStandby{
|
||||
Namespace: plan.Source.Namespace,
|
||||
AppDir: plan.Source.AppDir,
|
||||
}
|
||||
} else {
|
||||
standbyNamespace := btypes.ColoredName(appName, plan.StandbyColor)
|
||||
standbyAppDir := filepath.Join("instances", instanceName, "apps", standbyNamespace)
|
||||
plan.Standby = btypes.RecoveryStandby{
|
||||
Namespace: standbyNamespace,
|
||||
AppDir: standbyAppDir,
|
||||
}
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
plan.Phases["restore"] = PhaseTime{StartedAt: &now}
|
||||
|
||||
m.reportProgress(40, fmt.Sprintf("Restoring to %s namespace", standbyNamespace))
|
||||
m.reportProgress(40, fmt.Sprintf("Restoring to %s namespace", plan.Standby.Namespace))
|
||||
|
||||
progressStart := 40
|
||||
progressEnd := 80
|
||||
@@ -325,13 +339,16 @@ func (m *Manager) RestoreApp(instanceName, appName string, opts RestoreOptions)
|
||||
}
|
||||
}
|
||||
|
||||
// Deploy standby namespace
|
||||
m.reportProgress(85, "Deploying app to standby namespace")
|
||||
if err := m.deployToStandbyNamespace(instanceName, appName, plan); err != nil {
|
||||
plan.Status = "failed"
|
||||
plan.Error = fmt.Sprintf("deploy to standby failed: %v", err)
|
||||
_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
|
||||
return plan, fmt.Errorf("failed to deploy to standby namespace: %w", err)
|
||||
// For standby mode: deploy app to the colored standby namespace.
|
||||
// For in-place mode: skip — the Longhorn Switch phase handles the data swap.
|
||||
if plan.RestoreMode != "in-place" {
|
||||
m.reportProgress(85, "Deploying app to standby namespace")
|
||||
if err := m.deployToStandbyNamespace(instanceName, appName, plan); err != nil {
|
||||
plan.Status = "failed"
|
||||
plan.Error = fmt.Sprintf("deploy to standby failed: %v", err)
|
||||
_ = m.savePlan(instanceName, appName, plan.Timestamp, plan)
|
||||
return plan, fmt.Errorf("failed to deploy to standby namespace: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
plan.Status = "restored"
|
||||
@@ -435,37 +452,41 @@ func (m *Manager) CleanupApp(instanceName, appName string) (*RecoveryPlan, error
|
||||
}
|
||||
}
|
||||
|
||||
// Remove previous active namespace
|
||||
m.reportProgress(80, "Removing previous namespace")
|
||||
previousNamespace := plan.Source.Namespace
|
||||
kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
|
||||
if previousNamespace != "" && previousNamespace != appName {
|
||||
// Delete colored namespaces entirely
|
||||
deleteCmd := exec.Command("kubectl", "delete", "namespace", previousNamespace, "--ignore-not-found", "--timeout=30s")
|
||||
tools.WithKubeconfig(deleteCmd, kubeconfigPath)
|
||||
if output, err := deleteCmd.CombinedOutput(); err != nil {
|
||||
slog.Error("failed to delete previous namespace", "component", "backup", "namespace", previousNamespace, "error", err, "output", string(output))
|
||||
// For standby mode: remove the previous active namespace and app directory.
|
||||
// For in-place mode: the Longhorn Switch phase already cleaned up the old PVC/PV/volume;
|
||||
// the service's namespace is the same throughout, so nothing to delete here.
|
||||
if plan.RestoreMode != "in-place" {
|
||||
m.reportProgress(80, "Removing previous namespace")
|
||||
previousNamespace := plan.Source.Namespace
|
||||
kubeconfigPath := tools.GetKubeconfigPath(m.dataDir, instanceName)
|
||||
if previousNamespace != "" && previousNamespace != appName {
|
||||
// Delete colored namespaces entirely
|
||||
deleteCmd := exec.Command("kubectl", "delete", "namespace", previousNamespace, "--ignore-not-found", "--timeout=30s")
|
||||
tools.WithKubeconfig(deleteCmd, kubeconfigPath)
|
||||
if output, err := deleteCmd.CombinedOutput(); err != nil {
|
||||
slog.Error("failed to delete previous namespace", "component", "backup", "namespace", previousNamespace, "error", err, "output", string(output))
|
||||
}
|
||||
} else if previousNamespace == appName {
|
||||
// For the bare namespace (first restore), scale deployments to zero
|
||||
// instead of deleting — keeps the namespace for future non-restore deploys
|
||||
scaleCmd := exec.Command("kubectl", "scale", "deployment", "--all", "--replicas=0", "-n", previousNamespace)
|
||||
tools.WithKubeconfig(scaleCmd, kubeconfigPath)
|
||||
if output, err := scaleCmd.CombinedOutput(); err != nil {
|
||||
slog.Error("failed to scale down previous deployments", "component", "backup", "namespace", previousNamespace, "error", err, "output", string(output))
|
||||
}
|
||||
}
|
||||
} else if previousNamespace == appName {
|
||||
// For the bare namespace (first restore), scale deployments to zero
|
||||
// instead of deleting — keeps the namespace for future non-restore deploys
|
||||
scaleCmd := exec.Command("kubectl", "scale", "deployment", "--all", "--replicas=0", "-n", previousNamespace)
|
||||
tools.WithKubeconfig(scaleCmd, kubeconfigPath)
|
||||
if output, err := scaleCmd.CombinedOutput(); err != nil {
|
||||
slog.Error("failed to scale down previous deployments", "component", "backup", "namespace", previousNamespace, "error", err, "output", string(output))
|
||||
}
|
||||
}
|
||||
|
||||
// Remove previous active app directory
|
||||
previousAppDir := plan.Source.AppDir
|
||||
if previousAppDir != "" {
|
||||
absPath := previousAppDir
|
||||
if !filepath.IsAbs(absPath) {
|
||||
absPath = filepath.Join(m.dataDir, absPath)
|
||||
}
|
||||
// Only remove if it's a colored directory (not the bare app dir)
|
||||
if strings.Contains(filepath.Base(absPath), "-") {
|
||||
os.RemoveAll(absPath)
|
||||
// Remove previous active app directory
|
||||
previousAppDir := plan.Source.AppDir
|
||||
if previousAppDir != "" {
|
||||
absPath := previousAppDir
|
||||
if !filepath.IsAbs(absPath) {
|
||||
absPath = filepath.Join(m.dataDir, absPath)
|
||||
}
|
||||
// Only remove if it's a colored directory (not the bare app dir)
|
||||
if strings.Contains(filepath.Base(absPath), "-") {
|
||||
os.RemoveAll(absPath)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -808,6 +808,253 @@ func TestUpdatePVCVolumeBindingsNoLonghornStrategy(t *testing.T) {
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
|
||||
func TestBackupAppInPlaceRestoreMode(t *testing.T) {
|
||||
t.Run("sets RestoreMode to in-place when manifest declares it", func(t *testing.T) {
|
||||
tempDir := t.TempDir()
|
||||
instanceName := "test-instance"
|
||||
appName := "postgres"
|
||||
instanceDir := filepath.Join(tempDir, "instances", instanceName)
|
||||
appsDir := filepath.Join(instanceDir, "apps", appName)
|
||||
backupsDir := filepath.Join(instanceDir, "backups")
|
||||
require.NoError(t, os.MkdirAll(appsDir, 0755))
|
||||
require.NoError(t, os.MkdirAll(backupsDir, 0755))
|
||||
|
||||
manifestContent := `
|
||||
name: postgres
|
||||
description: PostgreSQL database
|
||||
version: 1.0.0-2
|
||||
backup:
|
||||
restoreMode: in-place
|
||||
defaultConfig:
|
||||
namespace: postgres
|
||||
`
|
||||
require.NoError(t, os.WriteFile(filepath.Join(appsDir, "manifest.yaml"), []byte(manifestContent), 0644))
|
||||
|
||||
configContent := `
|
||||
backup:
|
||||
destination:
|
||||
type: local
|
||||
local:
|
||||
path: ` + backupsDir + `
|
||||
`
|
||||
require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte(configContent), 0644))
|
||||
|
||||
mgr := NewManager(tempDir)
|
||||
mgr.strategies = map[string]Strategy{
|
||||
"config": &MockStrategy{Name_: "config"},
|
||||
}
|
||||
|
||||
plan, err := mgr.BackupApp(instanceName, appName)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "in-place", plan.RestoreMode)
|
||||
})
|
||||
|
||||
t.Run("leaves RestoreMode empty when manifest has no backup block", func(t *testing.T) {
|
||||
tempDir := t.TempDir()
|
||||
instanceName := "test-instance"
|
||||
appName := "gitea"
|
||||
instanceDir := filepath.Join(tempDir, "instances", instanceName)
|
||||
appsDir := filepath.Join(instanceDir, "apps", appName)
|
||||
backupsDir := filepath.Join(instanceDir, "backups")
|
||||
require.NoError(t, os.MkdirAll(appsDir, 0755))
|
||||
require.NoError(t, os.MkdirAll(backupsDir, 0755))
|
||||
|
||||
manifestContent := `
|
||||
name: gitea
|
||||
description: Gitea
|
||||
version: 1.0.0
|
||||
defaultConfig:
|
||||
namespace: gitea
|
||||
`
|
||||
require.NoError(t, os.WriteFile(filepath.Join(appsDir, "manifest.yaml"), []byte(manifestContent), 0644))
|
||||
|
||||
configContent := `
|
||||
backup:
|
||||
destination:
|
||||
type: local
|
||||
local:
|
||||
path: ` + backupsDir + `
|
||||
`
|
||||
require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte(configContent), 0644))
|
||||
|
||||
mgr := NewManager(tempDir)
|
||||
mgr.strategies = map[string]Strategy{
|
||||
"config": &MockStrategy{Name_: "config"},
|
||||
}
|
||||
|
||||
plan, err := mgr.BackupApp(instanceName, appName)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "", plan.RestoreMode)
|
||||
})
|
||||
}
|
||||
|
||||
func TestRestoreAppInPlace(t *testing.T) {
|
||||
tempDir := t.TempDir()
|
||||
instanceName := "test-instance"
|
||||
appName := "postgres"
|
||||
timestamp := "20240101T120000Z"
|
||||
|
||||
instanceDir := filepath.Join(tempDir, "instances", instanceName)
|
||||
backupDir := filepath.Join(instanceDir, "backups", appName, timestamp)
|
||||
require.NoError(t, os.MkdirAll(backupDir, 0755))
|
||||
|
||||
// Config with backup destination
|
||||
configContent := `
|
||||
backup:
|
||||
destination:
|
||||
type: local
|
||||
local:
|
||||
path: ` + filepath.Join(instanceDir, "backups") + `
|
||||
`
|
||||
require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte(configContent), 0644))
|
||||
|
||||
// Saved plan with in-place mode and backed_up status
|
||||
plan := &btypes.RecoveryPlan{
|
||||
App: appName,
|
||||
Instance: instanceName,
|
||||
Timestamp: timestamp,
|
||||
Status: "backed_up",
|
||||
RestoreMode: "in-place",
|
||||
Source: btypes.RecoverySource{
|
||||
ActiveColor: "blue",
|
||||
Namespace: appName,
|
||||
AppDir: filepath.Join("instances", instanceName, "apps", appName),
|
||||
},
|
||||
StandbyColor: "green",
|
||||
Strategies: []btypes.StrategyEntry{
|
||||
{Name: "config", Status: "backed_up"},
|
||||
},
|
||||
Phases: map[string]btypes.PhaseTime{},
|
||||
}
|
||||
|
||||
data, _ := yaml.Marshal(plan)
|
||||
require.NoError(t, os.WriteFile(filepath.Join(backupDir, "recovery-plan.yaml"), data, 0600))
|
||||
|
||||
mgr := NewManager(tempDir)
|
||||
mgr.strategies = map[string]Strategy{
|
||||
"config": &MockStrategy{Name_: "config"},
|
||||
}
|
||||
|
||||
restored, err := mgr.RestoreApp(instanceName, appName, RestoreOptions{})
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "restored", restored.Status)
|
||||
// In-place: standby namespace is the source namespace (not a colored standby)
|
||||
assert.Equal(t, appName, restored.Standby.Namespace)
|
||||
assert.Equal(t, plan.Source.AppDir, restored.Standby.AppDir)
|
||||
}
|
||||
|
||||
func TestSwitchAppInPlaceUpdatesActiveDeployment(t *testing.T) {
|
||||
tempDir := t.TempDir()
|
||||
instanceName := "test-instance"
|
||||
appName := "postgres"
|
||||
timestamp := "20240101T120000Z"
|
||||
|
||||
instanceDir := filepath.Join(tempDir, "instances", instanceName)
|
||||
backupDir := filepath.Join(instanceDir, "backups", appName, timestamp)
|
||||
require.NoError(t, os.MkdirAll(backupDir, 0755))
|
||||
|
||||
// Config with backup destination and existing app config
|
||||
configContent := `
|
||||
apps:
|
||||
postgres:
|
||||
namespace: postgres
|
||||
backup:
|
||||
destination:
|
||||
type: local
|
||||
local:
|
||||
path: ` + filepath.Join(instanceDir, "backups") + `
|
||||
`
|
||||
require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte(configContent), 0644))
|
||||
|
||||
// Saved plan in restored state with in-place mode
|
||||
plan := &btypes.RecoveryPlan{
|
||||
App: appName,
|
||||
Instance: instanceName,
|
||||
Timestamp: timestamp,
|
||||
Status: "restored",
|
||||
RestoreMode: "in-place",
|
||||
StandbyColor: "green",
|
||||
Source: btypes.RecoverySource{
|
||||
ActiveColor: "blue",
|
||||
Namespace: appName,
|
||||
},
|
||||
Standby: btypes.RecoveryStandby{
|
||||
Namespace: appName,
|
||||
},
|
||||
Strategies: []btypes.StrategyEntry{
|
||||
{Name: "config", Status: "restored"},
|
||||
},
|
||||
Phases: map[string]btypes.PhaseTime{},
|
||||
}
|
||||
|
||||
data, _ := yaml.Marshal(plan)
|
||||
require.NoError(t, os.WriteFile(filepath.Join(backupDir, "recovery-plan.yaml"), data, 0600))
|
||||
|
||||
mgr := NewManager(tempDir)
|
||||
mgr.strategies = map[string]Strategy{
|
||||
"config": &MockStrategy{Name_: "config"},
|
||||
}
|
||||
|
||||
switched, err := mgr.SwitchApp(instanceName, appName)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "switched", switched.Status)
|
||||
|
||||
// activeDeployment should be updated even for in-place (tracks color for next restore)
|
||||
color := mgr.getActiveDeployment(instanceName, appName)
|
||||
assert.Equal(t, "green", color)
|
||||
}
|
||||
|
||||
func TestCleanupAppInPlaceSkipsNamespaceDeletion(t *testing.T) {
|
||||
tempDir := t.TempDir()
|
||||
instanceName := "test-instance"
|
||||
appName := "postgres"
|
||||
timestamp := "20240101T120000Z"
|
||||
|
||||
instanceDir := filepath.Join(tempDir, "instances", instanceName)
|
||||
backupDir := filepath.Join(instanceDir, "backups", appName, timestamp)
|
||||
require.NoError(t, os.MkdirAll(backupDir, 0755))
|
||||
|
||||
configContent := `
|
||||
backup:
|
||||
destination:
|
||||
type: local
|
||||
local:
|
||||
path: ` + filepath.Join(instanceDir, "backups") + `
|
||||
`
|
||||
require.NoError(t, os.WriteFile(filepath.Join(instanceDir, "config.yaml"), []byte(configContent), 0644))
|
||||
|
||||
// Saved plan in switched state with in-place mode
|
||||
plan := &btypes.RecoveryPlan{
|
||||
App: appName,
|
||||
Instance: instanceName,
|
||||
Timestamp: timestamp,
|
||||
Status: "switched",
|
||||
RestoreMode: "in-place",
|
||||
StandbyColor: "green",
|
||||
Source: btypes.RecoverySource{
|
||||
ActiveColor: "blue",
|
||||
Namespace: appName,
|
||||
},
|
||||
Strategies: []btypes.StrategyEntry{
|
||||
{Name: "config", Status: "switched"},
|
||||
},
|
||||
Phases: map[string]btypes.PhaseTime{},
|
||||
}
|
||||
|
||||
data, _ := yaml.Marshal(plan)
|
||||
require.NoError(t, os.WriteFile(filepath.Join(backupDir, "recovery-plan.yaml"), data, 0600))
|
||||
|
||||
mgr := NewManager(tempDir)
|
||||
mgr.strategies = map[string]Strategy{
|
||||
"config": &MockStrategy{Name_: "config"},
|
||||
}
|
||||
|
||||
// Should succeed without trying to run kubectl (no cluster access needed)
|
||||
cleaned, err := mgr.CleanupApp(instanceName, appName)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "cleaned_up", cleaned.Status)
|
||||
}
|
||||
|
||||
func TestIsDbNameEnvVar(t *testing.T) {
|
||||
tests := []struct {
|
||||
envName string
|
||||
|
||||
@@ -203,7 +203,8 @@ func (l *LonghornNativeStrategy) Restore(plan *btypes.RecoveryPlan, dest btypes.
|
||||
return nil
|
||||
}
|
||||
|
||||
// Switch records previous active volume names
|
||||
// Switch either records previous active volume names (standby mode) or performs
|
||||
// an in-place PVC swap: scale down → delete old PVC/PV/volume → create new PV+PVC → scale up.
|
||||
func (l *LonghornNativeStrategy) Switch(plan *btypes.RecoveryPlan) error {
|
||||
entry := plan.GetStrategyEntry("longhorn-native")
|
||||
if entry == nil {
|
||||
@@ -211,6 +212,15 @@ func (l *LonghornNativeStrategy) Switch(plan *btypes.RecoveryPlan) error {
|
||||
}
|
||||
entry.Status = "switching"
|
||||
|
||||
if plan.RestoreMode == "in-place" {
|
||||
if err := l.switchInPlace(plan, entry); err != nil {
|
||||
return err
|
||||
}
|
||||
entry.Status = "switched"
|
||||
return nil
|
||||
}
|
||||
|
||||
// Standby mode: record previous active volume names for Cleanup phase
|
||||
previousVolumes := []map[string]any{}
|
||||
if volumeParams, ok := entry.Params["volumes"].([]any); ok {
|
||||
for _, vp := range volumeParams {
|
||||
@@ -230,7 +240,136 @@ func (l *LonghornNativeStrategy) Switch(plan *btypes.RecoveryPlan) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Cleanup deletes the previous active-color Longhorn volumes
|
||||
// switchInPlace performs a PVC swap in the original namespace:
|
||||
// scale down → delete old PVC/PV/Longhorn volume → create new PV+PVC → scale up.
|
||||
func (l *LonghornNativeStrategy) switchInPlace(plan *btypes.RecoveryPlan, entry *btypes.StrategyEntry) error {
|
||||
kubeconfigPath := tools.GetKubeconfigPath(l.dataDir, plan.Instance)
|
||||
namespace := plan.Source.Namespace
|
||||
|
||||
restoreVolumes, ok := entry.Restore["volumes"].([]any)
|
||||
if !ok || len(restoreVolumes) == 0 {
|
||||
// No PVCs to swap (e.g., memcached)
|
||||
slog.Info("no volumes to swap for in-place restore, skipping PVC swap", "component", "longhorn", "namespace", namespace)
|
||||
return l.bounceNamespace(kubeconfigPath, namespace)
|
||||
}
|
||||
|
||||
// Build a map of pvcName → restored volume name for quick lookup
|
||||
restoredVolumeByPVC := map[string]string{}
|
||||
for _, rv := range restoreVolumes {
|
||||
if rvMap, ok := rv.(map[string]any); ok {
|
||||
pvcName, _ := rvMap["pvcName"].(string)
|
||||
volumeName, _ := rvMap["volumeName"].(string)
|
||||
if pvcName != "" && volumeName != "" {
|
||||
restoredVolumeByPVC[pvcName] = volumeName
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Collect PVC metadata from Params for size/accessMode
|
||||
pvcParams := map[string]map[string]any{}
|
||||
if volumeParams, ok := entry.Params["volumes"].([]any); ok {
|
||||
for _, vp := range volumeParams {
|
||||
if vpMap, ok := vp.(map[string]any); ok {
|
||||
if pvcName, ok := vpMap["pvcName"].(string); ok {
|
||||
pvcParams[pvcName] = vpMap
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Step 1: Scale down all workloads in the namespace
|
||||
slog.Info("scaling down workloads for in-place restore", "component", "longhorn", "namespace", namespace)
|
||||
if err := l.scaleNamespace(kubeconfigPath, namespace, 0); err != nil {
|
||||
return fmt.Errorf("failed to scale down namespace %s: %w", namespace, err)
|
||||
}
|
||||
if err := l.waitForPodsGone(kubeconfigPath, namespace); err != nil {
|
||||
return fmt.Errorf("timed out waiting for pods to terminate in %s: %w", namespace, err)
|
||||
}
|
||||
|
||||
// Step 2: For each PVC, record old PV/volume then swap
|
||||
previousItems := []map[string]any{}
|
||||
|
||||
for pvcName, restoredVolumeName := range restoredVolumeByPVC {
|
||||
// Get current PV name and Longhorn volume handle
|
||||
pvName, longhornVolume, err := l.getPVInfo(kubeconfigPath, namespace, pvcName)
|
||||
if err != nil {
|
||||
slog.Error("failed to get PV info, skipping PVC", "component", "longhorn", "pvc", pvcName, "error", err)
|
||||
}
|
||||
|
||||
previousItems = append(previousItems, map[string]any{
|
||||
"pvcName": pvcName,
|
||||
"pvName": pvName,
|
||||
"longhornVolume": longhornVolume,
|
||||
})
|
||||
|
||||
// Delete the PVC (may cascade to PV deletion if Delete reclaimPolicy)
|
||||
slog.Info("deleting PVC for in-place swap", "component", "longhorn", "pvc", pvcName, "namespace", namespace)
|
||||
deleteCmd := exec.Command("kubectl", "delete", "pvc", pvcName, "-n", namespace, "--ignore-not-found", "--timeout=60s")
|
||||
tools.WithKubeconfig(deleteCmd, kubeconfigPath)
|
||||
if output, err := deleteCmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("failed to delete PVC %s: %w, output: %s", pvcName, err, output)
|
||||
}
|
||||
|
||||
// Clean up old PV if it still exists (Retain policy from a prior in-place restore)
|
||||
if pvName != "" {
|
||||
if err := l.deletePVIfExists(kubeconfigPath, pvName); err != nil {
|
||||
slog.Error("failed to delete old PV", "component", "longhorn", "pv", pvName, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up old Longhorn volume if it still exists and was a restore volume
|
||||
// (colored name pattern: pvcName-blue or pvcName-green)
|
||||
if longhornVolume != "" && (strings.HasSuffix(longhornVolume, "-blue") || strings.HasSuffix(longhornVolume, "-green")) {
|
||||
if err := l.deleteLonghornVolumeIfExists(kubeconfigPath, longhornVolume); err != nil {
|
||||
slog.Error("failed to delete old Longhorn volume", "component", "longhorn", "volume", longhornVolume, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Get size and access mode for the new PVC
|
||||
size := "10Gi"
|
||||
accessMode := "ReadWriteOnce"
|
||||
if params, ok := pvcParams[pvcName]; ok {
|
||||
if s, ok := params["size"].(string); ok && s != "" {
|
||||
size = s
|
||||
}
|
||||
if m, ok := params["accessMode"].(string); ok && m != "" {
|
||||
accessMode = m
|
||||
}
|
||||
}
|
||||
|
||||
// Create new PV pointing to the restored Longhorn volume
|
||||
slog.Info("creating new PV for restored volume", "component", "longhorn", "volume", restoredVolumeName, "pvc", pvcName)
|
||||
if err := l.createPVForVolume(kubeconfigPath, restoredVolumeName, size, accessMode, namespace, pvcName); err != nil {
|
||||
return fmt.Errorf("failed to create PV for volume %s: %w", restoredVolumeName, err)
|
||||
}
|
||||
|
||||
// Create new PVC bound to the new PV
|
||||
slog.Info("creating new PVC bound to restored volume", "component", "longhorn", "pvc", pvcName, "volume", restoredVolumeName)
|
||||
if err := l.createPVC(kubeconfigPath, pvcName, namespace, restoredVolumeName, size, accessMode); err != nil {
|
||||
return fmt.Errorf("failed to create PVC %s: %w", pvcName, err)
|
||||
}
|
||||
|
||||
// Wait for PVC to be Bound
|
||||
if err := l.waitForPVCBound(kubeconfigPath, namespace, pvcName); err != nil {
|
||||
return fmt.Errorf("PVC %s did not reach Bound state: %w", pvcName, err)
|
||||
}
|
||||
}
|
||||
|
||||
entry.Switch = map[string]any{
|
||||
"previousItems": previousItems,
|
||||
}
|
||||
|
||||
// Step 3: Scale back up
|
||||
slog.Info("scaling workloads back up after in-place restore", "component", "longhorn", "namespace", namespace)
|
||||
if err := l.scaleNamespace(kubeconfigPath, namespace, 1); err != nil {
|
||||
return fmt.Errorf("failed to scale up namespace %s: %w", namespace, err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Cleanup deletes the previous active-color Longhorn volumes (standby mode).
|
||||
// For in-place mode this is a no-op — the old PVC/PV/volume were cleaned up during Switch.
|
||||
func (l *LonghornNativeStrategy) Cleanup(plan *btypes.RecoveryPlan) error {
|
||||
entry := plan.GetStrategyEntry("longhorn-native")
|
||||
if entry == nil {
|
||||
@@ -238,7 +377,8 @@ func (l *LonghornNativeStrategy) Cleanup(plan *btypes.RecoveryPlan) error {
|
||||
}
|
||||
entry.Status = "cleaning_up"
|
||||
|
||||
// Skip automatic volume cleanup — volumes may still be referenced by PVCs.
|
||||
// In-place: nothing to clean up here (handled during Switch)
|
||||
// Standby: skip automatic volume cleanup — volumes may still be referenced by PVCs.
|
||||
// Manual cleanup or a separate garbage collection process is safer.
|
||||
|
||||
entry.Status = "cleaned_up"
|
||||
@@ -610,3 +750,130 @@ func (l *LonghornNativeStrategy) waitForVolume(kubeconfigPath, volumeName string
|
||||
func (l *LonghornNativeStrategy) cleanupOldBackups(_, _, _ string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// bounceNamespace scales a namespace down then back up (used for stateless in-place restores like memcached).
|
||||
func (l *LonghornNativeStrategy) bounceNamespace(kubeconfigPath, namespace string) error {
|
||||
if err := l.scaleNamespace(kubeconfigPath, namespace, 0); err != nil {
|
||||
return fmt.Errorf("failed to scale down namespace %s: %w", namespace, err)
|
||||
}
|
||||
if err := l.waitForPodsGone(kubeconfigPath, namespace); err != nil {
|
||||
return fmt.Errorf("timed out waiting for pods to terminate in %s: %w", namespace, err)
|
||||
}
|
||||
return l.scaleNamespace(kubeconfigPath, namespace, 1)
|
||||
}
|
||||
|
||||
// scaleNamespace sets replicas on all Deployments and StatefulSets in a namespace.
|
||||
func (l *LonghornNativeStrategy) scaleNamespace(kubeconfigPath, namespace string, replicas int) error {
|
||||
for _, kind := range []string{"deployment", "statefulset"} {
|
||||
cmd := exec.Command("kubectl", "scale", kind, "--all", "-n", namespace,
|
||||
fmt.Sprintf("--replicas=%d", replicas))
|
||||
tools.WithKubeconfig(cmd, kubeconfigPath)
|
||||
if output, err := cmd.CombinedOutput(); err != nil {
|
||||
if !strings.Contains(string(output), "no resources found") {
|
||||
return fmt.Errorf("failed to scale %s in %s: %w, output: %s", kind, namespace, err, output)
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// waitForPodsGone waits until all pods in a namespace have terminated.
|
||||
func (l *LonghornNativeStrategy) waitForPodsGone(kubeconfigPath, namespace string) error {
|
||||
cmd := exec.Command("kubectl", "wait", "--for=delete", "pod", "--all",
|
||||
"-n", namespace, "--timeout=120s")
|
||||
tools.WithKubeconfig(cmd, kubeconfigPath)
|
||||
if output, err := cmd.CombinedOutput(); err != nil {
|
||||
if strings.Contains(string(output), "no matching resources found") {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("waiting for pods to terminate in %s: %w, output: %s", namespace, err, output)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// getPVInfo returns the PV name and Longhorn volume handle for a PVC.
|
||||
func (l *LonghornNativeStrategy) getPVInfo(kubeconfigPath, namespace, pvcName string) (string, string, error) {
|
||||
pvNameCmd := exec.Command("kubectl", "get", "pvc", pvcName, "-n", namespace,
|
||||
"-o", "jsonpath={.spec.volumeName}")
|
||||
tools.WithKubeconfig(pvNameCmd, kubeconfigPath)
|
||||
pvNameOutput, err := pvNameCmd.Output()
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("failed to get PV name for PVC %s: %w", pvcName, err)
|
||||
}
|
||||
pvName := strings.TrimSpace(string(pvNameOutput))
|
||||
if pvName == "" {
|
||||
return "", "", nil
|
||||
}
|
||||
|
||||
handleCmd := exec.Command("kubectl", "get", "pv", pvName,
|
||||
"-o", "jsonpath={.spec.csi.volumeHandle}")
|
||||
tools.WithKubeconfig(handleCmd, kubeconfigPath)
|
||||
handleOutput, err := handleCmd.Output()
|
||||
if err != nil {
|
||||
return pvName, "", fmt.Errorf("failed to get volume handle for PV %s: %w", pvName, err)
|
||||
}
|
||||
|
||||
return pvName, strings.TrimSpace(string(handleOutput)), nil
|
||||
}
|
||||
|
||||
// deletePVIfExists deletes a PersistentVolume if it exists (handles Retain reclaim policy cleanup).
|
||||
func (l *LonghornNativeStrategy) deletePVIfExists(kubeconfigPath, pvName string) error {
|
||||
cmd := exec.Command("kubectl", "delete", "pv", pvName, "--ignore-not-found", "--timeout=60s")
|
||||
tools.WithKubeconfig(cmd, kubeconfigPath)
|
||||
if output, err := cmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("failed to delete PV %s: %w, output: %s", pvName, err, output)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// deleteLonghornVolumeIfExists deletes a Longhorn Volume CR if it exists.
|
||||
func (l *LonghornNativeStrategy) deleteLonghornVolumeIfExists(kubeconfigPath, volumeName string) error {
|
||||
cmd := exec.Command("kubectl", "delete", "volumes.longhorn.io", volumeName,
|
||||
"-n", "longhorn-system", "--ignore-not-found", "--timeout=60s")
|
||||
tools.WithKubeconfig(cmd, kubeconfigPath)
|
||||
if output, err := cmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("failed to delete Longhorn volume %s: %w, output: %s", volumeName, err, output)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// createPVC creates a PVC pre-bound to a specific PV by name.
|
||||
func (l *LonghornNativeStrategy) createPVC(kubeconfigPath, pvcName, namespace, pvName, size, accessMode string) error {
|
||||
pvcYAML := fmt.Sprintf(`apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: %s
|
||||
namespace: %s
|
||||
spec:
|
||||
accessModes:
|
||||
- %s
|
||||
resources:
|
||||
requests:
|
||||
storage: %s
|
||||
storageClassName: longhorn
|
||||
volumeName: %s
|
||||
`, pvcName, namespace, accessMode, size, pvName)
|
||||
|
||||
cmd := exec.Command("kubectl", "apply", "-f", "-")
|
||||
tools.WithKubeconfig(cmd, kubeconfigPath)
|
||||
cmd.Stdin = strings.NewReader(pvcYAML)
|
||||
|
||||
if output, err := cmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("failed to create PVC %s: %w, output: %s", pvcName, err, output)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// waitForPVCBound polls until the PVC reaches Bound status.
|
||||
func (l *LonghornNativeStrategy) waitForPVCBound(kubeconfigPath, namespace, pvcName string) error {
|
||||
for range 60 {
|
||||
cmd := exec.Command("kubectl", "get", "pvc", pvcName, "-n", namespace,
|
||||
"-o", "jsonpath={.status.phase}")
|
||||
tools.WithKubeconfig(cmd, kubeconfigPath)
|
||||
if output, err := cmd.Output(); err == nil && strings.TrimSpace(string(output)) == "Bound" {
|
||||
return nil
|
||||
}
|
||||
time.Sleep(2 * time.Second)
|
||||
}
|
||||
return fmt.Errorf("timeout waiting for PVC %s in %s to be Bound", pvcName, namespace)
|
||||
}
|
||||
|
||||
@@ -36,6 +36,7 @@ type RecoveryPlan struct {
|
||||
Version string `yaml:"version" json:"version,omitempty"`
|
||||
Status string `yaml:"status" json:"status"` // backing_up, backed_up, restoring, restored, switching, switched, cleaning_up, cleaned_up, failed
|
||||
Error string `yaml:"error" json:"error,omitempty"`
|
||||
RestoreMode string `yaml:"restoreMode,omitempty" json:"restoreMode,omitempty"` // "in-place" or "" (default: standby blue/green)
|
||||
Source RecoverySource `yaml:"source" json:"source"`
|
||||
StandbyColor string `yaml:"standbyColor" json:"standbyColor"`
|
||||
Standby RecoveryStandby `yaml:"standby" json:"standby,omitempty"`
|
||||
|
||||
Reference in New Issue
Block a user