Files
wild-cloud/bin/wild-app-restore
2025-08-23 05:46:48 -07:00

602 lines
19 KiB
Bash
Executable File

#!/usr/bin/env bash
set -Eeuo pipefail
# wild-app-restore - Generic restore script for wild-cloud apps
# Usage: wild-app-restore <app-name> [snapshot-id] [--db-only|--pvc-only] [--skip-globals]
# --- Initialize Wild Cloud environment ---------------------------------------
# WC_ROOT must point at the Wild Cloud checkout; without it the shared helper
# library cannot be located, so stop immediately.
if [[ -z "${WC_ROOT:-}" ]]; then
  echo "WC_ROOT is not set." >&2
  exit 1
fi

# Load the shared helpers and initialise the environment. init_wild_env is not
# defined in this file (presumably it comes from common.sh).
source "${WC_ROOT}/scripts/common.sh"
init_wild_env
# --- Configuration ------------------------------------------------------------
# Prints the configured backup staging directory on stdout.
# Exits the (sub)shell with status 1 and a message on stderr when
# 'cloud.backup.staging' is not configured.
get_staging_dir() {
  if ! wild-config cloud.backup.staging --check; then
    echo "Staging directory is not set. Configure 'cloud.backup.staging' in config.yaml." >&2
    exit 1
  fi
  wild-config cloud.backup.staging
}
# Exports RESTIC_REPOSITORY and RESTIC_PASSWORD from the Wild Cloud config.
# Globals:  RESTIC_REPOSITORY, RESTIC_PASSWORD (written, exported)
# Exits with status 1 (message on stderr) when either value is missing,
# cannot be read, or is empty.
get_restic_config() {
  local repo password

  # Assign first, export afterwards: `export VAR="$(cmd)"` always succeeds and
  # would hide a failing read, leaving restic with an empty value.
  if ! wild-config cloud.backup.root --check \
    || ! repo=$(wild-config cloud.backup.root) \
    || [[ -z "$repo" ]]; then
    echo "WARNING: Could not get cloud backup root." >&2
    exit 1
  fi

  if ! wild-secret cloud.backupPassword --check \
    || ! password=$(wild-secret cloud.backupPassword) \
    || [[ -z "$password" ]]; then
    echo "WARNING: Could not get cloud backup secret." >&2
    exit 1
  fi

  export RESTIC_REPOSITORY="$repo"
  export RESTIC_PASSWORD="$password"
}
# --- Helpers ------------------------------------------------------------------
# Ensures kubectl is callable; exits with status 1 otherwise.
require_k8s() {
  command -v kubectl >/dev/null 2>&1 && return 0
  echo "kubectl not found." >&2
  exit 1
}
# Ensures yq is callable (needed to read manifest.yaml); exits with status 1
# otherwise.
require_yq() {
  command -v yq >/dev/null 2>&1 && return 0
  echo "yq not found. Required for parsing manifest.yaml files." >&2
  exit 1
}
# Ensures restic is callable (needed for snapshot operations); exits with
# status 1 otherwise.
require_restic() {
  command -v restic >/dev/null 2>&1 && return 0
  echo "restic not found. Required for snapshot operations." >&2
  exit 1
}
# Prints the usage text on stdout. $0 is expanded so the examples show the
# name the script was invoked with.
show_help() {
  cat <<EOF
Usage: $0 <app-name> [snapshot-id] [OPTIONS]
Restore application data from restic snapshots

Arguments:
 app-name Name of the application to restore
 snapshot-id Specific snapshot ID to restore (optional, uses latest if not provided)

Options:
 --db-only Restore only database data
 --pvc-only Restore only PVC data
 --skip-globals Skip restoring database globals (roles, permissions)
 --list List available snapshots for the app
 -h, --help Show this help message

Examples:
 $0 discourse # Restore latest discourse snapshot (all data)
 $0 discourse abc123 --db-only # Restore specific snapshot, database only
 $0 discourse --list # List available discourse snapshots
EOF
}
# --- App Discovery Functions (from wild-app-backup) --------------------------
# Prints, one per line, the database services (postgres, mysql, redis) named
# under .requires[] in the app's manifest.yaml below ${WC_HOME}/apps.
# Prints nothing when the manifest is missing or lists none of them; always
# returns 0.
discover_database_deps() {
  local app_name="$1"
  local manifest="${WC_HOME}/apps/${app_name}/manifest.yaml"

  [[ -f "$manifest" ]] || return 0

  # yq errors and "no match" from grep are both treated as "no dependencies".
  yq eval '.requires[].name' "$manifest" 2>/dev/null \
    | grep -E '^(postgres|mysql|redis)$' \
    || true
}
# Prints the names of the PVCs labelled app=<app-name> in the namespace named
# after the app, one per line. kubectl errors are swallowed; always returns 0.
discover_app_pvcs() {
  local app_name="$1"
  local -a query=(
    get pvc
    -n "$app_name"
    -l "app=$app_name"
    --no-headers
    -o custom-columns=":metadata.name"
  )
  kubectl "${query[@]}" 2>/dev/null || true
}
# Prints the name of the first pod in phase Running that carries the label
# app=<app-name> in the app's namespace. Prints nothing when there is none;
# always returns 0.
get_app_pods() {
  local app_name="$1"
  local running_names='{.items[?(@.status.phase=="Running")].metadata.name}'

  # jsonpath emits the names space-separated on one line; put each on its own
  # line and keep the first.
  kubectl get pods -n "$app_name" -l "app=$app_name" -o jsonpath="$running_names" 2>/dev/null \
    | tr ' ' '\n' \
    | head -n 1 \
    || true
}
# --- Restic Snapshot Functions -----------------------------------------------
# Prints a heading followed by up to 20 snapshots tagged with the app name,
# newest first, as "<short id> <time> <hostname> <paths...>".
list_app_snapshots() {
  local app_name="$1"
  local row_format='.[] | "\(.short_id) \(.time) \(.hostname) \(.paths | join(" "))"'

  printf '%s\n' "Available snapshots for app '$app_name':"
  restic snapshots --tag "$app_name" --json \
    | jq -r "$row_format" \
    | sort -k2 -r \
    | head -n 20
}
# Prints the short id of the most recent snapshot tagged with the app name,
# or nothing when no snapshot exists or restic/jq fail.
#
# The newest entry is selected explicitly by timestamp: list_app_snapshots in
# this file has to reverse-sort the same listing to show newest first, so
# taking element 0 yields the oldest snapshot. `// empty` prevents jq from
# printing the literal string "null" for an empty listing, which callers would
# mistake for a snapshot id.
get_latest_snapshot() {
  local app_name="$1"
  restic snapshots --tag "$app_name" --json \
    | jq -r 'sort_by(.time) | last | .short_id // empty' 2>/dev/null \
    || echo ""
}
# Restores a restic snapshot into <staging_dir>/restore/<app_name>.
# Arguments: $1 - app name, $2 - snapshot id, $3 - staging directory
# Outputs:   stdout carries ONLY the restore directory path, because callers
#            capture it with $(...); progress text and restic's own output go
#            to stderr.
# Returns:   0 on success, 1 on failure.
restore_from_snapshot() {
  local app_name="$1"
  local snapshot_id="$2"
  local staging_dir="$3"

  if [[ -z "$app_name" || -z "$staging_dir" ]]; then
    echo "restore_from_snapshot: app name and staging directory are required" >&2
    return 1
  fi

  local restore_dir="$staging_dir/restore/$app_name"

  # Start from an empty directory: files left behind by an earlier, aborted
  # restore would otherwise be mixed with this snapshot's content.
  if ! rm -rf -- "${restore_dir:?}" || ! mkdir -p -- "$restore_dir"; then
    echo "Failed to prepare restore directory $restore_dir" >&2
    return 1
  fi

  echo "Restoring snapshot $snapshot_id to $restore_dir..." >&2
  if ! restic restore "$snapshot_id" --target "$restore_dir" >&2; then
    echo "Failed to restore snapshot $snapshot_id" >&2
    return 1
  fi

  echo "$restore_dir"
}
# --- Database Restore Functions ----------------------------------------------
# Restores the app's PostgreSQL database from a dump in the restore directory.
# Arguments: $1 - app name (used as both database name and owning role)
#            $2 - directory holding the restored snapshot
#            $3 - "true" to skip restoring globals_*.sql
# Returns:   0 on success, 1 on any failure (message on stderr).
#
# All SQL is fed to psql on stdin instead of through `bash -lc "psql -c \"...\""`:
#   * inside the remote double-quoted string `$$` was expanded by the remote
#     shell to its PID, corrupting the DO block;
#   * DROP/CREATE DATABASE cannot be part of a multi-statement -c string,
#     whereas psql sends statements read from stdin one at a time.
# pg_restore runs without -j because parallel restore cannot read an archive
# from standard input.
#
# NOTE: database and role names are interpolated unquoted into SQL, so the app
# name must be a valid unquoted PostgreSQL identifier.
# NOTE(review): globals are replayed with ON_ERROR_STOP=1; if the globals file
# re-creates roles that already exist this step will fail - confirm how the
# backup side generates it.
restore_postgres_database() {
  local app_name="$1"
  local restore_dir="$2"
  local skip_globals="$3"
  local pg_ns="postgres"
  local pg_deploy="postgres-deployment"
  local db_superuser="postgres"
  local db_name="$app_name"
  local db_role="$app_name"

  echo "Restoring PostgreSQL database '$db_name'..."

  # Check if postgres is available
  if ! kubectl get pods -n "$pg_ns" >/dev/null 2>&1; then
    echo "PostgreSQL namespace '$pg_ns' not accessible. Cannot restore database." >&2
    return 1
  fi

  # Find database dump file
  local db_dump
  db_dump=$(find "$restore_dir" \( -name "database_*.dump" -o -name "*_db_*.dump" \) | head -n 1)
  if [[ -z "$db_dump" ]]; then
    echo "No database dump found for '$app_name'" >&2
    return 1
  fi

  # Find globals file (optional)
  local globals_file
  globals_file=$(find "$restore_dir" -name "globals_*.sql" | head -n 1)

  # Command prefix that runs one shell command line inside the postgres
  # deployment with our stdin attached.
  local -a pg_exec_i=(kubectl exec -i -n "$pg_ns" "deploy/${pg_deploy}" -- bash -lc)
  local psql_admin="psql -v ON_ERROR_STOP=1 -U ${db_superuser} -d postgres"

  # Restore globals first if available and not skipped
  if [[ "$skip_globals" != "true" && -n "$globals_file" && -f "$globals_file" ]]; then
    echo "Restoring database globals..."
    if ! "${pg_exec_i[@]}" "$psql_admin" < "$globals_file"; then
      echo "Failed to restore database globals for '$app_name'" >&2
      return 1
    fi
  fi

  # Ensure the role exists, kick out existing sessions, then recreate the
  # database from scratch.
  if ! "${pg_exec_i[@]}" "$psql_admin" <<SQL
DO \$\$
BEGIN
  IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='${db_role}') THEN
    CREATE ROLE ${db_role} LOGIN;
  END IF;
END
\$\$;
SELECT pg_terminate_backend(pid)
  FROM pg_stat_activity
 WHERE datname='${db_name}' AND pid <> pg_backend_pid();
DROP DATABASE IF EXISTS ${db_name};
CREATE DATABASE ${db_name} OWNER ${db_role};
SQL
  then
    echo "Failed to prepare database '$db_name' for restore" >&2
    return 1
  fi

  # Restore database from dump (streamed over stdin, hence no -j)
  echo "Restoring database from $db_dump..."
  if ! "${pg_exec_i[@]}" \
    "pg_restore -v -U ${db_superuser} --clean --if-exists --no-owner --role=${db_role} -d ${db_name}" \
    < "$db_dump"; then
    echo "Database restore failed for '$app_name'" >&2
    return 1
  fi

  # Ensure proper ownership
  if ! "${pg_exec_i[@]}" "$psql_admin" <<<"ALTER DATABASE ${db_name} OWNER TO ${db_role};"; then
    echo "Failed to set owner of database '$db_name'" >&2
    return 1
  fi

  echo "Database restore completed for '$app_name'"
}
# Restores the app's MySQL database from a .sql dump in the restore directory.
# Arguments: $1 - app name (also the database name)
#            $2 - directory holding the restored snapshot
# Returns:   0 on success, 1 on any failure (message on stderr).
#
# mysql is executed directly through `kubectl exec -- mysql ...` rather than
# via `bash -c "<string>"`, so the password and database name are passed as
# plain arguments and are never re-parsed by a shell (a password containing a
# quote used to break the command). The database name is backtick-quoted so
# names containing '-' are accepted.
# NOTE: the password is still visible in the process argument list, as before.
restore_mysql_database() {
  local app_name="$1"
  local restore_dir="$2"
  local mysql_ns="mysql"
  local mysql_deploy="mysql-deployment"
  local mysql_user="root"
  local db_name="$app_name"

  echo "Restoring MySQL database '$db_name'..."

  if ! kubectl get pods -n "$mysql_ns" >/dev/null 2>&1; then
    echo "MySQL namespace '$mysql_ns' not accessible. Cannot restore database." >&2
    return 1
  fi

  # Find database dump file
  local db_dump
  db_dump=$(find "$restore_dir" \( -name "database_*.sql" -o -name "*_db_*.sql" \) | head -n 1)
  if [[ -z "$db_dump" ]]; then
    echo "No database dump found for '$app_name'" >&2
    return 1
  fi

  # Get MySQL root password from secret. An empty result is treated as a
  # failure too: `-p` with no value would make mysql prompt interactively.
  local mysql_password
  if ! mysql_password=$(kubectl get secret -n "$mysql_ns" mysql-secret \
      -o jsonpath='{.data.password}' 2>/dev/null | base64 -d) \
    || [[ -z "$mysql_password" ]]; then
    echo "Could not retrieve MySQL password. Cannot restore database." >&2
    return 1
  fi

  # Drop and recreate database
  local recreate_sql="DROP DATABASE IF EXISTS \`${db_name}\`; CREATE DATABASE \`${db_name}\`;"
  if ! kubectl exec -n "$mysql_ns" "deploy/${mysql_deploy}" -- \
    mysql "-u${mysql_user}" "-p${mysql_password}" -e "$recreate_sql"; then
    echo "Failed to recreate database '$db_name'" >&2
    return 1
  fi

  # Restore database from dump
  echo "Restoring database from $db_dump..."
  if ! kubectl exec -i -n "$mysql_ns" "deploy/${mysql_deploy}" -- \
    mysql "-u${mysql_user}" "-p${mysql_password}" "$db_name" < "$db_dump"; then
    echo "Database restore failed for '$app_name'" >&2
    return 1
  fi

  echo "Database restore completed for '$app_name'"
}
# --- PVC Restore Functions ---------------------------------------------------
# Scales every deployment labelled app=<app-name> in the app's namespace to
# the requested replica count. When the count is above zero, waits for each
# rollout to complete before moving on.
# Arguments: $1 - app name, $2 - replica count
# Returns:   1 when no deployment matches.
scale_app() {
  local app_name="$1"
  local replicas="$2"
  local found target
  local -a targets=()

  echo "Scaling app '$app_name' to $replicas replicas..."

  found=$(kubectl get deploy -n "$app_name" -l "app=$app_name" -o name 2>/dev/null || true)
  if [[ -z "$found" ]]; then
    echo "No deployments found for app '$app_name'" >&2
    return 1
  fi

  # Split the kubectl output on whitespace into an array; read hits EOF and
  # reports failure by design, hence the "|| true".
  read -r -d '' -a targets <<<"$found" || true

  for target in "${targets[@]}"; do
    kubectl scale "$target" -n "$app_name" --replicas="$replicas"
    if [[ "$replicas" -gt 0 ]]; then
      kubectl rollout status "$target" -n "$app_name"
    fi
  done
}
# Removes the temporary restore pod and tells the operator where the safety
# snapshot is after a failed PVC restore.
_restore_pvc_abort() {
  local app_name="$1"
  local temp_pod="$2"
  local safety_snapshot="$3"
  kubectl delete pod -n "$app_name" "$temp_pod" --force --grace-period=0 || true
  echo "ERROR: Restore failed. Safety snapshot '$safety_snapshot' has been preserved for manual recovery." >&2
  echo "To recover from safety snapshot, use: kubectl get snapshot.longhorn.io -n longhorn-system $safety_snapshot" >&2
}

# Replaces the content of one PVC with the copy found in the restored snapshot.
# Arguments: $1 - app name (also its namespace)
#            $2 - PVC name
#            $3 - directory holding the restored snapshot
# Returns:   0 on success, 1 on failure (message on stderr).
#
# Steps: locate the backup directory, take a Longhorn safety snapshot, scale
# the app to 0, mount the PVC in a throw-away pod, wipe it, stream the backup
# in with tar, then scale the app back to 1 and drop the safety snapshot.
#
# Changes compared with the previous implementation:
#   * the Snapshot resource sets createSnapshot: true so Longhorn actually
#     takes the snapshot;
#   * the utility pod is pinned to a node only when one is recorded; an empty
#     lookup used to yield an empty nodeSelector value (the "first node"
#     fallback never ran because kubectl succeeds with empty output);
#   * waits for the app's pods to be deleted instead of sleeping 10 seconds;
#   * clears the volume with find, so hidden files are removed without the
#     `.*` glob and without hiding real errors.
# NOTE: after a failure the app is left scaled to 0 and the safety snapshot is
# kept. The app is always scaled back to 1 replica, whatever it ran before.
restore_app_pvc() {
  local app_name="$1"
  local pvc_name="$2"
  local restore_dir="$3"

  echo "Restoring PVC '$pvc_name' for app '$app_name'..."

  # Find the PVC backup directory in the restore directory
  local pvc_backup_dir
  pvc_backup_dir=$(find "$restore_dir" -type d -name "$pvc_name" | head -n 1)
  if [[ -z "$pvc_backup_dir" || ! -d "$pvc_backup_dir" ]]; then
    echo "No backup directory found for PVC '$pvc_name'" >&2
    return 1
  fi

  # Resolve PVC -> PV -> Longhorn volume
  local pv_name
  pv_name=$(kubectl get pvc -n "$app_name" "$pvc_name" -o jsonpath='{.spec.volumeName}' 2>/dev/null || true)
  if [[ -z "$pv_name" ]]; then
    echo "Could not find PersistentVolume for PVC '$pvc_name'" >&2
    return 1
  fi

  local longhorn_volume
  longhorn_volume=$(kubectl get pv "$pv_name" -o jsonpath='{.spec.csi.volumeHandle}' 2>/dev/null || true)
  if [[ -z "$longhorn_volume" ]]; then
    echo "Could not find Longhorn volume for PV '$pv_name'" >&2
    return 1
  fi

  # Create safety snapshot before destructive restore
  local safety_snapshot
  safety_snapshot="restore-safety-$(date +%s)"
  echo "Creating safety snapshot '$safety_snapshot' for volume '$longhorn_volume'..."
  if ! kubectl apply -f - <<EOF
apiVersion: longhorn.io/v1beta2
kind: Snapshot
metadata:
  name: $safety_snapshot
  namespace: longhorn-system
  labels:
    app: wild-app-restore
    volume: $longhorn_volume
    pvc: $pvc_name
    original-app: $app_name
spec:
  volume: $longhorn_volume
  createSnapshot: true
EOF
  then
    echo "Failed to create safety snapshot for volume '$longhorn_volume'. Nothing has been changed." >&2
    return 1
  fi

  # Wait for snapshot to be ready (best effort: the restore proceeds anyway)
  echo "Waiting for safety snapshot to be ready..."
  local snapshot_timeout=60
  local elapsed=0
  local snapshot_ready="false"
  while (( elapsed < snapshot_timeout )); do
    snapshot_ready=$(kubectl get snapshot.longhorn.io -n longhorn-system "$safety_snapshot" \
      -o jsonpath='{.status.readyToUse}' 2>/dev/null || echo "false")
    if [[ "$snapshot_ready" == "true" ]]; then
      echo "Safety snapshot created successfully"
      break
    fi
    sleep 2
    elapsed=$((elapsed + 2))
  done
  if [[ "$snapshot_ready" != "true" ]]; then
    echo "Warning: Safety snapshot may not be ready, but proceeding with restore..." >&2
  fi

  # Scale app down to avoid conflicts during restore
  scale_app "$app_name" 0

  # Wait for the app's pods to disappear so the volume can be re-attached.
  # kubectl wait fails when nothing matches (pods already gone); that is fine.
  echo "Waiting for pods to terminate and PVC to be released..."
  kubectl wait --for=delete pod -l "app=$app_name" -n "$app_name" --timeout=120s >/dev/null 2>&1 || true

  # Pin the utility pod only if the PV records a node; otherwise leave the
  # placement to the scheduler.
  local target_node
  target_node=$(kubectl get pv "$pv_name" \
    -o jsonpath='{.metadata.annotations.volume\.kubernetes\.io/selected-node}' 2>/dev/null || true)
  local node_selector=""
  if [[ -n "$target_node" ]]; then
    node_selector=$'  nodeSelector:\n    kubernetes.io/hostname: '"$target_node"
    echo "Creating restore utility pod on node: $target_node"
  else
    echo "Creating restore utility pod (no node recorded; scheduler picks the node)"
  fi

  # Create temporary pod with the PVC mounted
  local temp_pod
  temp_pod="restore-util-$(date +%s)"
  if ! kubectl apply -n "$app_name" -f - <<EOF
apiVersion: v1
kind: Pod
metadata:
  name: $temp_pod
  labels:
    app: restore-utility
spec:
$node_selector
  securityContext:
    runAsUser: 0
    fsGroup: 0
  containers:
  - name: restore-util
    image: alpine:latest
    command: ["/bin/sh", "-c", "sleep 3600"]
    volumeMounts:
    - name: data
      mountPath: /restore-target
  volumes:
  - name: data
    persistentVolumeClaim:
      claimName: $pvc_name
  restartPolicy: Never
  tolerations:
  - operator: Exists
EOF
  then
    echo "Failed to create restore utility pod." >&2
    _restore_pvc_abort "$app_name" "$temp_pod" "$safety_snapshot"
    return 1
  fi

  # Wait for pod to be ready
  echo "Waiting for restore utility pod to be ready..."
  if ! kubectl wait --for=condition=Ready pod/"$temp_pod" -n "$app_name" --timeout=120s; then
    echo "Restore utility pod failed to start. Checking status..."
    kubectl describe pod -n "$app_name" "$temp_pod" || true
    _restore_pvc_abort "$app_name" "$temp_pod" "$safety_snapshot"
    return 1
  fi

  echo "Clearing existing PVC data..."
  if ! kubectl exec -n "$app_name" "$temp_pod" -- find /restore-target -mindepth 1 -delete; then
    # Best effort, as before: tar below overwrites whatever is left.
    echo "Warning: Could not fully clear existing PVC data; continuing." >&2
  fi

  echo "Copying backup data to PVC..."
  # Use tar to stream data into the pod, preserving permissions
  if ! tar -C "$pvc_backup_dir" -cf - . \
    | kubectl exec -i -n "$app_name" "$temp_pod" -- tar -C /restore-target -xf -; then
    echo "Failed to copy data to PVC. Cleaning up..." >&2
    _restore_pvc_abort "$app_name" "$temp_pod" "$safety_snapshot"
    return 1
  fi

  echo "Verifying restored data..."
  kubectl exec -n "$app_name" "$temp_pod" -- sh -c "ls -la /restore-target | head -10" || true

  # Clean up temporary pod
  kubectl delete pod -n "$app_name" "$temp_pod"

  # Scale app back up
  scale_app "$app_name" 1

  # Clean up safety snapshot if restore was successful
  echo "Cleaning up safety snapshot '$safety_snapshot'..."
  if kubectl delete snapshot.longhorn.io -n longhorn-system "$safety_snapshot" 2>/dev/null; then
    echo "Safety snapshot cleaned up successfully"
  else
    echo "Warning: Could not clean up safety snapshot '$safety_snapshot'. You may need to delete it manually."
  fi

  echo "PVC '$pvc_name' restore completed successfully"
}
# --- Main Restore Function ---------------------------------------------------
# Restores one app: pulls the snapshot into the staging area, then restores
# its databases and/or PVCs depending on the mode.
# Arguments: $1 - app name
#            $2 - restic snapshot id
#            $3 - mode: "all", "db" or "pvc"
#            $4 - "true" to skip PostgreSQL globals
#            $5 - staging directory
# Returns:   1 when the snapshot could not be restored to staging.
#
# The component restore functions are called as plain statements on purpose:
# wrapping them in `if`/`||` would switch off errexit inside them.
restore_app() {
  local app_name="$1"
  local snapshot_id="$2"
  local mode="$3"
  local skip_globals="$4"
  local staging_dir="$5"

  echo "=========================================="
  echo "Starting restore of app: $app_name"
  echo "Snapshot: $snapshot_id"
  echo "Mode: $mode"
  echo "=========================================="

  # Restore snapshot to staging directory. The helper prints the directory as
  # the LAST line of its stdout; anything before it is progress output, so
  # only the last line is used as the path and the rest is passed through.
  local restore_output restore_dir
  if ! restore_output=$(restore_from_snapshot "$app_name" "$snapshot_id" "$staging_dir"); then
    echo "Failed to restore snapshot for '$app_name'" >&2
    return 1
  fi
  if [[ "$restore_output" == *$'\n'* ]]; then
    printf '%s\n' "${restore_output%$'\n'*}" >&2
  fi
  restore_dir="${restore_output##*$'\n'}"
  if [[ -z "$restore_dir" || ! -d "$restore_dir" ]]; then
    echo "Failed to restore snapshot for '$app_name'" >&2
    return 1
  fi

  # Discover what components this app has
  local database_deps
  database_deps=$(discover_database_deps "$app_name")
  local pvcs
  pvcs=$(discover_app_pvcs "$app_name")

  # Restore database components
  local db_type
  if [[ "$mode" == "all" || "$mode" == "db" ]]; then
    # shellcheck disable=SC2086 # intentional split: one dependency per word
    for db_type in $database_deps; do
      case "$db_type" in
        postgres)
          restore_postgres_database "$app_name" "$restore_dir" "$skip_globals"
          ;;
        mysql)
          restore_mysql_database "$app_name" "$restore_dir"
          ;;
        redis)
          echo "Redis restore not implemented yet. Skipping."
          ;;
      esac
    done
  fi

  # Restore PVC components
  local pvc
  if [[ "$mode" == "all" || "$mode" == "pvc" ]]; then
    # shellcheck disable=SC2086 # intentional split: one PVC name per word
    for pvc in $pvcs; do
      restore_app_pvc "$app_name" "$pvc" "$restore_dir"
    done
  fi

  # Clean up restore directory (":?" refuses to expand an empty path)
  rm -rf -- "${restore_dir:?}"

  echo "=========================================="
  echo "Restore completed for app: $app_name"
  echo "=========================================="
}
# --- Main Script Logic -------------------------------------------------------
# Entry point: parses the command line, then lists snapshots or runs a restore.
# Arguments: <app-name> [snapshot-id] [--db-only|--pvc-only] [--skip-globals]
#            [--list] [-h|--help]
#
# Arguments are parsed BEFORE any tool or configuration check so that --help
# and usage errors work on a machine without kubectl/restic or a configured
# backup. Unknown options are rejected instead of being taken as the snapshot
# id, and an option in the app-name position is an error.
main() {
  if [[ $# -eq 0 || "$1" == "--help" || "$1" == "-h" ]]; then
    show_help
    exit 0
  fi

  if [[ "$1" == -* ]]; then
    echo "Missing app name (got option '$1')." >&2
    show_help >&2
    exit 1
  fi

  local app_name="$1"
  shift

  local snapshot_id=""
  local mode="all"
  local skip_globals="false"
  local list_snapshots="false"

  # Parse remaining arguments
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --db-only)
        mode="db"
        ;;
      --pvc-only)
        mode="pvc"
        ;;
      --skip-globals)
        skip_globals="true"
        ;;
      --list)
        list_snapshots="true"
        ;;
      -h|--help)
        show_help
        exit 0
        ;;
      -*)
        echo "Unknown option: $1" >&2
        show_help
        exit 1
        ;;
      *)
        if [[ -n "$snapshot_id" ]]; then
          echo "Unknown option: $1" >&2
          show_help
          exit 1
        fi
        snapshot_id="$1"
        ;;
    esac
    shift
  done

  # Tooling and configuration
  require_k8s
  require_yq
  require_restic
  # jq is used to read restic's JSON snapshot listings.
  if ! command -v jq >/dev/null 2>&1; then
    echo "jq not found. Required for parsing restic snapshot listings." >&2
    exit 1
  fi
  get_restic_config

  # List snapshots if requested
  if [[ "$list_snapshots" == "true" ]]; then
    list_app_snapshots "$app_name"
    exit 0
  fi

  local staging_dir
  staging_dir=$(get_staging_dir) || exit 1
  mkdir -p "$staging_dir/restore"

  # Get latest snapshot if none specified
  if [[ -z "$snapshot_id" ]]; then
    snapshot_id=$(get_latest_snapshot "$app_name")
    if [[ -z "$snapshot_id" ]]; then
      echo "No snapshots found for app '$app_name'" >&2
      exit 1
    fi
    echo "Using latest snapshot: $snapshot_id"
  fi

  # Perform the restore
  restore_app "$app_name" "$snapshot_id" "$mode" "$skip_globals" "$staging_dir"
  echo "Restore operation completed successfully."
}
# Run the script: hand every command-line argument to main unchanged.
main "$@"