wild-cloud/bin/wild-app-restore

#!/usr/bin/env bash
set -Eeuo pipefail

# wild-app-restore - generic restore tool for wild-cloud apps.
# Usage: wild-app-restore <app-name> [snapshot-id] [--db-only|--pvc-only] [--skip-globals] [--list]

# --- Initialize Wild Cloud environment ---------------------------------------
# WC_ROOT locates scripts/common.sh; stop before sourcing anything if it is unset or empty.
if [[ -z "${WC_ROOT:-}" ]]; then
    echo "WC_ROOT is not set." >&2
    exit 1
fi

source "${WC_ROOT}/scripts/common.sh"
init_wild_env

# --- Configuration ------------------------------------------------------------
# Print the configured backup staging directory (config key cloud.backup.staging).
# Exits the (sub)shell with status 1 when the key fails its --check.
get_staging_dir() {
    local key="cloud.backup.staging"

    if ! wild-config "$key" --check; then
        echo "Staging directory is not set. Configure 'cloud.backup.staging' in config.yaml." >&2
        exit 1
    fi

    wild-config "$key"
}

# Export the restic connection settings read from Wild Cloud config/secrets.
#
# Globals written: RESTIC_REPOSITORY (from config key cloud.backup.root),
#                  RESTIC_PASSWORD   (from secret key cloud.backupPassword)
# Exits with status 1 when either value fails its --check or cannot be read.
get_restic_config() {
    local repo password

    if ! wild-config cloud.backup.root --check; then
        echo "WARNING: Could not get cloud backup root." >&2
        exit 1
    fi
    # Assign first, export second: `export VAR="$(cmd)"` reports export's
    # status and would hide a failing read behind an empty value.
    if ! repo="$(wild-config cloud.backup.root)"; then
        echo "WARNING: Could not get cloud backup root." >&2
        exit 1
    fi
    export RESTIC_REPOSITORY="$repo"

    if ! wild-secret cloud.backupPassword --check; then
        echo "WARNING: Could not get cloud backup secret." >&2
        exit 1
    fi
    if ! password="$(wild-secret cloud.backupPassword)"; then
        echo "WARNING: Could not get cloud backup secret." >&2
        exit 1
    fi
    export RESTIC_PASSWORD="$password"
}

# --- Helpers ------------------------------------------------------------------
# Abort with status 1 unless a kubectl command can be resolved.
require_k8s() {
    command -v kubectl >/dev/null 2>&1 && return 0

    echo "kubectl not found." >&2
    exit 1
}

# Abort with status 1 unless a yq command can be resolved.
require_yq() {
    command -v yq >/dev/null 2>&1 && return 0

    echo "yq not found. Required for parsing manifest.yaml files." >&2
    exit 1
}

# Abort with status 1 unless a restic command can be resolved.
require_restic() {
    command -v restic >/dev/null 2>&1 && return 0

    echo "restic not found. Required for snapshot operations." >&2
    exit 1
}

# Print the usage text to stdout; $0 is substituted as the program name.
show_help() {
    cat <<EOF
Usage: $0 <app-name> [snapshot-id] [OPTIONS]
Restore application data from restic snapshots

Arguments:
  app-name      Name of the application to restore
  snapshot-id   Specific snapshot ID to restore (optional, uses latest if not provided)

Options:
  --db-only     Restore only database data
  --pvc-only    Restore only PVC data
  --skip-globals Skip restoring database globals (roles, permissions)
  --list        List available snapshots for the app
  -h, --help    Show this help message

Examples:
  $0 discourse                    # Restore latest discourse snapshot (all data)
  $0 discourse abc123 --db-only  # Restore specific snapshot, database only
  $0 discourse --list            # List available discourse snapshots
EOF
}

# --- App Discovery Functions (from wild-app-backup) --------------------------
# Print the database services (postgres, mysql, redis) named under
# `.requires[].name` in the app's manifest, one per line.
# Prints nothing (status 0) when the manifest is missing or lists none of them.
#   $1 - app name; the manifest is read from ${WC_HOME}/apps/<app>/manifest.yaml
discover_database_deps() {
    local app_name="$1"
    local manifest="${WC_HOME}/apps/${app_name}/manifest.yaml"

    [[ -f "$manifest" ]] || return 0

    # grep exits non-zero when nothing matches; that is not an error here.
    yq eval '.requires[].name' "$manifest" 2>/dev/null \
        | grep -E '^(postgres|mysql|redis)$' \
        || true
}

# Print the names of PVCs labelled app=<app> in the namespace named after the app.
# Best effort: kubectl errors are silenced and the function still returns 0.
discover_app_pvcs() {
    local app_name="$1"
    local -a query=(
        get pvc
        -n "$app_name"
        -l "app=$app_name"
        --no-headers
        -o custom-columns=":metadata.name"
    )

    kubectl "${query[@]}" 2>/dev/null || true
}

# Print the name of the first Running pod labelled app=<app> in the app's namespace.
# Best effort: kubectl stderr is discarded and a failing pipeline still returns 0.
get_app_pods() {
    local app_name="$1"
    local name_query='jsonpath={.items[?(@.status.phase=="Running")].metadata.name}'

    # jsonpath emits the names space-separated on one line; split and keep the first.
    kubectl get pods -n "$app_name" -l "app=$app_name" -o "$name_query" 2>/dev/null \
        | tr ' ' '\n' \
        | head -1 \
        || true
}

# --- Restic Snapshot Functions -----------------------------------------------
# Print up to 20 restic snapshots tagged with the app name, one per line as
# "<short_id>  <time>  <hostname>  <paths>", reverse-sorted from the time column on.
list_app_snapshots() {
    local app_name="$1"
    local row_filter='.[] | "\(.short_id)  \(.time)  \(.hostname)  \(.paths | join(" "))"'

    printf '%s\n' "Available snapshots for app '$app_name':"
    restic snapshots --tag "$app_name" --json \
        | jq -r "$row_filter" \
        | sort -k2 -r \
        | head -20
}

# Print the short id of the newest restic snapshot tagged with the app name.
# Prints nothing (or an empty line on error) when there is no such snapshot,
# so callers can test the result with -z.
#   $1 - app name used as the restic tag
get_latest_snapshot() {
    local app_name="$1"

    # Pick the entry with the greatest timestamp instead of relying on the
    # listing order, and map "no snapshot" to empty output: without
    # `// empty`, jq -r prints the literal string "null".
    # NOTE(review): timestamps are compared as strings — assumes all
    # snapshots carry the same UTC offset; confirm.
    restic snapshots --tag "$app_name" --json \
        | jq -r '(. // []) | max_by(.time) | .short_id // empty' 2>/dev/null \
        || echo ""
}

# Restore a restic snapshot into <staging_dir>/restore/<app_name>.
#
# Arguments:
#   $1 - app name
#   $2 - restic snapshot id
#   $3 - staging directory
# Outputs: ONLY the restore directory path on stdout (callers capture it with
#          $(...)); progress messages and restic's own output go to stderr.
# Returns: 0 on success, 1 if restic fails.
restore_from_snapshot() {
    local app_name="$1"
    local snapshot_id="$2"
    local staging_dir="$3"

    local restore_dir="$staging_dir/restore/$app_name"

    # Start from an empty directory so files left by an earlier aborted run
    # cannot be mistaken for this snapshot's contents. ${app_name:?} keeps an
    # empty name from widening the delete to the whole restore area.
    rm -rf -- "${staging_dir:?}/restore/${app_name:?}"
    mkdir -p "$restore_dir"

    echo "Restoring snapshot $snapshot_id to $restore_dir..." >&2
    if ! restic restore "$snapshot_id" --target "$restore_dir" >&2; then
        echo "Failed to restore snapshot $snapshot_id" >&2
        return 1
    fi

    echo "$restore_dir"
}

# --- Database Restore Functions ----------------------------------------------
# Restore an app's PostgreSQL database from a dump found under restore_dir.
# The target database is dropped and recreated before the dump is loaded.
#
# Arguments:
#   $1 - app name; also used as the database name and as the owning role name
#   $2 - directory holding the restored snapshot files
#   $3 - "true" to skip replaying the globals_*.sql file
# Returns: 0 on success; 1 when the postgres namespace cannot be queried, no
#          dump file is found, or pg_restore fails.
#
# NOTE(review): the app name is interpolated into SQL as an unquoted
# identifier, so names containing '-' presumably fail here — confirm the
# naming rules for apps that use postgres.
restore_postgres_database() {
    local app_name="$1"
    local restore_dir="$2"
    local skip_globals="$3"

    local pg_ns="postgres"
    local pg_deploy="postgres-deployment"
    local db_superuser="postgres"
    local db_name="$app_name"
    local db_role="$app_name"

    echo "Restoring PostgreSQL database '$db_name'..."

    # Check if postgres is available
    if ! kubectl get pods -n "$pg_ns" >/dev/null 2>&1; then
        echo "PostgreSQL namespace '$pg_ns' not accessible. Cannot restore database." >&2
        return 1
    fi

    # Find database dump file. -print -quit stops at the first match, which
    # avoids a `find | head` pipeline that can die of SIGPIPE under pipefail.
    local db_dump
    db_dump=$(find "$restore_dir" \( -name "database_*.dump" -o -name "*_db_*.dump" \) -print -quit)
    if [[ -z "$db_dump" ]]; then
        echo "No database dump found for '$app_name'" >&2
        return 1
    fi

    # Find globals file (optional)
    local globals_file
    globals_file=$(find "$restore_dir" -name "globals_*.sql" -print -quit)

    # Run a shell command line inside the postgres deployment.
    pg_exec() {
        kubectl exec -n "$pg_ns" deploy/"$pg_deploy" -- bash -lc "$*"
    }

    # Same as pg_exec, but forwards this process's stdin to the command.
    pg_exec_i() {
        kubectl exec -i -n "$pg_ns" deploy/"$pg_deploy" -- bash -lc "$*"
    }

    # Restore globals first if available and not skipped
    if [[ "$skip_globals" != "true" && -n "$globals_file" && -f "$globals_file" ]]; then
        echo "Restoring database globals..."
        pg_exec_i "psql -v ON_ERROR_STOP=1 -U ${db_superuser} -d postgres" < "$globals_file"
    fi

    # Ensure role exists
    pg_exec "psql -v ON_ERROR_STOP=1 -U ${db_superuser} -d postgres -c \"
        DO \$\$
        BEGIN
            IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='${db_role}') THEN
                CREATE ROLE ${db_role} LOGIN;
            END IF;
        END
        \$\$;\""

    # Terminate existing connections so DROP DATABASE is not blocked
    pg_exec "psql -v ON_ERROR_STOP=1 -U ${db_superuser} -d postgres -c \"
        SELECT pg_terminate_backend(pid)
        FROM pg_stat_activity
        WHERE datname='${db_name}' AND pid <> pg_backend_pid();\""

    # Drop and recreate database
    pg_exec "psql -v ON_ERROR_STOP=1 -U ${db_superuser} -d postgres -c \"
        DROP DATABASE IF EXISTS ${db_name};
        CREATE DATABASE ${db_name} OWNER ${db_role};\""

    # Restore database from dump. The archive is streamed over stdin, so
    # pg_restore must run single-threaded: -j/--jobs needs a seekable regular
    # file or directory and refuses standard input.
    echo "Restoring database from $db_dump..."
    if ! pg_exec_i "pg_restore -v -U ${db_superuser} --clean --if-exists --no-owner --role=${db_role} -d ${db_name}" < "$db_dump"; then
        echo "Database restore failed for '$app_name'" >&2
        return 1
    fi

    # Ensure proper ownership
    pg_exec "psql -v ON_ERROR_STOP=1 -U ${db_superuser} -d postgres -c \"ALTER DATABASE ${db_name} OWNER TO ${db_role};\""

    echo "Database restore completed for '$app_name'"
}

# Restore an app's MySQL database from a SQL dump found under restore_dir.
# The target database is dropped and recreated before the dump is loaded.
#
# Arguments:
#   $1 - app name; also used as the database name
#   $2 - directory holding the restored snapshot files
# Returns: 0 on success; 1 when the mysql namespace cannot be queried, no dump
#          is found, the root password cannot be read, or the import fails.
restore_mysql_database() {
    local app_name="$1"
    local restore_dir="$2"

    local mysql_ns="mysql"
    local mysql_deploy="mysql-deployment"
    local mysql_user="root"
    local db_name="$app_name"

    echo "Restoring MySQL database '$db_name'..."

    if ! kubectl get pods -n "$mysql_ns" >/dev/null 2>&1; then
        echo "MySQL namespace '$mysql_ns' not accessible. Cannot restore database." >&2
        return 1
    fi

    # Find database dump file; -print -quit stops at the first match.
    local db_dump
    db_dump=$(find "$restore_dir" \( -name "database_*.sql" -o -name "*_db_*.sql" \) -print -quit)
    if [[ -z "$db_dump" ]]; then
        echo "No database dump found for '$app_name'" >&2
        return 1
    fi

    # Get MySQL root password from secret. Fetch and decode as separate steps
    # so a failed or empty lookup is detected whether or not pipefail is on.
    local password_b64 mysql_password
    if ! password_b64=$(kubectl get secret -n "$mysql_ns" mysql-secret -o jsonpath='{.data.password}' 2>/dev/null) \
        || [[ -z "$password_b64" ]] \
        || ! mysql_password=$(printf '%s' "$password_b64" | base64 -d); then
        echo "Could not retrieve MySQL password. Cannot restore database." >&2
        return 1
    fi

    # The password travels as an environment variable and the user/SQL as
    # positional arguments, so nothing is spliced into the remote shell text:
    # a quote or '$' in the password can no longer break or alter the command.
    local recreate_sql="DROP DATABASE IF EXISTS \`${db_name}\`; CREATE DATABASE \`${db_name}\`;"

    # Drop and recreate database (name back-quoted so e.g. '-' is accepted)
    kubectl exec -n "$mysql_ns" deploy/"$mysql_deploy" -- \
        env "WILD_MYSQL_PWD=${mysql_password}" \
        bash -c 'exec mysql -u"$1" -p"$WILD_MYSQL_PWD" -e "$2"' _ "$mysql_user" "$recreate_sql"

    # Restore database from dump
    echo "Restoring database from $db_dump..."
    if ! kubectl exec -i -n "$mysql_ns" deploy/"$mysql_deploy" -- \
        env "WILD_MYSQL_PWD=${mysql_password}" \
        bash -c 'exec mysql -u"$1" -p"$WILD_MYSQL_PWD" "$2"' _ "$mysql_user" "$db_name" < "$db_dump"; then
        echo "Database restore failed for '$app_name'" >&2
        return 1
    fi

    echo "Database restore completed for '$app_name'"
}

# --- PVC Restore Functions ---------------------------------------------------
# Scale every Deployment labelled app=<app> in the app's namespace.
#
# Arguments:
#   $1 - app name (namespace and label value)
#   $2 - desired replica count; when greater than 0 the rollout is awaited
# Returns: 1 when no matching Deployment is found.
scale_app() {
    local app_name="$1"
    local replicas="$2"

    echo "Scaling app '$app_name' to $replicas replicas..."

    # Find deployments for this app and scale them
    local deployments
    deployments=$(kubectl get deploy -n "$app_name" -l "app=$app_name" -o name 2>/dev/null || true)

    if [[ -z "$deployments" ]]; then
        echo "No deployments found for app '$app_name'" >&2
        return 1
    fi

    # Keep the loop variable function-local so it cannot clobber a caller's
    # variable of the same name.
    local deploy
    # Word splitting is intentional: `-o name` prints one resource per line.
    for deploy in $deployments; do
        kubectl scale "$deploy" -n "$app_name" --replicas="$replicas"
        if [[ "$replicas" -gt 0 ]]; then
            kubectl rollout status "$deploy" -n "$app_name"
        fi
    done
}

# Replace the contents of one PVC with the copy found in a restored snapshot.
#
# Steps: resolve the backing Longhorn volume, take a Longhorn safety snapshot,
# scale the app to 0, mount the PVC in a temporary pod, wipe it, stream the
# backup in with tar, then scale the app back to 1 and drop the safety snapshot.
#
# Arguments:
#   $1 - app name (namespace and label value)
#   $2 - PVC name; a directory with the same name must exist under $3
#   $3 - directory holding the restored snapshot files
# Returns: 0 on success, 1 on failure. On failures after scale-down the app is
#          left at 0 replicas and the safety snapshot is kept for manual recovery.
#
# NOTE(review): the app is always scaled back to 1 replica, whatever count it
# had before the restore — confirm that is intended.
restore_app_pvc() {
    local app_name="$1"
    local pvc_name="$2"
    local restore_dir="$3"

    echo "Restoring PVC '$pvc_name' for app '$app_name'..."

    # Find the PVC backup directory in the restore directory
    local pvc_backup_dir
    pvc_backup_dir=$(find "$restore_dir" -type d -name "$pvc_name" -print -quit)

    if [[ -z "$pvc_backup_dir" || ! -d "$pvc_backup_dir" ]]; then
        echo "No backup directory found for PVC '$pvc_name'" >&2
        return 1
    fi

    # Get the Longhorn volume name for this PVC
    local pv_name
    pv_name=$(kubectl get pvc -n "$app_name" "$pvc_name" -o jsonpath='{.spec.volumeName}')
    if [[ -z "$pv_name" ]]; then
        echo "Could not find PersistentVolume for PVC '$pvc_name'" >&2
        return 1
    fi

    local longhorn_volume
    longhorn_volume=$(kubectl get pv "$pv_name" -o jsonpath='{.spec.csi.volumeHandle}' 2>/dev/null)
    if [[ -z "$longhorn_volume" ]]; then
        echo "Could not find Longhorn volume for PV '$pv_name'" >&2
        return 1
    fi

    # Pick the node for the utility pod before anything destructive happens.
    # A missing annotation makes kubectl print nothing yet exit 0, so the
    # fallback has to test for an empty result, not for a failed command.
    local target_node
    target_node=$(kubectl get pv "$pv_name" -o jsonpath='{.metadata.annotations.volume\.kubernetes\.io/selected-node}' 2>/dev/null || true)
    if [[ -z "$target_node" ]]; then
        target_node=$(kubectl get nodes -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
    fi
    if [[ -z "$target_node" ]]; then
        echo "Could not determine a node for the restore utility pod" >&2
        return 1
    fi

    # Create safety snapshot before destructive restore
    local safety_snapshot
    safety_snapshot="restore-safety-$(date +%s)"
    echo "Creating safety snapshot '$safety_snapshot' for volume '$longhorn_volume'..."

    # createSnapshot: true asks Longhorn to take a new snapshot; without it the
    # object only refers to a snapshot that is expected to exist already.
    kubectl apply -f - <<EOF
apiVersion: longhorn.io/v1beta2
kind: Snapshot
metadata:
  name: $safety_snapshot
  namespace: longhorn-system
  labels:
    app: wild-app-restore
    volume: $longhorn_volume
    pvc: $pvc_name
    original-app: $app_name
spec:
  volume: $longhorn_volume
  createSnapshot: true
EOF

    # Wait for snapshot to be ready (poll every 2s, up to snapshot_timeout seconds)
    echo "Waiting for safety snapshot to be ready..."
    local snapshot_timeout=60
    local elapsed=0
    local snapshot_ready
    while [[ $elapsed -lt $snapshot_timeout ]]; do
        snapshot_ready=$(kubectl get snapshot.longhorn.io -n longhorn-system "$safety_snapshot" -o jsonpath='{.status.readyToUse}' 2>/dev/null || echo "false")

        if [[ "$snapshot_ready" == "true" ]]; then
            echo "Safety snapshot created successfully"
            break
        fi

        sleep 2
        elapsed=$((elapsed + 2))
    done

    if [[ $elapsed -ge $snapshot_timeout ]]; then
        echo "Warning: Safety snapshot may not be ready, but proceeding with restore..."
    fi

    # Scale app down to avoid conflicts during restore
    scale_app "$app_name" 0

    # Wait for pods to terminate and PVC to be unmounted
    echo "Waiting for pods to terminate and PVC to be released..."
    sleep 10

    echo "Creating restore utility pod on node: $target_node"

    # Create temporary pod pinned to target_node with the PVC mounted.
    # fsGroup is a pod-level setting and is not valid inside a container's
    # securityContext; running as root is enough to write the volume.
    local temp_pod
    temp_pod="restore-util-$(date +%s)"
    kubectl apply -n "$app_name" -f - <<EOF
apiVersion: v1
kind: Pod
metadata:
  name: $temp_pod
  labels:
    app: restore-utility
spec:
  nodeSelector:
    kubernetes.io/hostname: $target_node
  containers:
  - name: restore-util
    image: alpine:latest
    command: ["/bin/sh", "-c", "sleep 3600"]
    volumeMounts:
    - name: data
      mountPath: /restore-target
    securityContext:
      runAsUser: 0
  volumes:
  - name: data
    persistentVolumeClaim:
      claimName: $pvc_name
  restartPolicy: Never
  tolerations:
  - operator: Exists
EOF

    # Wait for pod to be ready with longer timeout
    echo "Waiting for restore utility pod to be ready..."
    if ! kubectl wait --for=condition=Ready pod/"$temp_pod" -n "$app_name" --timeout=120s; then
        echo "Restore utility pod failed to start. Checking status..."
        kubectl describe pod -n "$app_name" "$temp_pod"
        kubectl delete pod -n "$app_name" "$temp_pod" --force --grace-period=0 || true
        echo "ERROR: Restore failed. Safety snapshot '$safety_snapshot' has been preserved for manual recovery." >&2
        echo "To recover from safety snapshot, use: kubectl get snapshot.longhorn.io -n longhorn-system $safety_snapshot" >&2
        return 1
    fi

    echo "Clearing existing PVC data..."
    # Delete everything below the mount point, dotfiles included, without a
    # '.*' glob (which also matches '.' and '..'). Still best effort.
    kubectl exec -n "$app_name" "$temp_pod" -- find /restore-target -mindepth 1 -delete 2>/dev/null \
        || echo "Warning: Could not fully clear existing PVC data; continuing." >&2

    echo "Copying backup data to PVC..."
    # Use tar to stream data into the pod, preserving permissions
    if ! tar -C "$pvc_backup_dir" -cf - . | kubectl exec -i -n "$app_name" "$temp_pod" -- tar -C /restore-target -xf -; then
        echo "Failed to copy data to PVC. Cleaning up..." >&2
        kubectl delete pod -n "$app_name" "$temp_pod" --force --grace-period=0 || true
        echo "ERROR: Restore failed. Safety snapshot '$safety_snapshot' has been preserved for manual recovery." >&2
        echo "To recover from safety snapshot, use: kubectl get snapshot.longhorn.io -n longhorn-system $safety_snapshot" >&2
        return 1
    fi

    echo "Verifying restored data..."
    kubectl exec -n "$app_name" "$temp_pod" -- sh -c "ls -la /restore-target | head -10"

    # Clean up temporary pod
    kubectl delete pod -n "$app_name" "$temp_pod"

    # Scale app back up
    scale_app "$app_name" 1

    # Clean up safety snapshot if restore was successful
    echo "Cleaning up safety snapshot '$safety_snapshot'..."
    if kubectl delete snapshot.longhorn.io -n longhorn-system "$safety_snapshot" 2>/dev/null; then
        echo "Safety snapshot cleaned up successfully"
    else
        echo "Warning: Could not clean up safety snapshot '$safety_snapshot'. You may need to delete it manually."
    fi

    echo "PVC '$pvc_name' restore completed successfully"
}

# --- Main Restore Function ---------------------------------------------------
# Restore one app from a restic snapshot: unpack the snapshot into staging,
# then restore its databases and/or PVCs according to the mode.
#
# Arguments:
#   $1 - app name
#   $2 - restic snapshot id
#   $3 - mode: "all", "db" (databases only) or "pvc" (PVCs only)
#   $4 - "true" to skip restoring postgres globals
#   $5 - staging directory
# Returns: 1 when the snapshot could not be unpacked into a directory.
restore_app() {
    local app_name="$1"
    local snapshot_id="$2"
    local mode="$3"
    local skip_globals="$4"
    local staging_dir="$5"

    echo "=========================================="
    echo "Starting restore of app: $app_name"
    echo "Snapshot: $snapshot_id"
    echo "Mode: $mode"
    echo "=========================================="

    # Restore snapshot to staging directory
    local restore_dir
    restore_dir=$(restore_from_snapshot "$app_name" "$snapshot_id" "$staging_dir")

    if [[ ! -d "$restore_dir" ]]; then
        echo "Failed to restore snapshot for '$app_name'" >&2
        return 1
    fi

    # Discover what components this app has
    local database_deps
    database_deps=$(discover_database_deps "$app_name")

    local pvcs
    pvcs=$(discover_app_pvcs "$app_name")

    # Loop variables are declared local so they do not leak into (or clobber)
    # the caller's scope.
    local db_type pvc

    # Restore database components
    if [[ "$mode" == "all" || "$mode" == "db" ]]; then
        # Word splitting is intentional: one dependency name per line.
        for db_type in $database_deps; do
            case "$db_type" in
                postgres)
                    restore_postgres_database "$app_name" "$restore_dir" "$skip_globals"
                    ;;
                mysql)
                    restore_mysql_database "$app_name" "$restore_dir"
                    ;;
                redis)
                    echo "Redis restore not implemented yet. Skipping."
                    ;;
            esac
        done
    fi

    # Restore PVC components
    if [[ "$mode" == "all" || "$mode" == "pvc" ]]; then
        # Word splitting is intentional: one PVC name per line.
        for pvc in $pvcs; do
            restore_app_pvc "$app_name" "$pvc" "$restore_dir"
        done
    fi

    # Clean up restore directory; ${...:?} aborts rather than expanding to an
    # empty path if the variable were ever blank.
    rm -rf -- "${restore_dir:?}"

    echo "=========================================="
    echo "Restore completed for app: $app_name"
    echo "=========================================="
}

# --- Main Script Logic -------------------------------------------------------
# Entry point: parse the command line, then list snapshots or run a restore.
#
# Arguments: <app-name> [snapshot-id] [--db-only|--pvc-only] [--skip-globals] [--list] [-h|--help]
# Exits 0 after help, listing, or a successful restore; exits 1 on usage
# errors, missing tools/configuration, or when no snapshot is found.
main() {
    # Arguments are parsed before any tool or configuration check so that
    # --help and usage errors work on a machine without kubectl/restic/config.
    if [[ $# -eq 0 || "$1" == "--help" || "$1" == "-h" ]]; then
        show_help
        exit 0
    fi

    local app_name="$1"
    shift

    # The first argument must be the app name, not an option such as --list.
    if [[ "$app_name" == -* ]]; then
        echo "Missing app name (got option '$app_name')." >&2
        show_help
        exit 1
    fi

    local snapshot_id=""
    local mode="all"
    local skip_globals="false"
    local list_snapshots="false"

    # Parse remaining arguments
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --db-only)
                mode="db"
                shift
                ;;
            --pvc-only)
                mode="pvc"
                shift
                ;;
            --skip-globals)
                skip_globals="true"
                shift
                ;;
            --list)
                list_snapshots="true"
                shift
                ;;
            -h|--help)
                show_help
                exit 0
                ;;
            -*)
                # A misspelled flag must not be mistaken for a snapshot id.
                echo "Unknown option: $1" >&2
                show_help
                exit 1
                ;;
            *)
                if [[ -z "$snapshot_id" ]]; then
                    snapshot_id="$1"
                else
                    echo "Unknown option: $1" >&2
                    show_help
                    exit 1
                fi
                shift
                ;;
        esac
    done

    # restic and jq are needed for both listing and restoring.
    require_restic
    if ! command -v jq >/dev/null 2>&1; then
        echo "jq not found. Required for parsing restic snapshot listings." >&2
        exit 1
    fi
    get_restic_config

    # List snapshots if requested (needs neither the cluster nor staging)
    if [[ "$list_snapshots" == "true" ]]; then
        list_app_snapshots "$app_name"
        exit 0
    fi

    require_k8s
    require_yq

    local staging_dir
    staging_dir=$(get_staging_dir)
    mkdir -p "$staging_dir/restore"

    # Get latest snapshot if none specified
    if [[ -z "$snapshot_id" ]]; then
        snapshot_id=$(get_latest_snapshot "$app_name")
        if [[ -z "$snapshot_id" ]]; then
            echo "No snapshots found for app '$app_name'" >&2
            exit 1
        fi
        echo "Using latest snapshot: $snapshot_id"
    fi

    # Perform the restore
    restore_app "$app_name" "$snapshot_id" "$mode" "$skip_globals" "$staging_dir"

    echo "Restore operation completed successfully."
}

# Script entry point: pass every command-line argument to main unchanged.
main "$@"