4 Commits

Author SHA1 Message Date
Paul Payne
61460b63a3 Adds more to wild-backup. 2025-08-31 15:02:35 -07:00
Paul Payne
9f302e0e29 Update backup/restore docs. 2025-08-31 15:00:00 -07:00
Paul Payne
f83763b070 Fix immich secret key ref. 2025-08-31 14:49:35 -07:00
Paul Payne
c9ab5a31d8 Discourse app fixes. 2025-08-31 14:46:42 -07:00
9 changed files with 761 additions and 51 deletions

View File

@@ -11,33 +11,21 @@ data:
DISCOURSE_SITE_NAME: "{{ .apps.discourse.siteName }}"
DISCOURSE_USERNAME: "{{ .apps.discourse.adminUsername }}"
DISCOURSE_EMAIL: "{{ .apps.discourse.adminEmail }}"
DISCOURSE_REDIS_HOST: "{{ .apps.discourse.redisHostname }}"
DISCOURSE_REDIS_PORT_NUMBER: "6379"
DISCOURSE_DATABASE_HOST: "{{ .apps.discourse.dbHostname }}"
DISCOURSE_DATABASE_PORT_NUMBER: "5432"
DISCOURSE_DATABASE_NAME: "{{ .apps.discourse.dbName }}"
DISCOURSE_DATABASE_USER: "{{ .apps.discourse.dbUsername }}"
# DISCOURSE_SMTP_ADDRESS: "{{ .apps.discourse.smtp.host }}"
# DISCOURSE_SMTP_PORT: "{{ .apps.discourse.smtp.port }}"
# DISCOURSE_SMTP_USER_NAME: "{{ .apps.discourse.smtp.user }}"
# DISCOURSE_SMTP_ENABLE_START_TLS: "{{ .apps.discourse.smtp.startTls }}"
# DISCOURSE_SMTP_AUTHENTICATION: "login"
# Bitnami specific environment variables (diverges from the original)
# https://techdocs.broadcom.com/us/en/vmware-tanzu/bitnami-secure-images/bitnami-secure-images/services/bsi-app-doc/apps-containers-discourse-index.html
DISCOURSE_SMTP_HOST: "{{ .apps.discourse.smtp.host }}"
DISCOURSE_SMTP_PORT_NUMBER: "{{ .apps.discourse.smtp.port }}"
DISCOURSE_SMTP_PORT: "{{ .apps.discourse.smtp.port }}"
DISCOURSE_SMTP_USER: "{{ .apps.discourse.smtp.user }}"
DISCOURSE_SMTP_ENABLE_START_TLS: "{{ .apps.discourse.smtp.startTls }}"
DISCOURSE_SMTP_AUTH: "login"
DISCOURSE_SMTP_PROTOCOL: "tls"
DISCOURSE_SMTP_AUTH: "login"
DISCOURSE_PRECOMPILE_ASSETS: "false"
# SMTP_HOST: "{{ .apps.discourse.smtp.host }}"
# SMTP_PORT: "{{ .apps.discourse.smtp.port }}"
# SMTP_USER_NAME: "{{ .apps.discourse.smtp.user }}"
# SMTP_TLS: "{{ .apps.discourse.smtp.tls }}"
# SMTP_ENABLE_START_TLS: "{{ .apps.discourse.smtp.startTls }}"
# SMTP_AUTHENTICATION: "login"
# DISCOURSE_PRECOMPILE_ASSETS: "false"
# DISCOURSE_SKIP_INSTALL: "no"
# DISCOURSE_SKIP_BOOTSTRAP: "yes"

View File

@@ -37,7 +37,7 @@ spec:
initContainers:
containers:
- name: discourse
image: { { .apps.discourse.image } }
image: docker.io/bitnami/discourse:3.4.7-debian-12-r0
imagePullPolicy: "IfNotPresent"
securityContext:
allowPrivilegeEscalation: false
@@ -85,7 +85,7 @@ spec:
valueFrom:
secretKeyRef:
name: discourse-secrets
key: apps.discourse.redisPassword
key: apps.redis.password
- name: DISCOURSE_SECRET_KEY_BASE
valueFrom:
secretKeyRef:
@@ -139,7 +139,7 @@ spec:
mountPath: /bitnami/discourse
subPath: discourse
- name: sidekiq
image: { { .apps.discourse.sidekiqImage } }
image: docker.io/bitnami/discourse:3.4.7-debian-12-r0
imagePullPolicy: "IfNotPresent"
securityContext:
allowPrivilegeEscalation: false
@@ -182,7 +182,7 @@ spec:
valueFrom:
secretKeyRef:
name: discourse-secrets
key: apps.discourse.redisPassword
key: apps.redis.password
- name: DISCOURSE_SECRET_KEY_BASE
valueFrom:
secretKeyRef:

View File

@@ -6,8 +6,6 @@ requires:
- name: postgres
- name: redis
defaultConfig:
image: docker.io/bitnami/discourse:3.4.7-debian-12-r0
sidekiqImage: docker.io/bitnami/discourse:3.4.7-debian-12-r0
timezone: UTC
port: 8080
storage: 10Gi
@@ -32,7 +30,7 @@ requiredSecrets:
- apps.discourse.adminPassword
- apps.discourse.dbPassword
- apps.discourse.dbUrl
- apps.discourse.redisPassword
- apps.redis.password
- apps.discourse.secretKeyBase
- apps.discourse.smtpPassword
- apps.postgres.password

View File

@@ -52,7 +52,7 @@ spec:
- name: POSTGRES_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: postgres-secrets
name: immich-secrets
key: apps.postgres.password
- name: DB_HOSTNAME
value: "{{ .apps.immich.dbHostname }}"

View File

@@ -4,6 +4,72 @@
set -e
set -o pipefail
# Parse command line flags
BACKUP_HOME=true
BACKUP_APPS=true
BACKUP_CLUSTER=true
show_help() {
echo "Usage: $0 [OPTIONS]"
echo "Backup components of your wild-cloud infrastructure"
echo ""
echo "Options:"
echo " --home-only Backup only WC_HOME (wild-cloud configuration)"
echo " --apps-only Backup only applications (databases and PVCs)"
echo " --cluster-only Backup only Kubernetes cluster resources"
echo " --no-home Skip WC_HOME backup"
echo " --no-apps Skip application backups"
echo " --no-cluster Skip cluster resource backup"
echo " -h, --help Show this help message"
echo ""
echo "Default: Backup all components (home, apps, cluster)"
}
# Process command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
--home-only)
BACKUP_HOME=true
BACKUP_APPS=false
BACKUP_CLUSTER=false
shift
;;
--apps-only)
BACKUP_HOME=false
BACKUP_APPS=true
BACKUP_CLUSTER=false
shift
;;
--cluster-only)
BACKUP_HOME=false
BACKUP_APPS=false
BACKUP_CLUSTER=true
shift
;;
--no-home)
BACKUP_HOME=false
shift
;;
--no-apps)
BACKUP_APPS=false
shift
;;
--no-cluster)
BACKUP_CLUSTER=false
shift
;;
-h|--help)
show_help
exit 0
;;
*)
echo "Unknown option: $1"
show_help
exit 1
;;
esac
done
# Initialize Wild Cloud environment
if [ -z "${WC_ROOT}" ]; then
echo "WC_ROOT is not set."
@@ -46,33 +112,134 @@ else
echo "Repository initialized successfully."
fi
# Backup entire WC_HOME.
restic --verbose --tag wild-cloud --tag wc-home --tag "$(date +%Y-%m-%d)" backup $WC_HOME
# TODO: Ignore wild cloud cache?
# Backup entire WC_HOME
if [ "$BACKUP_HOME" = true ]; then
echo "Backing up WC_HOME..."
restic --verbose --tag wild-cloud --tag wc-home --tag "$(date +%Y-%m-%d)" backup $WC_HOME
echo "WC_HOME backup completed."
# TODO: Ignore wild cloud cache?
else
echo "Skipping WC_HOME backup."
fi
mkdir -p "$STAGING_DIR"
# Run backup for all apps at once
echo "Running backup for all apps..."
wild-app-backup --all
if [ "$BACKUP_APPS" = true ]; then
echo "Running backup for all apps..."
wild-app-backup --all
# Upload each app's backup to restic individually
for app_dir in "$STAGING_DIR"/apps/*; do
if [ ! -d "$app_dir" ]; then
continue
# Upload each app's backup to restic individually
for app_dir in "$STAGING_DIR"/apps/*; do
if [ ! -d "$app_dir" ]; then
continue
fi
app="$(basename "$app_dir")"
echo "Uploading backup for app: $app"
restic --verbose --tag wild-cloud --tag "$app" --tag "$(date +%Y-%m-%d)" backup "$app_dir"
echo "Backup for app '$app' completed."
done
else
echo "Skipping application backups."
fi
# --- etcd Backup Function ----------------------------------------------------
backup_etcd() {
local cluster_backup_dir="$1"
local etcd_backup_file="$cluster_backup_dir/etcd-snapshot.db"
echo "Creating etcd snapshot..."
# For Talos, we use talosctl to create etcd snapshots
if command -v talosctl >/dev/null 2>&1; then
# Try to get etcd snapshot via talosctl (works for Talos clusters)
local control_plane_nodes
control_plane_nodes=$(kubectl get nodes -l node-role.kubernetes.io/control-plane -o jsonpath='{.items[*].status.addresses[?(@.type=="InternalIP")].address}' | tr ' ' '\n' | head -1)
if [[ -n "$control_plane_nodes" ]]; then
echo "Using talosctl to backup etcd from control plane node: $control_plane_nodes"
if talosctl --nodes "$control_plane_nodes" etcd snapshot "$etcd_backup_file"; then
echo " etcd backup created: $etcd_backup_file"
return 0
else
echo " talosctl etcd snapshot failed, trying alternative method..."
fi
else
echo " No control plane nodes found for talosctl method"
fi
fi
app="$(basename "$app_dir")"
echo "Uploading backup for app: $app"
restic --verbose --tag wild-cloud --tag "$app" --tag "$(date +%Y-%m-%d)" backup "$app_dir"
echo "Backup for app '$app' completed."
done
# Alternative: Try to backup via etcd pod if available
local etcd_pod
etcd_pod=$(kubectl get pods -n kube-system -l component=etcd -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
if [[ -n "$etcd_pod" ]]; then
echo "Using etcd pod: $etcd_pod"
# Create snapshot using etcdctl inside the etcd pod
if kubectl exec -n kube-system "$etcd_pod" -- etcdctl \
--endpoints=https://127.0.0.1:2379 \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
--cert=/etc/kubernetes/pki/etcd/server.crt \
--key=/etc/kubernetes/pki/etcd/server.key \
snapshot save /tmp/etcd-snapshot.db; then
# Copy snapshot out of pod
kubectl cp -n kube-system "$etcd_pod:/tmp/etcd-snapshot.db" "$etcd_backup_file"
# Clean up temporary file in pod
kubectl exec -n kube-system "$etcd_pod" -- rm -f /tmp/etcd-snapshot.db
echo " etcd backup created: $etcd_backup_file"
return 0
else
echo " etcd pod snapshot failed"
fi
else
echo " No etcd pod found in kube-system namespace"
fi
# Final fallback: Try direct etcdctl if available on local system
if command -v etcdctl >/dev/null 2>&1; then
echo "Attempting local etcdctl backup..."
# This would need proper certificates and endpoints configured
echo " Local etcdctl backup not implemented (requires certificate configuration)"
fi
echo " Warning: Could not create etcd backup - no working method found"
echo " Consider installing talosctl or ensuring etcd pods are accessible"
return 1
}
# Back up Kubernetes resources
# kubectl get all -A -o yaml > "$BACKUP_DIR/all-resources.yaml"
# kubectl get secrets -A -o yaml > "$BACKUP_DIR/secrets.yaml"
# kubectl get configmaps -A -o yaml > "$BACKUP_DIR/configmaps.yaml"
# Back up Kubernetes cluster resources
if [ "$BACKUP_CLUSTER" = true ]; then
echo "Backing up Kubernetes cluster resources..."
CLUSTER_BACKUP_DIR="$STAGING_DIR/cluster"
# Clean up any existing cluster backup files
if [[ -d "$CLUSTER_BACKUP_DIR" ]]; then
echo "Cleaning up existing cluster backup files..."
rm -rf "$CLUSTER_BACKUP_DIR"
fi
mkdir -p "$CLUSTER_BACKUP_DIR"
# Back up persistent volumes
# TODO: Add logic to back up persistent volume data
kubectl get all -A -o yaml > "$CLUSTER_BACKUP_DIR/all-resources.yaml"
kubectl get secrets -A -o yaml > "$CLUSTER_BACKUP_DIR/secrets.yaml"
kubectl get configmaps -A -o yaml > "$CLUSTER_BACKUP_DIR/configmaps.yaml"
kubectl get persistentvolumes -o yaml > "$CLUSTER_BACKUP_DIR/persistentvolumes.yaml"
kubectl get persistentvolumeclaims -A -o yaml > "$CLUSTER_BACKUP_DIR/persistentvolumeclaims.yaml"
kubectl get storageclasses -o yaml > "$CLUSTER_BACKUP_DIR/storageclasses.yaml"
echo "Backing up etcd..."
backup_etcd "$CLUSTER_BACKUP_DIR"
echo "Cluster resources backed up to $CLUSTER_BACKUP_DIR"
# Upload cluster backup to restic
echo "Uploading cluster backup to restic..."
restic --verbose --tag wild-cloud --tag cluster --tag "$(date +%Y-%m-%d)" backup "$CLUSTER_BACKUP_DIR"
echo "Cluster backup completed."
else
echo "Skipping cluster backup."
fi
echo "Backup completed: $BACKUP_DIR"

View File

@@ -4,7 +4,8 @@ Keep your wild cloud running smoothly.
- [Security Best Practices](./guides/security.md)
- [Monitoring](./guides/monitoring.md)
- [Backup and Restore](./guides/backup-and-restore.md)
- [Making backups](./guides/making-backups.md)
- [Restoring backups](./guides/restoring-backups.md)
## Upgrade

View File

@@ -1,3 +0,0 @@
# Backup and Restore
TBD

View File

@@ -0,0 +1,265 @@
# Making Backups
This guide covers how to create backups of your wild-cloud infrastructure using the integrated backup system.
## Overview
The wild-cloud backup system creates encrypted, deduplicated snapshots using restic. It backs up three main components:
- **Applications**: Database dumps and persistent volume data
- **Cluster**: Kubernetes resources and etcd state
- **Configuration**: Wild-cloud repository and settings
## Prerequisites
Before making backups, ensure you have:
1. **Environment configured**: Run `source env.sh` to load backup configuration
2. **Restic repository**: Backup repository configured in `config.yaml`
3. **Backup password**: Set in wild-cloud secrets
4. **Staging directory**: Configured path for temporary backup files
## Backup Components
### Applications (`wild-app-backup`)
Backs up individual applications including:
- **Database dumps**: PostgreSQL/MySQL databases in compressed custom format
- **PVC data**: Application files streamed directly for restic deduplication
- **Auto-discovery**: Finds databases and PVCs based on app manifest.yaml
### Cluster Resources (`wild-backup --cluster-only`)
Backs up cluster-wide resources:
- **Kubernetes resources**: All pods, services, deployments, secrets, configmaps
- **Storage definitions**: PersistentVolumes, PVCs, StorageClasses
- **etcd snapshot**: Complete cluster state for disaster recovery
### Configuration (`wild-backup --home-only`)
Backs up wild-cloud configuration:
- **Repository contents**: All app definitions, manifests, configurations
- **Settings**: Wild-cloud configuration files and customizations
## Making Backups
### Full System Backup (Recommended)
Create a complete backup of everything:
```bash
# Backup all components (apps + cluster + config)
wild-backup
```
This is equivalent to:
```bash
wild-backup --home --apps --cluster
```
### Selective Backups
#### Applications Only
```bash
# All applications
wild-backup --apps-only
# Single application
wild-app-backup discourse
# Multiple applications
wild-app-backup discourse gitea immich
```
#### Cluster Only
```bash
# Kubernetes resources + etcd
wild-backup --cluster-only
```
#### Configuration Only
```bash
# Wild-cloud repository
wild-backup --home-only
```
### Excluding Components
Skip specific components:
```bash
# Skip config, backup apps + cluster
wild-backup --no-home
# Skip applications, backup config + cluster
wild-backup --no-apps
# Skip cluster resources, backup config + apps
wild-backup --no-cluster
```
## Backup Process Details
### Application Backup Process
1. **Discovery**: Parses `manifest.yaml` to find database and PVC dependencies
2. **Database backup**: Creates compressed custom-format dumps
3. **PVC backup**: Streams files directly to staging for restic deduplication
4. **Staging**: Organizes files in clean directory structure
5. **Upload**: Creates individual restic snapshots per application
### Cluster Backup Process
1. **Resource export**: Exports all Kubernetes resources to YAML
2. **etcd snapshot**: Creates point-in-time etcd backup via talosctl
3. **Upload**: Creates single restic snapshot for cluster state
### Restic Snapshots
Each backup creates tagged restic snapshots:
```bash
# View all snapshots
restic snapshots
# Filter by component
restic snapshots --tag discourse # Specific app
restic snapshots --tag cluster # Cluster resources
restic snapshots --tag wc-home # Wild-cloud config
```
## Where Backup Files Are Staged
Before uploading to your restic repository, backup files are organized in a staging directory. This temporary area lets you see exactly what's being backed up and helps with deduplication.
Here's what the staging area looks like:
```
backup-staging/
├── apps/
│ ├── discourse/
│ │ ├── database_20250816T120000Z.dump
│ │ ├── globals_20250816T120000Z.sql
│ │ └── discourse/
│ │ └── data/ # All the actual files
│ ├── gitea/
│ │ ├── database_20250816T120000Z.dump
│ │ └── gitea-data/
│ │ └── data/ # Git repositories, etc.
│ └── immich/
│ ├── database_20250816T120000Z.dump
│ └── immich-data/
│ └── upload/ # Photos and videos
└── cluster/
├── all-resources.yaml # All running services
├── secrets.yaml # Passwords and certificates
├── configmaps.yaml # Configuration data
└── etcd-snapshot.db # Complete cluster state
```
This staging approach means you can examine backup contents before they're uploaded, and restic can efficiently deduplicate files that haven't changed.
## Advanced Usage
### Custom Backup Scripts
Applications can provide custom backup logic:
```bash
# Create apps/myapp/backup.sh for custom behavior
chmod +x apps/myapp/backup.sh
# wild-app-backup will use custom script if present
wild-app-backup myapp
```
### Monitoring Backup Status
```bash
# Check recent snapshots
restic snapshots | head -20
# Check specific app backups
restic snapshots --tag discourse
# Verify backup integrity
restic check
```
### Backup Automation
Set up automated backups with cron:
```bash
# Daily full backup at 2 AM
0 2 * * * cd /data/repos/payne-cloud && source env.sh && wild-backup
# Hourly app backups during business hours
0 9-17 * * * cd /data/repos/payne-cloud && source env.sh && wild-backup --apps-only
```
## Performance Considerations
### Large PVCs (like Immich photos)
The streaming backup approach provides:
- **First backup**: Full transfer time (all files processed)
- **Subsequent backups**: Only changed files processed (dramatically faster)
- **Storage efficiency**: Restic deduplication reduces storage usage
### Network Usage
- **Database dumps**: Compressed at source, efficient transfer
- **PVC data**: Uncompressed transfer, but restic handles deduplication
- **etcd snapshots**: Small files, minimal impact
## Troubleshooting
### Common Issues
**"No databases or PVCs found"**
- App has no `manifest.yaml` with database dependencies
- No PVCs with matching labels in app namespace
- Create custom `backup.sh` script for special cases
**"kubectl not found"**
- Ensure kubectl is installed and configured
- Check cluster connectivity with `kubectl get nodes`
**"Staging directory not set"**
- Configure `cloud.backup.staging` in `config.yaml`
- Ensure directory exists and is writable
**"Could not create etcd backup"**
- Ensure `talosctl` is installed for Talos clusters
- Check control plane node connectivity
- Verify etcd pods are accessible in kube-system namespace
### Backup Verification
Always verify backups periodically:
```bash
# Check restic repository integrity
restic check
# List recent snapshots
restic snapshots --compact
# Test restore to different directory
restic restore latest --target /tmp/restore-test
```
## Security Notes
- **Encryption**: All backups are encrypted with your backup password
- **Secrets**: Kubernetes secrets are included in cluster backups
- **Access control**: Secure your backup repository and passwords
- **Network**: Consider bandwidth usage for large initial backups
## Next Steps
- [Restoring Backups](restoring-backups.md) - Learn how to restore from backups
- Configure automated backup schedules
- Set up backup monitoring and alerting
- Test disaster recovery procedures

View File

@@ -0,0 +1,294 @@
# Restoring Backups
This guide will walk you through restoring your applications and cluster from wild-cloud backups. Hopefully you'll never need this, but when you do, it's critical that the process works smoothly.
## Understanding Restore Types
Your wild-cloud backup system can restore different types of data depending on what you need to recover:
**Application restores** bring back individual applications by restoring their database contents and file storage. This is what you'll use most often - maybe you accidentally deleted something in Discourse, or Gitea got corrupted, or you want to roll back Immich to before a bad update.
**Cluster restores** are for disaster recovery scenarios where you need to rebuild your entire Kubernetes cluster from scratch. This includes restoring all the cluster's configuration and even its internal state.
**Configuration restores** bring back your wild-cloud repository and settings, which contain all the "recipes" for how your infrastructure should be set up.
## Before You Start Restoring
Make sure you have everything needed to perform restores. You need to be in your wild-cloud directory with the environment loaded (`source env.sh`). Your backup repository and password should be configured and working - you can test this by running `restic snapshots` to see your available backups.
Most importantly, make sure you have kubectl access to your cluster, since restores involve creating temporary pods and manipulating storage.
## Restoring Applications
### Basic Application Restore
The most common restore scenario is bringing back a single application. To restore the latest backup of an app:
```bash
wild-app-restore discourse
```
This restores both the database and all file storage for the discourse app. The restore system automatically figures out what the app needs based on its manifest file and what was backed up.
If you want to restore from a specific backup instead of the latest:
```bash
wild-app-restore discourse abc123
```
Where `abc123` is the snapshot ID from `restic snapshots --tag discourse`.
### Partial Restores
Sometimes you only need to restore part of an application. Maybe the database is fine but the files got corrupted, or vice versa.
To restore only the database:
```bash
wild-app-restore discourse --db-only
```
To restore only the file storage:
```bash
wild-app-restore discourse --pvc-only
```
To restore without database roles and permissions (if they're causing conflicts):
```bash
wild-app-restore discourse --skip-globals
```
### Finding Available Backups
To see what backups are available for an app:
```bash
wild-app-restore discourse --list
```
This shows recent snapshots with their IDs, timestamps, and what was included.
## How Application Restores Work
Understanding what happens during a restore can help when things don't go as expected.
### Database Restoration
When restoring a database, the system first downloads the backup files from your restic repository. It then prepares the database by creating any needed roles, disconnecting existing users, and dropping/recreating the database to ensure a clean restore.
For PostgreSQL databases, it uses `pg_restore` with parallel processing to speed up large database imports. For MySQL, it uses standard mysql import commands. The system also handles database ownership and permissions automatically.
### File Storage Restoration
File storage (PVC) restoration is more complex because it involves safely replacing files that might be actively used by running applications.
First, the system creates a safety snapshot using Longhorn. This means if something goes wrong during the restore, you can get back to where you started. Then it scales your application down to zero replicas so no pods are using the storage.
Next, it creates a temporary utility pod with the PVC mounted and copies all the backup files into place, preserving file permissions and structure. Once the data is restored and verified, it removes the utility pod and scales your application back up.
If everything worked correctly, the safety snapshot is automatically deleted. If something went wrong, the safety snapshot is preserved so you can recover manually.
## Cluster Disaster Recovery
Cluster restoration is much less common but critical when you need to rebuild your entire infrastructure.
### Restoring Kubernetes Resources
To restore all cluster resources from a backup:
```bash
# Download cluster backup
restic restore --tag cluster latest --target ./restore/
# Apply all resources
kubectl apply -f restore/cluster/all-resources.yaml
```
You can also restore specific types of resources:
```bash
kubectl apply -f restore/cluster/secrets.yaml
kubectl apply -f restore/cluster/configmaps.yaml
```
### Restoring etcd State
**Warning: This is extremely dangerous and will affect your entire cluster.**
etcd restoration should only be done when rebuilding a cluster from scratch. For Talos clusters:
```bash
talosctl --nodes <control-plane-ip> etcd restore --from ./restore/cluster/etcd-snapshot.db
```
This command stops etcd, replaces its data with the backup, and restarts the cluster. Expect significant downtime while the cluster rebuilds itself.
## Common Disaster Recovery Scenarios
### Complete Application Loss
When an entire application is gone (namespace deleted, pods corrupted, etc.):
```bash
# Make sure the namespace exists
kubectl create namespace discourse --dry-run=client -o yaml | kubectl apply -f -
# Apply the application manifests if needed
kubectl apply -f apps/discourse/
# Restore the application data
wild-app-restore discourse
```
### Complete Cluster Rebuild
When rebuilding a cluster from scratch:
First, build your new cluster infrastructure and install wild-cloud components. Then configure backup access so you can reach your backup repository.
Restore cluster state:
```bash
restic restore --tag cluster latest --target ./restore/
# Apply etcd snapshot using appropriate method for your cluster type
```
Finally, restore all applications:
```bash
# See what applications are backed up
wild-app-restore --list
# Restore each application individually
wild-app-restore discourse
wild-app-restore gitea
wild-app-restore immich
```
### Rolling Back After Bad Changes
Sometimes you need to undo recent changes to an application:
```bash
# See available snapshots
wild-app-restore discourse --list
# Restore from before the problematic changes
wild-app-restore discourse abc123
```
## Cross-Cluster Migration
You can use backups to move applications between clusters:
On the source cluster, create a fresh backup:
```bash
wild-app-backup discourse
```
On the target cluster, deploy the application manifests:
```bash
kubectl apply -f apps/discourse/
```
Then restore the data:
```bash
wild-app-restore discourse
```
## Verifying Successful Restores
After any restore, verify that everything is working correctly.
For databases, check that you can connect and see expected data:
```bash
kubectl exec -n postgres deploy/postgres-deployment -- \
psql -U postgres -d discourse -c "SELECT count(*) FROM posts;"
```
For file storage, check that files exist and applications can start:
```bash
kubectl get pods -n discourse
kubectl logs -n discourse deployment/discourse
```
For web applications, test that you can access them:
```bash
curl -f https://discourse.example.com/latest.json
```
## When Things Go Wrong
### No Snapshots Found
If the restore system can't find backups for an application, check that snapshots exist:
```bash
restic snapshots --tag discourse
```
Make sure you're using the correct app name and that backups were actually created successfully.
### Database Restore Failures
Database restores can fail if the target database isn't accessible or if there are permission issues. Check that your postgres or mysql pods are running and that you can connect to them manually.
Review the restore error messages carefully - they usually indicate whether the problem is with the backup file, database connectivity, or permissions.
### PVC Restore Failures
If PVC restoration fails, check that you have sufficient disk space and that the PVC isn't being used by other pods. The error messages will usually indicate what went wrong.
Most importantly, remember that safety snapshots are preserved when PVC restores fail. You can see them with:
```bash
kubectl get snapshot.longhorn.io -n longhorn-system -l app=wild-app-restore
```
These snapshots let you recover to the pre-restore state if needed.
### Application Won't Start After Restore
If pods fail to start after restoration, check file permissions and ownership. Sometimes the restoration process doesn't perfectly preserve the exact permissions that the application expects.
You can also try scaling the application to zero and back to one, which sometimes resolves transient issues:
```bash
kubectl scale deployment/discourse -n discourse --replicas=0
kubectl scale deployment/discourse -n discourse --replicas=1
```
## Manual Recovery
When automated restore fails, you can always fall back to manual extraction and restoration:
```bash
# Extract backup files to local directory
restic restore --tag discourse latest --target ./manual-restore/
# Manually copy database dump to postgres pod
kubectl cp ./manual-restore/discourse/database_*.dump \
postgres/postgres-deployment-xxx:/tmp/
# Manually restore database
kubectl exec -n postgres deploy/postgres-deployment -- \
pg_restore -U postgres -d discourse /tmp/database_*.dump
```
For file restoration, you'd need to create a utility pod and manually copy files into the PVC.
## Best Practices
Test your restore procedures regularly in a non-production environment. It's much better to discover issues with your backup system during a planned test than during an actual emergency.
Always communicate with users before performing restores, especially if they involve downtime. Document any manual steps you had to take so you can improve the automated process.
After any significant restore, monitor your applications more closely than usual for a few days. Sometimes problems don't surface immediately.
## Security and Access Control
Restore operations are powerful and can be destructive. Make sure only trusted administrators can perform restores, and consider requiring approval or coordination before major restoration operations.
Be aware that cluster restores include all secrets, so they potentially expose passwords, API keys, and certificates. Ensure your backup repository is properly secured.
Remember that Longhorn safety snapshots are preserved when things go wrong. These snapshots may contain sensitive data, so clean them up appropriately once you've resolved any issues.
## What's Next
The best way to get comfortable with restore operations is to practice them in a safe environment. Set up a test cluster and practice restoring applications and data.
Consider creating runbooks for your most likely disaster scenarios, including the specific commands and verification steps for your infrastructure.
Read the [Making Backups](making-backups.md) guide to ensure you're creating the backups you'll need for successful recovery.