Compare commits

10 commits: c7b29e5954...343f33173a

| SHA1 |
|---|
| 343f33173a |
| e56d981d74 |
| f94f80cd2a |
| d06e27931c |
| 748ae1a70b |
| 482cebc603 |
| fe6857e888 |
| 395bdff2a6 |
| d21eb18dc9 |
| ecdb2f2916 |
@@ -7,6 +7,7 @@ containo
controlplane
coredns
crds
direnv
dnsmasq
envsubst
externaldns
@@ -19,6 +20,7 @@ ipxe
Jellyfin
keepalives
KUBECONFIG
kubelet
kubernetescrd
kustomization
letsencrypt
@@ -39,9 +41,11 @@ pgvector
rcode
restic
SAMEORIGIN
talosconfig
talosctl
TALOSCTL
traefik
urandom
USEPATH
vxlan
websecure
@@ -220,8 +220,7 @@ This approach prevents naming conflicts between apps and makes secret keys more

Apps in Wild Cloud are managed by operators using a set of commands run from their Wild Cloud home directory.

- `wild-apps-list`: Lists all available apps.
- `wild-app-fetch <app-name>`: Fetches the latest app files from the Wild Cloud repository and stores them in your Wild Cloud cache.
- `wild-app-add <app-name>`: Adds the app manifest to your Wild Cloud home `apps` directory, updates missing values in `config.yaml` and `secrets.yaml` with the app's default configurations, and compiles the app's Kustomize files.
- `wild-app-add <app-name>`: Reads the app from the Wild Cloud repository, adds the app manifest to your Wild Cloud home `apps` directory, updates missing values in `config.yaml` and `secrets.yaml` with the app's default configurations, and compiles the app's Kustomize files.
- `wild-app-deploy <app-name>`: Deploys the app to your Wild Cloud.
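The list above describes the operator workflow end to end. A minimal sketch of that flow, assuming a configured Wild Cloud home directory and using `immich` only as a placeholder app name:

```bash
# Run from your Wild Cloud home directory.
wild-apps-list          # see which apps are available
wild-app-add immich     # copy the manifest, fill defaults into config.yaml/secrets.yaml, compile Kustomize files
# Review config.yaml and secrets.yaml, then deploy:
wild-app-deploy immich
```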
## Contributing
@@ -5,8 +5,7 @@ metadata:
  namespace: gitea
  annotations:
    external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
    external-dns.alpha.kubernetes.io/target: "{{ .apps.gitea.domain }}"
    external-dns.alpha.kubernetes.io/target: "{{ .cluster.externalDns.target}}"
    external-dns.alpha.kubernetes.io/target: "{{ .cloud.domain }}"
spec:
  rules:
    - host: "{{ .apps.gitea.domain }}"
@@ -4,7 +4,7 @@ kind: Ingress
metadata:
  name: immich-public
  annotations:
    external-dns.alpha.kubernetes.io/target: "{{ .apps.immich.domain }}"
    external-dns.alpha.kubernetes.io/target: "{{ .cloud.domain }}"
    external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
spec:
  rules:
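Both ingress diffs above appear to converge the external-dns target annotation on a single cluster-wide value (`.cloud.domain`) instead of per-app domains. A small sketch of how one might eyeball the rendered result; the manifest path here is hypothetical, and it assumes `wild-compile-template` reads a template on stdin as it does elsewhere in this change set:

```bash
# Hypothetical path; adjust to wherever the app's ingress template lives in your Wild Cloud home.
wild-compile-template < apps/gitea/ingress.yaml \
  | yq '.metadata.annotations["external-dns.alpha.kubernetes.io/target"]'
```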
@@ -8,23 +8,23 @@ UPDATE=false
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--update)
--force)
UPDATE=true
shift
;;
-h|--help)
echo "Usage: $0 <app_name> [--update]"
echo "Usage: $0 <app_name> [--force]"
echo ""
echo "Configure an app by applying templates and merging configuration."
echo ""
echo "Options:"
echo " --update Overwrite existing app files without confirmation"
echo " --force Overwrite existing app files without confirmation"
echo " -h, --help Show this help message"
exit 0
;;
-*)
echo "Unknown option $1"
echo "Usage: $0 <app_name> [--update]"
echo "Usage: $0 <app_name> [--force]"
exit 1
;;
*)
@@ -32,7 +32,7 @@ while [[ $# -gt 0 ]]; do
APP_NAME="$1"
else
echo "Too many arguments"
echo "Usage: $0 <app_name> [--update]"
echo "Usage: $0 <app_name> [--force]"
exit 1
fi
shift
@@ -41,7 +41,7 @@ while [[ $# -gt 0 ]]; do
done

if [ -z "${APP_NAME}" ]; then
echo "Usage: $0 <app_name> [--update]"
echo "Usage: $0 <app_name> [--force]"
exit 1
fi

@@ -71,21 +71,14 @@ if [ ! -f "${SECRETS_FILE}" ]; then
echo "" >> "${SECRETS_FILE}"
fi

# Check if app is cached, if not fetch it first
CACHE_APP_DIR="${WC_HOME}/.wildcloud/cache/apps/${APP_NAME}"
if [ ! -d "${CACHE_APP_DIR}" ]; then
echo "Cache directory for app '${APP_NAME}' not found at '${CACHE_APP_DIR}'."
echo "Please fetch the app first using 'wild-app-fetch ${APP_NAME}'."
# Check if app exists in repository
SOURCE_APP_DIR="${WC_ROOT}/apps/${APP_NAME}"
if [ ! -d "${SOURCE_APP_DIR}" ]; then
echo "Error: App '${APP_NAME}' not found at ${SOURCE_APP_DIR}"
echo "Available apps:"
ls -1 "${WC_ROOT}/apps" | grep -v README.md | sed 's/^/ - /'
exit 1
fi
if [ ! -d "${CACHE_APP_DIR}" ]; then
echo "App '${APP_NAME}' not found in cache, fetching..."
if [ "${UPDATE}" = true ]; then
./bin/wild-app-fetch "${APP_NAME}" --update
else
./bin/wild-app-fetch "${APP_NAME}"
fi
fi

APPS_DIR="${WC_HOME}/apps"
if [ ! -d "${APPS_DIR}" ]; then
@@ -113,12 +106,12 @@ else
fi
mkdir -p "${DEST_APP_DIR}"

# Step 1: Copy only manifest.yaml from cache first
MANIFEST_FILE="${CACHE_APP_DIR}/manifest.yaml"
# Step 1: Copy manifest.yaml from repository first
MANIFEST_FILE="${SOURCE_APP_DIR}/manifest.yaml"
if [ -f "${MANIFEST_FILE}" ]; then
# manifest.yaml is allowed to have gomplate variables in the defaultConfig and requiredSecrets sections.
# We need to use gomplate to process these variables before using yq.
echo "Copying app manifest from cache."
echo "Processing app manifest."
DEST_MANIFEST="${DEST_APP_DIR}/manifest.yaml"
if [ -f "${SECRETS_FILE}" ]; then
gomplate_cmd="gomplate -c .=${CONFIG_FILE} -c secrets=${SECRETS_FILE} -f ${MANIFEST_FILE} -o ${DEST_MANIFEST}"
@@ -130,7 +123,7 @@ if [ -f "${MANIFEST_FILE}" ]; then
exit 1
fi
else
echo "Warning: App manifest not found in cache."
echo "Error: App manifest not found at ${MANIFEST_FILE}"
exit 1
fi

@@ -185,10 +178,10 @@ if yq eval '.requiredSecrets' "${DEST_MANIFEST}" | grep -q -v '^null$'; then
echo "Required secrets declared in app manifest added to '${SECRETS_FILE}'."
fi

# Step 3: Copy and compile all other files from cache to app directory
echo "Copying and compiling remaining files from cache."
# Step 3: Copy and compile all files from repository to app directory
echo "Copying and compiling app files."

cp -r "${CACHE_APP_DIR}/." "${DEST_APP_DIR}/"
cp -r "${SOURCE_APP_DIR}/." "${DEST_APP_DIR}/"
find "${DEST_APP_DIR}" -type f | while read -r dest_file; do
rel_path="${dest_file#${DEST_APP_DIR}/}"
@@ -1,109 +0,0 @@
#!/bin/bash

set -e
set -o pipefail

UPDATE=false

# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--update)
UPDATE=true
shift
;;
-h|--help)
echo "Usage: $0 <app_name> [--update]"
echo ""
echo "Fetch an app template from the Wild Cloud repository to cache."
echo ""
echo "Options:"
echo " --update Overwrite existing cached files without confirmation"
echo " -h, --help Show this help message"
exit 0
;;
-*)
echo "Unknown option $1"
echo "Usage: $0 <app_name> [--update]"
exit 1
;;
*)
if [ -z "${APP_NAME}" ]; then
APP_NAME="$1"
else
echo "Too many arguments"
echo "Usage: $0 <app_name> [--update]"
exit 1
fi
shift
;;
esac
done

if [ -z "${APP_NAME}" ]; then
echo "Usage: $0 <app_name> [--update]"
exit 1
fi

# Initialize Wild Cloud environment
if [ -z "${WC_ROOT}" ]; then
echo "WC_ROOT is not set."
exit 1
else
source "${WC_ROOT}/scripts/common.sh"
init_wild_env
fi

SOURCE_APP_DIR="${WC_ROOT}/apps/${APP_NAME}"
if [ ! -d "${SOURCE_APP_DIR}" ]; then
echo "Error: App '${APP_NAME}' not found at ${SOURCE_APP_DIR}"
exit 1
fi

CACHE_APP_DIR=".wildcloud/cache/apps/${APP_NAME}"
mkdir -p ".wildcloud/cache/apps"

if [ -d "${CACHE_APP_DIR}" ]; then
if [ "${UPDATE}" = true ]; then
echo "Updating cached app '${APP_NAME}'"
rm -rf "${CACHE_APP_DIR}"
else
echo "Warning: Cache directory ${CACHE_APP_DIR} already exists"
read -p "Do you want to overwrite it? (y/N): " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
echo "Fetch cancelled"
exit 1
fi
rm -rf "${CACHE_APP_DIR}"
fi
fi

echo "Fetching app '${APP_NAME}' from ${SOURCE_APP_DIR} to ${CACHE_APP_DIR}"

# Create destination directory
mkdir -p "${CACHE_APP_DIR}"

# Copy directory structure and files (no template processing)
find "${SOURCE_APP_DIR}" -type d | while read -r src_dir; do
rel_path="${src_dir#${SOURCE_APP_DIR}}"
rel_path="${rel_path#/}" # Remove leading slash if present
if [ -n "${rel_path}" ]; then
mkdir -p "${CACHE_APP_DIR}/${rel_path}"
fi
done

find "${SOURCE_APP_DIR}" -type f | while read -r src_file; do
rel_path="${src_file#${SOURCE_APP_DIR}}"
rel_path="${rel_path#/}" # Remove leading slash if present
dest_file="${CACHE_APP_DIR}/${rel_path}"

# Ensure destination directory exists
dest_dir=$(dirname "${dest_file}")
mkdir -p "${dest_dir}"

# Simple copy without template processing
cp "${src_file}" "${dest_file}"
done

echo "Successfully fetched app '${APP_NAME}' to cache"
@@ -186,8 +186,7 @@ elif [ "${OUTPUT_FORMAT}" = "table" ]; then
echo "Total installable apps: ${app_count}"
echo ""
echo "Usage:"
echo " wild-app-fetch <app> # Fetch app template to cache"
echo " wild-app-config <app> # Configure app with your settings"
echo " wild-app-add <app> # Configure app with your settings"
echo " wild-app-deploy <app> # Deploy app to Kubernetes"
fi
@@ -62,7 +62,7 @@ prompt_if_unset_config "cluster.nodes.talos.version" "Talos version" "v1.11.0"
TALOS_VERSION=$(wild-config "cluster.nodes.talos.version")

# Talos schematic ID
prompt_if_unset_config "cluster.nodes.talos.schematicId" "Talos schematic ID" "56774e0894c8a3a3a9834a2aea65f24163cacf9506abbcbdc3ba135eaca4953f"
prompt_if_unset_config "cluster.nodes.talos.schematicId" "Talos schematic ID" "434a0300db532066f1098e05ac068159371d00f0aba0a3103a0e826e83825c82"
SCHEMATIC_ID=$(wild-config "cluster.nodes.talos.schematicId")

print_info "Creating custom Talos installer image..."
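For orientation, the schematic ID is what distinguishes a stock Talos image from a customized one. A hedged sketch of how the ID and version typically combine into a Talos Image Factory installer reference; the exact image construction is not shown in this hunk:

```bash
# Illustration only: standard Talos Image Factory installer naming.
SCHEMATIC_ID=$(wild-config "cluster.nodes.talos.schematicId")
TALOS_VERSION=$(wild-config "cluster.nodes.talos.version")
echo "factory.talos.dev/installer/${SCHEMATIC_ID}:${TALOS_VERSION}"
```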
@@ -1,170 +0,0 @@
#!/bin/bash

set -e
set -o pipefail

# Usage function
usage() {
echo "Usage: wild-cluster-node-patch-generate <node-name>"
echo ""
echo "Generate Talos machine configuration patches for a specific registered node."
echo ""
echo "Arguments:"
echo " node-name Name of the registered node"
echo ""
echo "Options:"
echo " -h, --help Show this help message"
echo ""
echo "Examples:"
echo " wild-cluster-node-patch-generate control-1"
echo " wild-cluster-node-patch-generate worker-1"
echo ""
echo "This script will:"
echo " - Compile patch templates for the specified node"
echo " - Generate node-specific patch files in WC_HOME/setup/cluster-nodes/patch/"
echo " - Use hardware details from the node registration"
echo ""
echo "Requirements:"
echo " - Must be run from a wild-cloud directory"
echo " - Node must be registered (hardware detected) first"
echo " - Basic cluster configuration must be completed"
echo " - Patch templates must exist in WC_ROOT/setup/cluster-nodes/"
}

# Parse arguments
NODE_NAME=""
while [[ $# -gt 0 ]]; do
case $1 in
-h|--help)
usage
exit 0
;;
-*)
echo "Unknown option $1"
usage
exit 1
;;
*)
if [ -z "$NODE_NAME" ]; then
NODE_NAME="$1"
else
echo "Unexpected argument: $1"
usage
exit 1
fi
shift
;;
esac
done

# Check if node name was provided
if [ -z "$NODE_NAME" ]; then
echo "Error: Node name is required"
usage
exit 1
fi

# Initialize Wild Cloud environment
if [ -z "${WC_ROOT}" ]; then
print "WC_ROOT is not set."
exit 1
else
source "${WC_ROOT}/scripts/common.sh"
init_wild_env
fi

prompt_if_unset_config "cluster.name" "Cluster name" "local.example.com"

# Function to ensure required directories exist in WC_HOME
ensure_required_directories() {
# Create output directories in WC_HOME for patch configs
mkdir -p "${WC_HOME}/setup/cluster-nodes/patch"
}

# =============================================================================
# PATCH GENERATION
# =============================================================================

print_header "Talos Machine Config Patch Generation"

# Ensure required directories exist in WC_HOME
ensure_required_directories

# Define directories
TEMPLATE_SOURCE_DIR="${WC_ROOT}/setup/cluster-nodes"
NODE_SETUP_DIR="${WC_HOME}/setup/cluster-nodes"

# Check if cluster has been initialized
if [ ! -f "${NODE_SETUP_DIR}/generated/secrets.yaml" ]; then
print_error "Cluster not initialized. Base cluster configuration is required."
print_info "Run 'wild-cluster-config-generate' first to generate cluster secrets and base configs"
exit 1
fi

# Get cluster configuration from config.yaml
CLUSTER_NAME=$(wild-config cluster.name)

print_info "Generating patch for node: $NODE_NAME"
print_info "Cluster: $CLUSTER_NAME"

# Check if the specified node is registered
NODE_INTERFACE=$(yq eval ".cluster.nodes.active.\"${NODE_NAME}\".interface" "${WC_HOME}/config.yaml" 2>/dev/null)
NODE_DISK=$(yq eval ".cluster.nodes.active.\"${NODE_NAME}\".disk" "${WC_HOME}/config.yaml" 2>/dev/null)
NODE_ROLE=$(yq eval ".cluster.nodes.active.\"${NODE_NAME}\".role" "${WC_HOME}/config.yaml" 2>/dev/null)
NODE_CURRENT_IP=$(yq eval ".cluster.nodes.active.\"${NODE_NAME}\".currentIp" "${WC_HOME}/config.yaml" 2>/dev/null)

if [ -z "$NODE_INTERFACE" ] || [ "$NODE_INTERFACE" = "null" ]; then
print_error "Node $NODE_NAME is not registered in config.yaml"
print_info "Please register the node first by running node hardware detection"
print_info "Or run 'wild-setup-cluster' to register nodes interactively"
exit 1
fi

# Get current IP for the node
if [ -z "$NODE_CURRENT_IP" ] || [ "$NODE_CURRENT_IP" = "null" ]; then
print_error "Node $NODE_NAME has no current IP address set"
exit 1
fi

# Determine node type
if [ "$NODE_ROLE" = "controlplane" ]; then
NODE_TYPE="control"
print_success "Registered control plane node: $NODE_NAME"
else
NODE_TYPE="worker"
print_success "Registered worker node: $NODE_NAME"
fi

print_info "Node details:"
print_info " - Name: $NODE_NAME"
print_info " - Current IP: $NODE_CURRENT_IP"
print_info " - Interface: $NODE_INTERFACE"
print_info " - Disk: $NODE_DISK"
print_info " - Type: $NODE_TYPE"

# Compile patch template for the specified node
print_info "Compiling patch template for $NODE_TYPE node $NODE_NAME..."

if [ "$NODE_TYPE" = "control" ]; then
TEMPLATE_FILE="${TEMPLATE_SOURCE_DIR}/patch.templates/controlplane.yaml"
else
TEMPLATE_FILE="${TEMPLATE_SOURCE_DIR}/patch.templates/worker.yaml"
fi

# Use node name as the patch name
PATCH_FILE="${NODE_SETUP_DIR}/patch/${NODE_NAME}.yaml"

# Create a temporary template with the node name and IP for gomplate processing
TEMP_TEMPLATE="/tmp/${NODE_NAME//\//_}-$(date +%s).yaml"
sed -e "s/{{NODE_NAME}}/${NODE_NAME}/g" -e "s/{{NODE_IP}}/${NODE_CURRENT_IP}/g" "$TEMPLATE_FILE" > "$TEMP_TEMPLATE"
cat "$TEMP_TEMPLATE" | wild-compile-template > "$PATCH_FILE"
rm -f "$TEMP_TEMPLATE"

print_success "Patch generated successfully!"
echo ""
print_info "Generated patch file:"
print_info " - $PATCH_FILE"
echo ""
print_info "Template used: ${TEMPLATE_FILE}"

print_success "Patch generation completed!"
@@ -1,267 +0,0 @@
#!/bin/bash

set -e
set -o pipefail

# Usage function
usage() {
echo "Usage: wild-cluster-node-up <node-name> [options]"
echo ""
echo "Apply Talos machine configuration to a registered node."
echo ""
echo "Arguments:"
echo " node-name Name of the registered node"
echo ""
echo "Options:"
echo " -i, --insecure Apply configuration in insecure mode (for maintenance mode nodes)"
echo " --force Force regeneration of final config even if it exists"
echo " --dry-run Show the command that would be executed without running it"
echo " -h, --help Show this help message"
echo ""
echo "Examples:"
echo " wild-cluster-node-up control-1"
echo " wild-cluster-node-up worker-1 --insecure"
echo " wild-cluster-node-up worker-2 --skip-patch"
echo " wild-cluster-node-up control-2 --force"
echo " wild-cluster-node-up control-1 --dry-run"
echo ""
echo "This script will:"
echo " - Verify the node is registered in config.yaml"
echo " - Generate final machine configuration if needed"
echo " - Apply the configuration using talosctl apply-config"
echo " - Use insecure mode for nodes in maintenance mode"
echo ""
echo "Requirements:"
echo " - Must be run from a wild-cloud directory"
echo " - Node must be registered (hardware detected) first"
echo " - Base cluster configuration and patch file must exist for the node"
}

# Parse arguments
NODE_NAME=""
INSECURE_MODE=false
DRY_RUN=false
SKIP_PATCH=false
FORCE_REGENERATE=false

while [[ $# -gt 0 ]]; do
case $1 in
-i|--insecure)
INSECURE_MODE=true
shift
;;
--force)
FORCE_REGENERATE=true
shift
;;
--dry-run)
DRY_RUN=true
shift
;;
-h|--help)
usage
exit 0
;;
-*)
echo "Unknown option $1"
usage
exit 1
;;
*)
if [ -z "$NODE_NAME" ]; then
NODE_NAME="$1"
else
echo "Unexpected argument: $1"
usage
exit 1
fi
shift
;;
esac
done

# Check if node name was provided
if [ -z "$NODE_NAME" ]; then
echo "Error: Node name is required"
usage
exit 1
fi

# Initialize Wild Cloud environment
if [ -z "${WC_ROOT}" ]; then
print "WC_ROOT is not set."
exit 1
else
source "${WC_ROOT}/scripts/common.sh"
init_wild_env
fi

print_header "Talos node configuration"

# Check if the specified node is registered
NODE_INTERFACE=$(yq eval ".cluster.nodes.active.\"${NODE_NAME}\".interface" "${WC_HOME}/config.yaml" 2>/dev/null)
NODE_DISK=$(yq eval ".cluster.nodes.active.\"${NODE_NAME}\".disk" "${WC_HOME}/config.yaml" 2>/dev/null)
NODE_ROLE=$(yq eval ".cluster.nodes.active.\"${NODE_NAME}\".role" "${WC_HOME}/config.yaml" 2>/dev/null)
NODE_CURRENT_IP=$(yq eval ".cluster.nodes.active.\"${NODE_NAME}\".currentIp" "${WC_HOME}/config.yaml" 2>/dev/null)
MAINTENANCE_IP=$(yq eval ".cluster.nodes.active.\"${NODE_NAME}\".maintenanceIp" "${WC_HOME}/config.yaml" 2>/dev/null)

if [ -z "$NODE_INTERFACE" ] || [ "$NODE_INTERFACE" = "null" ]; then
print_error "Node $NODE_NAME is not registered in config.yaml"
print_info "Please register the node first by running:"
print_info "Or run 'wild-setup-cluster' to register nodes interactively"
exit 1
fi

# Get current IP for the node
if [ -z "$NODE_CURRENT_IP" ] || [ "$NODE_CURRENT_IP" = "null" ]; then
print_error "Node $NODE_NAME has no current IP address set"
exit 1
fi

# Determine node type
if [ "$NODE_ROLE" = "controlplane" ]; then
NODE_TYPE="control plane"
else
NODE_TYPE="worker"
fi

# Determine the target IP for applying configuration
if [ -n "$MAINTENANCE_IP" ] && [ "$MAINTENANCE_IP" != "null" ]; then
TARGET_IP="$MAINTENANCE_IP"
print_info "Applying configuration to $NODE_TYPE node: $NODE_NAME ($NODE_CURRENT_IP) via maintenance IP: $MAINTENANCE_IP"
# Auto-enable insecure mode when using maintenance IP (unless explicitly overridden)
if [ "$INSECURE_MODE" = false ]; then
INSECURE_MODE=true
print_info "Auto-enabling insecure mode for maintenance IP"
fi
else
TARGET_IP="$NODE_CURRENT_IP"
print_info "Applying configuration to $NODE_TYPE node: $NODE_NAME ($NODE_CURRENT_IP)"
fi

print_info "Node details:"
print_info " - Name: $NODE_NAME"
print_info " - Current IP: $NODE_CURRENT_IP"
print_info " - Interface: $NODE_INTERFACE"
print_info " - Disk: $NODE_DISK"
print_info " - Type: $NODE_TYPE"
if [ -n "$MAINTENANCE_IP" ] && [ "$MAINTENANCE_IP" != "null" ]; then
print_info " - Maintenance IP: $MAINTENANCE_IP"
fi

# Check if machine config exists, generate if needed
NODE_SETUP_DIR="${WC_HOME}/setup/cluster-nodes"
CONFIG_FILE="${NODE_SETUP_DIR}/final/${NODE_NAME}.yaml"
PATCH_FILE="${NODE_SETUP_DIR}/patch/${NODE_NAME}.yaml"

# Check if patch file exists
if [ ! -f "$PATCH_FILE" ]; then
wild-cluster-node-patch-generate "$NODE_NAME"
fi

# Determine base config file
if [ "$NODE_ROLE" = "controlplane" ]; then
BASE_CONFIG="${NODE_SETUP_DIR}/generated/controlplane.yaml"
else
BASE_CONFIG="${NODE_SETUP_DIR}/generated/worker.yaml"
fi

# Check if base config exists
if [ ! -f "$BASE_CONFIG" ]; then
print_error "Base configuration not found: $BASE_CONFIG"
print_info "Generate base cluster configuration first:"
print_info " wild-cluster-config-generate"
exit 1
fi

# Check if we should skip regeneration
if [ ! -f "$CONFIG_FILE" ] || [ "$FORCE_REGENERATE" = true ]; then
# Need to generate/regenerate the final config
if [ "$FORCE_REGENERATE" = true ]; then
print_info "Force regeneration requested: regenerating machine configuration..."
else
print_info "Machine configuration not found: $CONFIG_FILE"
print_info "Generating final machine configuration..."
fi

# Create final config directory if it doesn't exist
mkdir -p "${NODE_SETUP_DIR}/final"

# Generate final machine config
print_info "Generating final machine configuration from patch..."
talosctl machineconfig patch "$BASE_CONFIG" --patch @"$PATCH_FILE" -o "$CONFIG_FILE"
print_success "Generated machine configuration: $CONFIG_FILE"
else
print_success "Found existing machine configuration: $CONFIG_FILE"
fi

# Build talosctl command
TALOSCTL_CMD="talosctl apply-config"

if [ "$INSECURE_MODE" = true ]; then
TALOSCTL_CMD="$TALOSCTL_CMD --insecure"
print_info "Using insecure mode (for maintenance mode nodes)"
fi

TALOSCTL_CMD="$TALOSCTL_CMD --nodes $TARGET_IP --file $CONFIG_FILE"

# Show the command
echo ""
print_info "Command to execute:"
echo " $TALOSCTL_CMD"
echo ""

if [ "$DRY_RUN" = true ]; then
print_info "Dry run mode - command shown above but not executed"
exit 0
fi

# Apply the configuration
print_info "Applying machine configuration..."
echo ""

if eval "$TALOSCTL_CMD"; then
print_success "Machine configuration applied successfully!"

# Update talosctl context to this node
print_info "Updating talosctl context..."
talosctl config node "$NODE_CURRENT_IP"
print_success "Updated talosctl context to node $NODE_NAME ($NODE_CURRENT_IP)"
echo ""

if [ "$NODE_ROLE" = "controlplane" ]; then
print_info "Next steps for control plane node:"
echo " 1. Wait for the node to reboot and come up with the new configuration"
echo " 2. If this is your first control plane node, bootstrap it:"
echo " talosctl bootstrap --nodes $NODE_CURRENT_IP"
echo " 3. Get kubeconfig when cluster is ready:"
echo " talosctl kubeconfig"
else
print_info "Next steps for worker node:"
echo " 1. Wait for the node to reboot and come up with the new configuration"
echo " 2. Node will join the cluster automatically"
echo " 3. Verify the node appears in the cluster:"
echo " kubectl get nodes"
fi

echo ""
print_info "Monitor node status with:"
echo " talosctl --nodes $NODE_CURRENT_IP dmesg"
echo " talosctl --nodes $NODE_CURRENT_IP get members"

else
print_error "Failed to apply machine configuration"
echo ""
print_info "Troubleshooting tips:"
if [ -n "$MAINTENANCE_IP" ] && [ "$MAINTENANCE_IP" != "null" ]; then
echo " - Ensure the node is accessible at maintenance IP $MAINTENANCE_IP"
else
echo " - Ensure the node is accessible at $NODE_CURRENT_IP"
fi
echo " - For nodes in maintenance mode, use --insecure flag"
echo " - Check network connectivity and firewall settings"
echo " - Verify the machine configuration file is valid"
exit 1
fi

print_success "Node configuration completed!"
@@ -32,13 +32,17 @@ else
init_wild_env
fi

# ---
# Config

prompt_if_unset_config "cloud.dns.ip" "The IP address of your wild cloud DNS server" ""
prompt_if_unset_config "cloud.dnsmasq.interface" "The network interface for your wild cloud DNS server" "eth0"
prompt_if_unset_config "cluster.loadBalancerIp" "The IP address for your cluster load balancer" ""
prompt_if_unset_config "cloud.router.ip" "The IP address for your LAN router" "192.168.8.1"
prompt_if_unset_config "cloud.dhcpRange" "The DHCP range for your wild cloud network" ""

SOURCE_DIR="${WC_ROOT}/setup/dnsmasq"
DNSMASQ_SETUP_DIR="${WC_HOME}/setup/dnsmasq"
BUNDLE_DIR="${DNSMASQ_SETUP_DIR}/setup-bundle"
mkdir -p "${BUNDLE_DIR}"

# Create local templates.

@@ -58,6 +62,7 @@ if [ -d "${DNSMASQ_SETUP_DIR}" ]; then
echo "Successfully created dnsmasq setup files from templates."
fi
else
mkdir -p "${BUNDLE_DIR}"
cp -r "${SOURCE_DIR}" "${DNSMASQ_SETUP_DIR}"
find "${DNSMASQ_SETUP_DIR}" -type f \( -name "*.yaml" -o -name "*.ipxe" -o -name "*.conf" \) | while read -r file; do
echo "Processing: ${file}"
@@ -65,6 +70,7 @@ else
done
echo "Successfully created dnsmasq setup files from templates."
fi
mkdir -p "${BUNDLE_DIR}"

# Create setup bundle.
@@ -102,14 +102,15 @@ prompt_if_unset_config "cloud.internalDomain" "Your internal cloud domain" "inte
prompt_if_unset_config "cloud.backup.root" "Existing path to save backups to" ""

# Derive cluster name from domain if not already set
current_cluster_name=$(wild-config "cluster.name")
if [ -z "$current_cluster_name" ] || [ "$current_cluster_name" = "null" ]; then
if wild-config "cluster.name" --check; then
echo "Cluster name: $(wild-config "cluster.name")"
else
echo "Cluster name is already set to: $(wild-config "cluster.name")"
cluster_name=$(echo "${domain}" | tr '.' '-' | tr '[:upper:]' '[:lower:]')
wild-config-set "cluster.name" "${cluster_name}"
print_info "Set cluster name to: ${cluster_name}"
fi

# =============================================================================
# COPY SCAFFOLD
# =============================================================================
@@ -26,7 +26,7 @@ usage() {
echo " - Return JSON with hardware information"
echo ""
echo "Output JSON format:"
echo ' {"interface": "eth0", "disks": ["/dev/sda", "/dev/nvme0n1"], "selected_disk": "/dev/sda"}'
echo ' {"interface": "eth0", "disks": ["/dev/sda", "/dev/nvme0n1"], "selected_disk": "/dev/sda", "maintenance_mode": true}'
}

# Parse arguments
@@ -152,12 +152,19 @@ echo "✅ Discovered $(echo "$AVAILABLE_DISKS" | jq -r 'length') suitable disks"
echo "✅ Selected disk: $SELECTED_DISK" >&2

# Output JSON to stdout
MAINTENANCE_MODE_BOOL="false"
if [ "$TALOS_MODE" = "insecure" ]; then
MAINTENANCE_MODE_BOOL="true"
fi

jq -n \
--arg interface "$ACTIVE_INTERFACE" \
--argjson disks "$AVAILABLE_DISKS" \
--arg selected_disk "$SELECTED_DISK" \
--argjson maintenance_mode "$MAINTENANCE_MODE_BOOL" \
'{
interface: $interface,
disks: $disks,
selected_disk: $selected_disk
selected_disk: $selected_disk,
maintenance_mode: $maintenance_mode
}'
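The JSON emitted above is consumed by callers with jq. A minimal sketch of reading the new `maintenance_mode` field, mirroring how `wild-node-setup` parses the same output later in this change set; `NODE_IP` is a placeholder for the address being probed:

```bash
# NODE_IP is a placeholder for the address being probed.
NODE_INFO=$(wild-node-detect "$NODE_IP")
MAINTENANCE_MODE=$(echo "$NODE_INFO" | jq -r '.maintenance_mode')
SELECTED_DISK=$(echo "$NODE_INFO" | jq -r '.selected_disk')
echo "maintenance_mode=${MAINTENANCE_MODE} disk=${SELECTED_DISK}"
```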
bin/wild-node-setup: new executable file, 313 lines
@@ -0,0 +1,313 @@
#!/bin/bash

# Set up configuration variables.
# Generate Talos machine configuration
# Apply configuration to node

set -e
set -o pipefail

# Usage function
usage() {
echo "Usage: wild-node-setup <node-name> [options]"
echo ""
echo "Complete node lifecycle management - configure → patch → deploy"
echo ""
echo "Arguments:"
echo " node-name Name of the node to setup"
echo ""
echo "Options:"
echo " --reconfigure Force node reconfiguration"
echo " --no-deploy Generate Talos machine configuration only, skip deployment"
echo " -h, --help Show this help message"
echo ""
echo "Examples:"
echo " wild-node-setup control-1"
echo " wild-node-setup worker-1 --reconfigure"
echo " wild-node-setup control-2 --no-deploy"
echo ""
echo "This script handles the complete node setup lifecycle:"
echo " 1. Node configuration (if needed or --reconfigure specified)"
echo " 2. Generate node-specific configuration patch"
echo " 3. Create final machine configuration"
echo " 4. Deploy configuration to node (unless --no-deploy)"
echo ""
echo "Requirements:"
echo " - Must be run from a Wild Cloud home directory"
echo " - Cluster must be initialized (wild-cluster-config-generate)"
echo " - Node must be accessible for configuration"
}

# Parse arguments
NODE_NAME=""
FORCE_CONFIG=false
NO_DEPLOY=false

while [[ $# -gt 0 ]]; do
case $1 in
--reconfigure)
FORCE_CONFIG=true
shift
;;
--no-deploy)
NO_DEPLOY=true
shift
;;
-h|--help)
usage
exit 0
;;
-*)
echo "Unknown option $1"
usage
exit 1
;;
*)
if [ -z "$NODE_NAME" ]; then
NODE_NAME="$1"
else
echo "Unexpected argument: $1"
usage
exit 1
fi
shift
;;
esac
done

# Initialize Wild Cloud environment
if [ -z "${WC_ROOT}" ]; then
echo "ERROR: WC_ROOT is not set."
exit 1
else
source "${WC_ROOT}/scripts/common.sh"
init_wild_env
fi

# Check if node name was provided
if [ -z "$NODE_NAME" ]; then
print_error "Node name is required"
usage
exit 1
fi

print_header "Wild Cloud Node Setup: $NODE_NAME"

# =============================================================================
# PREREQUISITES
# =============================================================================

# Check if cluster has been initialized
NODE_SETUP_DIR="${WC_HOME}/setup/cluster-nodes"
if [ ! -f "${NODE_SETUP_DIR}/generated/secrets.yaml" ]; then
print_error "Cluster not initialized. Run 'wild-cluster-config-generate' first"
exit 1
fi

# Get cluster configuration
CLUSTER_NAME=$(wild-config cluster.name)
print_info "Cluster: $CLUSTER_NAME"

# =============================================================================
# NODE DETECTION
# =============================================================================

print_info "Detecting node: $NODE_NAME"

# Get target IP for detection
if wild-config --check "cluster.nodes.active.${NODE_NAME}.targetIp"; then
TARGET_IP=$(wild-config "cluster.nodes.active.${NODE_NAME}.targetIp")
else
read -p "Enter target IP address for node $NODE_NAME: " -r TARGET_IP
if [ -z "$TARGET_IP" ]; then
print_error "IP address is required for node detection"
exit 1
fi
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$TARGET_IP"
fi

# Try detection at target IP, fallback to current IP if needed
if NODE_INFO=$(wild-node-detect "$TARGET_IP" 2>/dev/null); then
DETECTION_IP="$TARGET_IP"
else
read -p "Enter current IP for this node (maintenance mode): " -r CURRENT_IP
if [ -z "$CURRENT_IP" ]; then
print_error "Current IP is required for maintenance mode detection"
exit 1
fi

if NODE_INFO=$(wild-node-detect "$CURRENT_IP" 2>/dev/null); then
DETECTION_IP="$CURRENT_IP"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".currentIp" "$CURRENT_IP"
else
print_error "Failed to detect node"
exit 1
fi
fi

# Parse node information
MAINTENANCE_MODE=$(echo "$NODE_INFO" | jq -r '.maintenance_mode')

# =============================================================================
# NODE CONFIGURATION
# =============================================================================

if [ "$FORCE_CONFIG" = true ] || \
! wild-config --check "cluster.nodes.active.${NODE_NAME}.interface" || \
! wild-config --check "cluster.nodes.active.${NODE_NAME}.disk"; then

print_header "Node Configuration: $NODE_NAME"

# Parse hardware information and select disk
INTERFACE=$(echo "$NODE_INFO" | jq -r '.interface')
SELECTED_DISK=$(echo "$NODE_INFO" | jq -r '.selected_disk')

# Find default disk number
DEFAULT_NUM=$(echo "$NODE_INFO" | jq -r --arg disk "$SELECTED_DISK" '.disks | to_entries | map(select(.value.path == $disk)) | .[0].key // empty')
DEFAULT_NUM=$((DEFAULT_NUM + 1))

echo ""
echo "Available disks:"
echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | nl -w2 -s') '

while true; do
read -p "Select disk [default: $DEFAULT_NUM]: " -r disk_num

if [ -z "$disk_num" ]; then
disk_num=$DEFAULT_NUM
fi

SELECTED_DISK=$(echo "$NODE_INFO" | jq -r ".disks[$((disk_num-1))].path")
if [ "$SELECTED_DISK" != "null" ] && [ -n "$SELECTED_DISK" ]; then
break
fi

echo "Invalid selection. Please enter a number from the list above."
done

wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".interface" "$INTERFACE"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".disk" "$SELECTED_DISK"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".currentIp" "$TARGET_IP"

# Set node defaults if not configured
if ! wild-config --check "cluster.nodes.active.${NODE_NAME}.role"; then
wild-config-set "cluster.nodes.active.${NODE_NAME}.role" "worker"
fi
if ! wild-config --check "cluster.nodes.active.${NODE_NAME}.version"; then
default_version=$(wild-config "cluster.nodes.talos.version")
wild-config-set "cluster.nodes.active.${NODE_NAME}.version" "$default_version"
fi
if ! wild-config --check "cluster.nodes.active.${NODE_NAME}.schematicId"; then
default_schematic_id=$(wild-config "cluster.nodes.talos.schematicId")
wild-config-set "cluster.nodes.active.${NODE_NAME}.schematicId" "$default_schematic_id"
fi
fi

# =============================================================================
# CONFIGURATION GENERATION
# =============================================================================

print_header "Configuration Generation: $NODE_NAME"

# Get node configuration
NODE_ROLE=$(wild-config "cluster.nodes.active.${NODE_NAME}.role")
NODE_IP=$(wild-config "cluster.nodes.active.${NODE_NAME}.targetIp")
NODE_INTERFACE=$(wild-config "cluster.nodes.active.${NODE_NAME}.interface")
NODE_DISK=$(wild-config "cluster.nodes.active.${NODE_NAME}.disk")
NODE_VERSION=$(wild-config "cluster.nodes.active.${NODE_NAME}.version")
NODE_SCHEMATIC_ID=$(wild-config "cluster.nodes.active.${NODE_NAME}.schematicId")

print_info "Node configuration:"
print_info " - Name: $NODE_NAME"
print_info " - Role: $NODE_ROLE"
print_info " - IP: $NODE_IP"
print_info " - Interface: $NODE_INTERFACE"
print_info " - Disk: $NODE_DISK"
print_info " - Talos Version: $NODE_VERSION"
print_info " - Schematic ID: $NODE_SCHEMATIC_ID"

# Determine base configuration file
if [ "$NODE_ROLE" = "controlplane" ]; then
BASE_CONFIG="${NODE_SETUP_DIR}/generated/controlplane.yaml"
TEMPLATE_FILE="${WC_ROOT}/setup/cluster-nodes/patch.templates/controlplane.yaml"
else
BASE_CONFIG="${NODE_SETUP_DIR}/generated/worker.yaml"
TEMPLATE_FILE="${WC_ROOT}/setup/cluster-nodes/patch.templates/worker.yaml"
fi

# Check if base config exists
if [ ! -f "$BASE_CONFIG" ]; then
print_error "Base configuration not found: $BASE_CONFIG"
print_info "Run 'wild-cluster-config-generate' first"
exit 1
fi

# Generate patch file
print_info "Generating node-specific patch..."
mkdir -p "${NODE_SETUP_DIR}/patch"

PATCH_FILE="${NODE_SETUP_DIR}/patch/${NODE_NAME}.yaml"
TEMP_TEMPLATE="/tmp/${NODE_NAME//\//_}-$(date +%s).yaml"

# Apply variable substitutions to template
sed -e "s/{{NODE_NAME}}/${NODE_NAME}/g" \
-e "s/{{NODE_IP}}/${NODE_IP}/g" \
-e "s/{{SCHEMATIC_ID}}/${NODE_SCHEMATIC_ID}/g" \
-e "s/{{VERSION}}/${NODE_VERSION}/g" "$TEMPLATE_FILE" > "$TEMP_TEMPLATE"

# Process template with gomplate
if ! cat "$TEMP_TEMPLATE" | wild-compile-template > "$PATCH_FILE"; then
rm -f "$TEMP_TEMPLATE"
print_error "Failed to compile patch template for $NODE_NAME"
exit 1
fi
rm -f "$TEMP_TEMPLATE"

print_success "Generated patch file: $PATCH_FILE"

# Generate final machine configuration
print_info "Generating final machine configuration..."
mkdir -p "${NODE_SETUP_DIR}/final"

CONFIG_FILE="${NODE_SETUP_DIR}/final/${NODE_NAME}.yaml"
if ! talosctl machineconfig patch "$BASE_CONFIG" --patch @"$PATCH_FILE" -o "$CONFIG_FILE"; then
print_error "Failed to generate final machine configuration"
exit 1
fi

print_success "Generated final configuration: $CONFIG_FILE"

# =============================================================================
# DEPLOYMENT
# =============================================================================

if [ "$NO_DEPLOY" = true ]; then
print_success "Configuration generated (--no-deploy specified)"
exit 0
fi

print_header "Configuration Deployment: $NODE_NAME"

# Apply configuration using detected node information
TALOSCTL_CMD="talosctl apply-config --nodes $DETECTION_IP --file $CONFIG_FILE"
if [ "$MAINTENANCE_MODE" = "true" ]; then
TALOSCTL_CMD="$TALOSCTL_CMD --insecure"
fi

if eval "$TALOSCTL_CMD"; then
print_success "Configuration applied successfully to $NODE_NAME"
else
print_error "Failed to apply machine configuration"
exit 1
fi

print_info "Waiting 10 seconds for node to stabilize..."
sleep 10

if talosctl config node "$TARGET_IP"; then
print_success "Node setup completed for $NODE_NAME!"
else
print_error "Node setup failed for $NODE_NAME!"
exit 1
fi
exit 0
@@ -5,15 +5,10 @@ set -o pipefail

# Parse arguments

SKIP_INSTALLER=false
SKIP_HARDWARE=false

while [[ $# -gt 0 ]]; do
case $1 in
--skip-installer)
SKIP_INSTALLER=true
shift
;;
--skip-hardware)
SKIP_HARDWARE=true
shift
@@ -26,7 +21,6 @@ while [[ $# -gt 0 ]]; do
echo "Control Options:"
echo " --skip-installer Skip Installer image generation"
echo " --skip-hardware Skip Node hardware detection"
echo " --skip-configs Skip Machine config generation"
echo " -h, --help Show this help message"
echo ""
echo "Prerequisites:"
@@ -54,7 +48,7 @@ done
# Initialize Wild Cloud environment

if [ -z "${WC_ROOT}" ]; then
print "WC_ROOT is not set."
echo "ERROR: WC_ROOT is not set."
exit 1
else
source "${WC_ROOT}/scripts/common.sh"
@@ -136,310 +130,280 @@ fi
# =============================================================================

if [ "${SKIP_HARDWARE}" = false ]; then

print_header "Control node registration"
print_header "Control Plane Node Setup"

# Automatically configure the first three IPs after VIP for control plane nodes
vip_last_octet=$(echo "$vip" | cut -d. -f4)
vip_prefix=$(echo "$vip" | cut -d. -f1-3)

# Process each control plane node
# Set up control plane nodes
for i in 1 2 3; do
NODE_NAME="${HOSTNAME_PREFIX}control-${i}"
TARGET_IP="${vip_prefix}.$(( vip_last_octet + i ))"
print_info "Checking for control plane node: $NODE_NAME (IP: $TARGET_IP)"

if wild-config --check "cluster.nodes.active.${NODE_NAME}.interface"; then
print_success "Node $NODE_NAME already registered."
continue
fi
print_info "Setting up control plane node: $NODE_NAME (IP: $TARGET_IP)"

if ! wild-config --check "cluster.nodes.active.${NODE_NAME}.role"; then
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "controlplane"
fi

if ! wild-config --check "cluster.nodes.active.${NODE_NAME}.targetIp"; then
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$TARGET_IP"
fi

print_info "${NODE_NAME} not found. Please ensure the node is powered on and running Talos in maintenance mode."
read -p "Is $NODE_NAME in maintenance mode now? (Y/n): " -r register_node
if [[ $register_node =~ ^[Nn]$ ]]; then
print_info "Skipping bringing up node $NODE_NAME registration"
continue
fi

# Detect node hardware
print_info "Attempting detection at target IP $TARGET_IP..."
DETECTION_IP="$TARGET_IP"
NODE_INFO=""

if wild-node-detect "$TARGET_IP" >/dev/null 2>&1; then
NODE_INFO=$(wild-node-detect "$TARGET_IP")
print_success "Node detected at target IP $TARGET_IP"
else
# Fall back to current IP
print_warning "Node not accessible at target IP $TARGET_IP"
read -p "Enter current IP for this node: " -r CURRENT_IP

if [ -z "$CURRENT_IP" ]; then
print_warning "Skipping node $NODE_NAME registration"
continue
fi

print_info "Attempting detection at current IP $CURRENT_IP..."
if wild-node-detect "$CURRENT_IP" >/dev/null 2>&1; then
NODE_INFO=$(wild-node-detect "$CURRENT_IP")
DETECTION_IP="$CURRENT_IP"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".currentIp" "$CURRENT_IP"
print_success "Node detected at current IP $CURRENT_IP"
else
print_error "Failed to detect node at $CURRENT_IP"
continue
fi
fi

if ! [ -n "$NODE_INFO" ]; then
print_error "No hardware information received from node"
continue
fi

INTERFACE=$(echo "$NODE_INFO" | jq -r '.interface')
SELECTED_DISK=$(echo "$NODE_INFO" | jq -r '.selected_disk')
AVAILABLE_DISKS=$(echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | paste -sd, -)

print_success "Hardware detected:"
print_info " - Interface: $INTERFACE"
print_info " - Available disks: $AVAILABLE_DISKS"
print_info " - Selected disk: $SELECTED_DISK"

# User system disk selection
echo ""
read -p "Use selected disk '$SELECTED_DISK'? (Y/n): " -r use_disk
if [[ $use_disk =~ ^[Nn]$ ]]; then
echo "Available disks:"
echo "$NODE_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | nl -w2 -s') '
read -p "Enter disk number: " -r disk_num
SELECTED_DISK=$(echo "$NODE_INFO" | jq -r ".disks[$((disk_num-1))].path")
if [ "$SELECTED_DISK" = "null" ] || [ -z "$SELECTED_DISK" ]; then
print_error "Invalid disk selection"
continue
fi
print_info "Selected disk: $SELECTED_DISK"
fi

# Update config.yaml with hardware info.
print_info "Updating configuration for $NODE_NAME..."
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".interface" "$INTERFACE"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".disk" "$SELECTED_DISK"

# Copy current Talos version and schematic ID to this node
# Pre-configure node role and target IP
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "controlplane"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$TARGET_IP"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".version" "$talos_version"
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".schematicId" "$schematic_id"

# The node is now configured. Bring it up.
echo ""
read -p "Bring node $NODE_NAME ($TARGET_IP) up now? (Y/n): " -r apply_config
if [[ ! $apply_config =~ ^[Nn]$ ]]; then
if [ "$DETECTION_IP" != "$TARGET_IP" ]; then
# Node is in maintenance mode, use insecure flag
print_info "Applying configuration in insecure mode (maintenance mode)..."
wild-cluster-node-up "$NODE_NAME" --insecure
else
# Node is already up, no insecure flag needed
print_info "Applying configuration..."
wild-cluster-node-up "$NODE_NAME" --force
fi

# Bootstrap the cluster after the first node is up.
if [ "$i" -eq 1 ]; then
read -p "The cluster should be bootstrapped after the first control node is ready. Is it ready? (Y/n): " -r is_ready
if [[ ! $is_ready =~ ^[Nn]$ ]]; then
print_info "Bootstrapping control plane node $TARGET_IP..."
talosctl config endpoint "$TARGET_IP"

# Attempt to bootstrap the cluster
if talosctl bootstrap --nodes "$TARGET_IP" 2>&1 | tee /tmp/bootstrap_output.log; then
print_success "Control plane node $TARGET_IP bootstrapped successfully!"
else
# Check if the error is because it's already bootstrapped
if grep -q "etcd data directory is not empty\|AlreadyExists" /tmp/bootstrap_output.log; then
print_info "Cluster is already bootstrapped on $TARGET_IP"
else
print_error "Failed to bootstrap control plane node $TARGET_IP"
print_info "Bootstrap output:"
cat /tmp/bootstrap_output.log
rm -f /tmp/bootstrap_output.log
continue
fi
fi
rm -f /tmp/bootstrap_output.log

# Wait for VIP to become available before using it
print_info "Waiting for VIP $vip to become available..."
max_attempts=30
attempt=1
vip_ready=false

while [ $attempt -le $max_attempts ]; do
if ping -c 1 -W 2 "$vip" >/dev/null 2>&1; then
# VIP responds to ping, now test Talos API
if talosctl -e "$vip" -n "$vip" version >/dev/null 2>&1; then
print_success "VIP $vip is ready (attempt $attempt/$max_attempts)"
vip_ready=true
break
fi
fi
print_info "VIP not ready, waiting... (attempt $attempt/$max_attempts)"
sleep 2
attempt=$((attempt + 1))
done

if [ "$vip_ready" = true ]; then
talosctl config endpoint "$vip"
print_info "Talos endpoint set to control plane VIP: $vip"

if talosctl kubeconfig "$vip"; then
print_success "Talos kubeconfig updated for control plane VIP: $vip"
else
print_error "Failed to get kubeconfig from VIP: $vip"
print_info "You can try again later with: talosctl kubeconfig $vip"
fi
else
print_error "VIP $vip did not become available after $max_attempts attempts"
print_warning "Falling back to direct node access"
print_info "Talos endpoint remains set to: $TARGET_IP"
print_info "You can try switching to VIP later with: talosctl config endpoint $vip"
fi
# Check if node is already configured
if wild-config --check "cluster.nodes.active.${NODE_NAME}.interface"; then
print_success "Node $NODE_NAME already configured"
echo ""
read -p "Re-deploy node $NODE_NAME? (y/N): " -r redeploy_node
if [[ $redeploy_node =~ ^[Yy]$ ]]; then
if ! wild-node-setup "$NODE_NAME"; then
print_error "Failed to set up node $NODE_NAME"
continue
fi
else
continue
fi
else
# Node needs initial setup
print_info "Node $NODE_NAME requires hardware detection and setup"
echo ""
read -p "Set up node $NODE_NAME now? (Y/n): " -r setup_node
if [[ $setup_node =~ ^[Nn]$ ]]; then
print_info "Skipping node $NODE_NAME setup"
continue
fi

else
print_info "Configuration not applied. You can apply it later with:"
print_info " wild-cluster-node-up $NODE_NAME --insecure"
# Run complete node setup
if ! wild-node-setup "$NODE_NAME"; then
print_error "Failed to set up node $NODE_NAME"
print_info "You can retry later with: wild-node-setup $NODE_NAME"
continue
fi
fi

# Bootstrap the cluster after the first node is up
if [ "$i" -eq 1 ]; then
echo ""
read -p "Bootstrap the cluster on $NODE_NAME? (Y/n): " -r bootstrap_cluster
if [[ ! $bootstrap_cluster =~ ^[Nn]$ ]]; then
print_header "Bootstrapping Cluster: $NODE_NAME"
talosctl config endpoint "$TARGET_IP"

if talosctl bootstrap --nodes "$TARGET_IP" 2>&1 | tee /tmp/bootstrap_output.log; then
print_success "Cluster bootstrap initiated successfully."
else
if grep -q "etcd data directory is not empty\|AlreadyExists" /tmp/bootstrap_output.log; then
print_info "Cluster is already bootstrapped."
else
print_error "Failed to bootstrap cluster."
print_info "Bootstrap output:"
cat /tmp/bootstrap_output.log
rm -f /tmp/bootstrap_output.log
continue
fi
fi
mv -f /tmp/bootstrap_output.log /tmp/bootstrap_output_success.log

# Step 1: Verify etcd cluster health
print_info -n "Step 1/6: Verifying etcd cluster health."
max_attempts=30
for attempt in $(seq 1 $max_attempts); do
if talosctl -n "$TARGET_IP" etcd status >/dev/null 2>&1; then
echo ""
print_success "etcd cluster is healthy."
break
fi
if [ $attempt -eq $max_attempts ]; then
echo ""
print_error "etcd cluster not healthy after $max_attempts attempts."
print_info "Troubleshooting steps:"
print_info " 1. Check etcd service: talosctl -n $TARGET_IP service etcd"
print_info " 2. Check etcd logs: talosctl -n $TARGET_IP logs etcd"
print_info " 3. Check etcd status details: talosctl -n $TARGET_IP etcd status"
print_info " 4. Verify bootstrap completed: talosctl -n $TARGET_IP get members"
exit 1
fi
printf "."
sleep 10
done

# Step 2: Wait for VIP to be assigned to interface
print_info -n "Step 2/6: Waiting for VIP $vip to be assigned to interface."
max_attempts=90
for attempt in $(seq 1 $max_attempts); do
if talosctl -n "$TARGET_IP" get addresses | grep -q "$vip/32"; then
echo ""
print_success "VIP $vip assigned to interface."
break
fi
if [ $attempt -eq $max_attempts ]; then
echo ""
print_error "VIP $vip was not assigned to interface after $max_attempts attempts"
print_info "Troubleshooting steps:"
print_info " 1. Check VIP controller logs: talosctl -n $TARGET_IP logs controller-runtime | grep vip"
print_info " 2. Check network configuration: talosctl -n $TARGET_IP get addresses"
print_info " 3. Verify VIP is within node's network range"
exit 1
fi
printf "."
sleep 10
done

# Step 3: Wait for control plane components to start
print_info -n "Step 3/6: Waiting for control plane components to start."
max_attempts=60
for attempt in $(seq 1 $max_attempts); do
# Check if all three control plane components are running
apiserver_running=$(talosctl -n "$TARGET_IP" containers -k | grep -c "kube-apiserver.*CONTAINER_RUNNING" || true)
controller_running=$(talosctl -n "$TARGET_IP" containers -k | grep -c "kube-controller-manager.*CONTAINER_RUNNING" || true)
scheduler_running=$(talosctl -n "$TARGET_IP" containers -k | grep -c "kube-scheduler.*CONTAINER_RUNNING" || true)

if [ "$apiserver_running" -gt 0 ] && [ "$controller_running" -gt 0 ] && [ "$scheduler_running" -gt 0 ]; then
echo ""
print_success "All control plane components are running (attempt $attempt)."
break
fi
if [ $attempt -eq $max_attempts ]; then
echo ""
print_error "Control plane components not all running after $max_attempts attempts."
print_info "Troubleshooting steps:"
print_info " 1. Check kubelet logs: talosctl -n $TARGET_IP logs kubelet"
print_info " 2. Check static pod status: talosctl -n $TARGET_IP containers -k | grep kube-"
print_info " 3. Restart kubelet if needed: talosctl -n $TARGET_IP service kubelet restart"
print_info "Current status:"
print_info " API Server running: $apiserver_running"
print_info " Controller Manager running: $controller_running"
print_info " Scheduler running: $scheduler_running"
exit 1
fi
# Restart kubelet every 40 attempts to refresh static pod creation
if [ $((attempt % 40)) -eq 0 ]; then
echo ""
print_info "Restarting kubelet to refresh static pod creation (attempt $attempt)..."
talosctl -n "$TARGET_IP" service kubelet restart > /dev/null 2>&1
print_info -n "Waiting for control plane components after kubelet restart."
sleep 30 # Give kubelet time to restart and create pods
fi
printf "."
sleep 10
done

# Step 4: Wait for API server to respond on VIP
print_info -n "Step 4/6: Waiting for API server to respond on VIP $vip."
max_attempts=60
for attempt in $(seq 1 $max_attempts); do
if curl -k -s --max-time 5 "https://$vip:6443/healthz" >/dev/null 2>&1; then
echo ""
print_success "API server responding on VIP."
break
fi
if [ $attempt -eq $max_attempts ]; then
echo ""
print_error "API server not responding on VIP $vip after $max_attempts attempts."
print_info "Troubleshooting steps:"
print_info " 1. Check API server logs: talosctl -n $TARGET_IP logs kubelet | grep apiserver"
print_info " 2. Check if API server is running: talosctl -n $TARGET_IP containers -k | grep apiserver"
print_info " 3. Test API server on node IP: curl -k https://$TARGET_IP:6443/healthz"
exit 1
fi
# Attempt kubelet restart every 15 attempts to refresh certificates
if [ $((attempt % 15)) -eq 0 ]; then
echo ""
print_info "Restarting kubelet to refresh API container setup (attempt $attempt)..."
talosctl -n "$TARGET_IP" service kubelet restart > /dev/null 2>&1
print_info -n "Waiting for API server to respond after kubelet restart."
sleep 30 # Give kubelet time to restart
fi
printf "."
sleep 10
done

# Step 5: Configure talosctl endpoint and get kubeconfig
print_info "Step 5/6: Configuring cluster access..."
talosctl config endpoint "$vip"

if ! talosctl kubeconfig --nodes "$vip"; then
print_error "Failed to get kubeconfig via VIP."
print_info "Troubleshooting steps:"
print_info " 1. Check API server logs: talosctl -n $TARGET_IP logs kube-apiserver"
print_info " 2. Test API server on node IP: curl -k https://$TARGET_IP:6443/healthz"
print_info " 3. Verify network connectivity to VIP"
exit 1
else
print_success "Kubeconfig retrieved via VIP."
fi

# Step 6: Verify node registration
print_info -n "Step 6/6: Verifying node registration."
for reg_attempt in $(seq 1 10); do
if kubectl get nodes 2>/dev/null | grep -q "Ready\|NotReady"; then
echo ""
print_success "Node registered with API server."
break
fi
echo -n "."
sleep 10
done

if ! kubectl get nodes 2>/dev/null | grep -q "Ready\|NotReady"; then
echo ""
print_error "Node did not register with API server after multiple attempts."
print_info "Troubleshooting steps:"
print_info " 1. Check kubelet logs: talosctl -n $TARGET_IP logs kubelet"
print_info " 2. Check API server logs: talosctl -n $TARGET_IP logs kube-apiserver"
|
||||
print_info " 3. Verify network connectivity between node and VIP"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
print_success "Cluster bootstrap completed!"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
# Register worker nodes
|
||||
# Worker node setup
|
||||
echo ""
|
||||
print_info "Configure worker nodes (optional):"
|
||||
print_header "Worker Node Setup (Optional)"
|
||||
WORKER_COUNT=1
|
||||
while true; do
|
||||
echo ""
|
||||
read -p "Do you want to register a worker node? (y/N): " -r register_worker
|
||||
read -p "Set up a worker node? (y/N): " -r setup_worker
|
||||
|
||||
if [[ $register_worker =~ ^[Yy]$ ]]; then
|
||||
# Find first available worker number
|
||||
while [ -n "$(wild-config "cluster.nodes.active.\"${HOSTNAME_PREFIX}worker-${WORKER_COUNT}\".role" 2>/dev/null)" ] && [ "$(wild-config "cluster.nodes.active.\"${HOSTNAME_PREFIX}worker-${WORKER_COUNT}\".role" 2>/dev/null)" != "null" ]; do
|
||||
if [[ $setup_worker =~ ^[Yy]$ ]]; then
|
||||
# Find next available worker number
|
||||
while wild-config --check "cluster.nodes.active.${HOSTNAME_PREFIX}worker-${WORKER_COUNT}.role" 2>/dev/null; do
|
||||
WORKER_COUNT=$((WORKER_COUNT + 1))
|
||||
done
|
||||
|
||||
NODE_NAME="${HOSTNAME_PREFIX}worker-${WORKER_COUNT}"
|
||||
read -p "Enter current IP for worker node $NODE_NAME: " -r WORKER_IP
|
||||
read -p "Enter IP address for worker node $NODE_NAME: " -r WORKER_IP
|
||||
|
||||
if [ -z "$WORKER_IP" ]; then
|
||||
print_warning "No IP provided, skipping worker node"
|
||||
continue
|
||||
fi
|
||||
|
||||
print_info "Running wild-node-detect for worker node $NODE_NAME ($WORKER_IP)..."
|
||||
# Run detection and capture both output and stderr for debugging
|
||||
DETECTION_OUTPUT=$(mktemp)
|
||||
DETECTION_ERROR=$(mktemp)
|
||||
if wild-node-detect "$WORKER_IP" >"$DETECTION_OUTPUT" 2>"$DETECTION_ERROR"; then
|
||||
WORKER_INFO=$(cat "$DETECTION_OUTPUT")
|
||||
print_success "Worker node $NODE_NAME detected at IP $WORKER_IP"
|
||||
rm -f "$DETECTION_OUTPUT" "$DETECTION_ERROR"
|
||||
# Pre-configure worker node
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "worker"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$WORKER_IP"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".version" "$talos_version"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".schematicId" "$schematic_id"
|
||||
|
||||
# Run complete node setup
|
||||
if wild-node-setup "$NODE_NAME"; then
|
||||
print_success "Worker node $NODE_NAME setup completed"
|
||||
WORKER_COUNT=$((WORKER_COUNT + 1))
|
||||
else
|
||||
print_error "Failed to detect hardware for worker node $NODE_NAME ($WORKER_IP)"
|
||||
print_info "Detection error output:"
|
||||
cat "$DETECTION_ERROR" >&2
|
||||
print_info "Make sure the node is running in maintenance mode and accessible"
|
||||
rm -f "$DETECTION_OUTPUT" "$DETECTION_ERROR"
|
||||
continue
|
||||
print_error "Failed to set up worker node $NODE_NAME"
|
||||
print_info "You can retry later with: wild-node-setup $NODE_NAME"
|
||||
fi
|
||||
|
||||
if [ -n "$WORKER_INFO" ]; then
|
||||
# Parse JSON response
|
||||
INTERFACE=$(echo "$WORKER_INFO" | jq -r '.interface')
|
||||
SELECTED_DISK=$(echo "$WORKER_INFO" | jq -r '.selected_disk')
|
||||
AVAILABLE_DISKS=$(echo "$WORKER_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | paste -sd, -)
|
||||
|
||||
print_success "Hardware detected for worker node $NODE_NAME:"
|
||||
print_info " - Interface: $INTERFACE"
|
||||
print_info " - Available disks: $AVAILABLE_DISKS"
|
||||
print_info " - Selected disk: $SELECTED_DISK"
|
||||
|
||||
# Allow user to override disk selection
|
||||
echo ""
|
||||
read -p "Use selected disk '$SELECTED_DISK'? (Y/n): " -r use_disk
|
||||
if [[ $use_disk =~ ^[Nn]$ ]]; then
|
||||
echo "Available disks:"
|
||||
echo "$WORKER_INFO" | jq -r '.disks[] | "\(.path) (\((.size / 1000000000) | floor)GB)"' | nl -w2 -s') '
|
||||
read -p "Enter disk number: " -r disk_num
|
||||
SELECTED_DISK=$(echo "$WORKER_INFO" | jq -r ".disks[$((disk_num-1))].path")
|
||||
if [ "$SELECTED_DISK" = "null" ] || [ -z "$SELECTED_DISK" ]; then
|
||||
print_error "Invalid disk selection"
|
||||
continue
|
||||
fi
|
||||
print_info "Selected disk: $SELECTED_DISK"
|
||||
fi
|
||||
|
||||
# Update config.yaml with worker hardware info
|
||||
print_info "Updating config.yaml for worker node $NODE_NAME..."
|
||||
|
||||
# Store under unified cluster.nodes.active.<node-name>
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".role" "worker"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".currentIp" "$WORKER_IP"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".targetIp" "$WORKER_IP"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".interface" "$INTERFACE"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".disk" "$SELECTED_DISK"
|
||||
|
||||
# Copy current Talos version and schematic ID to this node
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".version" "$talos_version"
|
||||
wild-config-set "cluster.nodes.active.\"${NODE_NAME}\".schematicId" "$schematic_id"
|
||||
|
||||
print_success "Worker node $NODE_NAME registered successfully:"
|
||||
print_info " - Name: $NODE_NAME"
|
||||
print_info " - IP: $WORKER_IP"
|
||||
print_info " - Interface: $INTERFACE"
|
||||
print_info " - Disk: $SELECTED_DISK"
|
||||
|
||||
# Generate machine config immediately
|
||||
print_info "Generating machine configuration for $NODE_NAME..."
|
||||
if wild-cluster-node-patch-generate "$NODE_NAME"; then
|
||||
print_success "Machine configuration generated for $NODE_NAME"
|
||||
|
||||
# Ask if user wants to apply the configuration now
|
||||
echo ""
|
||||
read -p "Apply configuration to worker node $NODE_NAME now? (Y/n): " -r apply_config
|
||||
if [[ $apply_config =~ ^[Yy]$ ]] || [[ -z "$apply_config" ]]; then
|
||||
# Worker nodes are typically in maintenance mode during setup
|
||||
print_info "Applying configuration in insecure mode (maintenance mode)..."
|
||||
wild-cluster-node-up "$NODE_NAME" --insecure
|
||||
else
|
||||
print_info "Configuration not applied. You can apply it later with:"
|
||||
print_info " wild-cluster-node-up $NODE_NAME --insecure"
|
||||
fi
|
||||
else
|
||||
print_warning "Failed to generate machine configuration for $NODE_NAME"
|
||||
fi
|
||||
else
|
||||
print_error "Failed to detect hardware for worker node $NODE_NAME"
|
||||
continue
|
||||
fi
|
||||
|
||||
WORKER_COUNT=$((WORKER_COUNT + 1))
|
||||
else
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
print_success "Completed Node hardware detection"
|
||||
echo ""
|
||||
print_success "Node setup phase completed"
|
||||
else
|
||||
print_info "Skipping Node Hardware Detection"
|
||||
print_info "Skipping node setup (--skip-hardware specified)"
|
||||
fi
|
||||
|
||||
# =============================================================================
|
||||
@@ -450,3 +414,15 @@ print_header "Wild Cloud Cluster Setup Complete!"
|
||||
|
||||
print_success "Cluster infrastructure setup completed!"
|
||||
echo ""
|
||||
print_info "Next steps:"
|
||||
echo " 1. Run 'wild-setup-services' to install cluster services"
|
||||
echo " 2. Verify nodes are ready: kubectl get nodes"
|
||||
echo " 3. Check cluster health: wild-health"
|
||||
echo ""
|
||||
print_info "Individual node management:"
|
||||
echo " - Setup additional nodes: wild-node-setup <node-name>"
|
||||
echo " - Re-detect hardware: wild-node-setup <node-name> --detect"
|
||||
echo " - Configuration only: wild-node-setup <node-name> --no-deploy"
|
||||
echo ""
|
||||
|
||||
print_success "Wild Cloud cluster setup completed!"
|
||||
@@ -1,116 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Talos schematic management script
|
||||
# This script manages Talos Image Factory schematics centrally
|
||||
# Usage: wild-talos-schema [--force]
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Initialize Wild Cloud environment
|
||||
if [ -z "${WC_ROOT}" ]; then
|
||||
print "WC_ROOT is not set."
|
||||
exit 1
|
||||
else
|
||||
source "${WC_ROOT}/scripts/common.sh"
|
||||
init_wild_env
|
||||
fi
|
||||
|
||||
CONFIG_FILE="${WC_HOME}/config.yaml"
|
||||
FORCE_UPLOAD=false
|
||||
|
||||
# Parse arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--force)
|
||||
FORCE_UPLOAD=true
|
||||
shift
|
||||
;;
|
||||
-h|--help)
|
||||
echo "Usage: wild-talos-schema [--force]"
|
||||
echo ""
|
||||
echo "Manages Talos Image Factory schematics centrally."
|
||||
echo ""
|
||||
echo "Options:"
|
||||
echo " --force Force re-upload even if schematicId already exists"
|
||||
echo " -h, --help Show this help message"
|
||||
echo ""
|
||||
echo "This script:"
|
||||
echo " 1. Reads schematic from config.yaml (.cluster.nodes.talos.schematic)"
|
||||
echo " 2. Uploads it to Image Factory if needed"
|
||||
echo " 3. Stores the schematicId in config.yaml (.cluster.nodes.talos.schematicId)"
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: $1"
|
||||
echo "Use --help for usage information"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
echo "Managing Talos schematic for wildcloud..."
|
||||
|
||||
# Check if schematic exists in config.yaml
|
||||
if ! yq eval '.cluster.nodes.talos.schematic' "$CONFIG_FILE" | grep -v "null" >/dev/null 2>&1; then
|
||||
echo "Error: No schematic found in config.yaml at .cluster.nodes.talos.schematic"
|
||||
echo "Expected schematic configuration with systemExtensions"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if schematicId already exists (unless force)
|
||||
EXISTING_ID=$(yq eval '.cluster.nodes.talos.schematicId // ""' "$CONFIG_FILE")
|
||||
if [ -n "$EXISTING_ID" ] && [ "$FORCE_UPLOAD" = false ]; then
|
||||
echo "✅ Schematic ID already exists: $EXISTING_ID"
|
||||
echo "Use --force to re-upload and generate a new ID"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Extracting schematic from config.yaml..."
|
||||
|
||||
# Create temporary schematic file
|
||||
TEMP_SCHEMATIC=$(mktemp)
|
||||
trap "rm -f $TEMP_SCHEMATIC" EXIT
|
||||
|
||||
# Extract schematic from config.yaml
|
||||
yq eval '.cluster.nodes.talos.schematic' "$CONFIG_FILE" > "$TEMP_SCHEMATIC"
|
||||
|
||||
echo "Schematic contents:"
|
||||
cat "$TEMP_SCHEMATIC"
|
||||
echo ""
|
||||
|
||||
# Upload schematic to Image Factory
|
||||
echo "Uploading schematic to Talos Image Factory..."
|
||||
SCHEMATIC_RESPONSE=$(curl -s -X POST --data-binary @"$TEMP_SCHEMATIC" https://factory.talos.dev/schematics)
|
||||
|
||||
if [ -z "$SCHEMATIC_RESPONSE" ]; then
|
||||
echo "Error: Failed to upload schematic to Image Factory"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Parse schematic ID from JSON response
|
||||
SCHEMATIC_ID=$(echo "$SCHEMATIC_RESPONSE" | sed 's/.*"id":"\([^"]*\)".*/\1/')
|
||||
|
||||
if [ -z "$SCHEMATIC_ID" ] || [ "$SCHEMATIC_ID" = "$SCHEMATIC_RESPONSE" ]; then
|
||||
echo "Error: Failed to parse schematic ID from response: $SCHEMATIC_RESPONSE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "✅ Schematic uploaded successfully!"
|
||||
echo "Schematic ID: $SCHEMATIC_ID"
|
||||
|
||||
# Update config.yaml with schematic ID
|
||||
echo "Updating config.yaml with schematic ID..."
|
||||
yq eval ".cluster.nodes.talos.schematicId = \"$SCHEMATIC_ID\"" -i "$CONFIG_FILE"
|
||||
|
||||
echo ""
|
||||
echo "🎉 Schematic management complete!"
|
||||
echo ""
|
||||
echo "Schematic ID: $SCHEMATIC_ID"
|
||||
echo "Saved to: config.yaml (.cluster.nodes.talos.schematicId)"
|
||||
echo ""
|
||||
echo "This schematic includes:"
|
||||
yq eval '.cluster.nodes.talos.schematic.customization.systemExtensions.officialExtensions[]' "$CONFIG_FILE" | sed 's/^/ - /'
|
||||
echo ""
|
||||
echo "Other scripts can now use this schematicId:"
|
||||
echo " - setup/dnsmasq/bin/create-setup-bundle.sh (PXE boot assets)"
|
||||
echo " - setup/cluster-nodes/create-installer-image.sh (custom installer)"
|
||||
@@ -166,22 +166,13 @@ Shows:
|
||||
- Installation status
|
||||
- Required configuration
|
||||
|
||||
### 2. Fetching Phase
|
||||
**Command**: `wild-app-fetch <app-name>`
|
||||
|
||||
Downloads app templates to local cache:
|
||||
- Copies app directory from Wild Cloud repository
|
||||
- Stores in `.wildcloud/cache/apps/`
|
||||
- Options: `--update` to overwrite existing cache
|
||||
- Required before configuration or deployment
|
||||
|
||||
### 3. Configuration Phase
|
||||
### 2. Configuration Phase
|
||||
**Command**: `wild-app-add <app-name>`
|
||||
|
||||
Processes app templates and prepares for deployment:
|
||||
|
||||
**What it does**:
|
||||
1. Reads app manifest from cache
|
||||
1. Reads app manifest directly from Wild Cloud repository
|
||||
2. Merges default configuration with existing `config.yaml`
|
||||
3. Generates required secrets automatically
|
||||
4. Compiles templates with gomplate using your configuration
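Conceptually, the compile step looks something like the sketch below; the exact paths, context names, and flags `wild-app-add` passes to gomplate are internal details and may differ, and the `apps/ghost/...` paths are illustrative only:

```bash
# Illustrative sketch only: compile one app template using the operator's config as context
gomplate \
  -c .=config.yaml \
  -f apps/ghost/deployment.yaml \
  -o apps/ghost/kustomize/deployment.yaml
```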
|
||||
@@ -193,7 +184,7 @@ Processes app templates and prepares for deployment:
|
||||
- App-specific configuration merged into your `config.yaml`
|
||||
- Required secrets added to your `secrets.yaml`
|
||||
|
||||
### 4. Deployment Phase
|
||||
### 3. Deployment Phase
|
||||
**Command**: `wild-app-deploy <app-name>`
|
||||
|
||||
Deploys the app to your Kubernetes cluster:
|
||||
@@ -210,7 +201,7 @@ Deploys the app to your Kubernetes cluster:
|
||||
- `--force` - Overwrite existing resources
|
||||
- `--dry-run` - Preview changes without applying
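A cautious rollout can preview the result before applying it (app name illustrative):

```bash
wild-app-deploy ghost --dry-run   # preview changes without applying
wild-app-deploy ghost             # apply for real
```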
|
||||
|
||||
### 5. Operations Phase
|
||||
### 4. Operations Phase
|
||||
|
||||
**Monitoring**: `wild-app-doctor <app-name>`
|
||||
- Runs app-specific diagnostic tests
|
||||
@@ -218,6 +209,7 @@ Deploys the app to your Kubernetes cluster:
|
||||
- Options: `--keep`, `--follow`, `--timeout`
|
||||
|
||||
**Updates**: Re-run `wild-app-add` then `wild-app-deploy`
|
||||
- Use `--force` flag to overwrite existing configuration
|
||||
- Updates configuration changes
|
||||
- Handles image updates
|
||||
- Preserves persistent data
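A typical update cycle therefore looks like this (app name illustrative):

```bash
wild-app-add ghost --force    # refresh configuration and compiled templates
wild-app-deploy ghost         # roll out the changes
```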
|
||||
|
||||
@@ -56,9 +56,9 @@ Wild Cloud provides 34+ command-line tools (all prefixed with `wild-`) for manag
|
||||
### 🏗️ Cluster Infrastructure Management
|
||||
|
||||
**`wild-setup-cluster`** - Complete cluster setup (Phases 1-3)
|
||||
- Interactive node registration and hardware detection
|
||||
- Configures Talos control plane and worker nodes
|
||||
- Options: `--skip-installer`, `--skip-hardware`
|
||||
- Automated control plane node setup and bootstrapping
|
||||
- Configures Talos control plane nodes using wild-node-setup
|
||||
- Options: `--skip-hardware`
|
||||
- **Usage**: `wild-setup-cluster [options]`
|
||||
- **Requires**: `wild-init` completed first
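Typical invocations, assuming `wild-init` has already been run:

```bash
wild-setup-cluster                   # full interactive cluster setup
wild-setup-cluster --skip-hardware   # re-run without node hardware detection/setup
```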
|
||||
|
||||
@@ -67,20 +67,21 @@ Wild Cloud provides 34+ command-line tools (all prefixed with `wild-`) for manag
|
||||
- Generates cluster secrets using `talosctl gen config`
|
||||
- **Usage**: `wild-cluster-config-generate`
|
||||
|
||||
**`wild-node-detect`** - Hardware detection for nodes
|
||||
**`wild-node-setup`** - Complete node lifecycle management
|
||||
- Handles detect → configure → patch → deploy for individual nodes
|
||||
- Automatically detects maintenance mode and handles IP transitions
|
||||
- Options: `--reconfigure`, `--no-deploy`
|
||||
- **Usage**: `wild-node-setup <node-name> [options]`
|
||||
- **Examples**:
|
||||
- `wild-node-setup control-1` (complete setup)
|
||||
- `wild-node-setup worker-1 --reconfigure` (force node reconfiguration)
|
||||
- `wild-node-setup control-2 --no-deploy` (configuration only)
|
||||
|
||||
**`wild-node-detect`** - Hardware detection utility
|
||||
- Discovers network interfaces and disks from maintenance mode
|
||||
- Returns JSON with hardware specifications
|
||||
- Returns JSON with hardware specifications and maintenance mode status
|
||||
- **Usage**: `wild-node-detect <node-ip>`
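The JSON output can be inspected with `jq`, for example (node IP illustrative):

```bash
wild-node-detect 192.168.8.50 | jq '{interface, selected_disk, disks: [.disks[].path]}'
```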
|
||||
|
||||
**`wild-cluster-node-patch-generate`** - Generate node-specific configs
|
||||
- Creates patches based on hardware detection
|
||||
- Uses templates with node-specific variables
|
||||
- **Usage**: `wild-cluster-node-patch-generate <node-name>`
|
||||
|
||||
**`wild-cluster-node-up`** - Apply Talos configuration to nodes
|
||||
- Options: `--insecure`, `--force`, `--dry-run`
|
||||
- Generates final config from base + patch
|
||||
- **Usage**: `wild-cluster-node-up <node-name> [options]`
|
||||
- **Note**: Primarily used internally by `wild-node-setup`
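For reference, the manual equivalent of a single node setup is roughly:

```bash
wild-cluster-node-patch-generate worker-1   # generate the node-specific patch and final config
wild-cluster-node-up worker-1 --insecure    # apply it; --insecure while the node is in maintenance mode
```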
|
||||
|
||||
**`wild-cluster-node-ip`** - Get node IP addresses
|
||||
- Sources: config.yaml, kubectl, or talosctl
|
||||
@@ -89,8 +90,8 @@ Wild Cloud provides 34+ command-line tools (all prefixed with `wild-`) for manag
|
||||
|
||||
### 🔧 Cluster Services Management
|
||||
|
||||
**`wild-setup-services`** - Install cluster services (Phase 4)
|
||||
- Manages MetalLB, Traefik, cert-manager, etc.
|
||||
**`wild-setup-services`** - Set up all cluster services (Phase 4)
|
||||
- Manages MetalLB, Traefik, cert-manager, etc. in dependency order
|
||||
- Options: `--fetch` for fresh templates, `--no-deploy` for config-only
|
||||
- **Usage**: `wild-setup-services [options]`
|
||||
- **Requires**: Working Kubernetes cluster
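Typical usage, after confirming the cluster is reachable:

```bash
kubectl get nodes                 # sanity-check cluster access
wild-setup-services               # install all services in dependency order
wild-setup-services --no-deploy   # regenerate configuration without deploying
```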
|
||||
@@ -119,15 +120,11 @@ Wild Cloud provides 34+ command-line tools (all prefixed with `wild-`) for manag
|
||||
- Options: `--verbose`, `--json`, `--yaml`
|
||||
- **Usage**: `wild-apps-list [options]`
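Example invocations:

```bash
wild-apps-list --verbose   # human-readable details
wild-apps-list --json      # machine-readable output, e.g. for piping to jq
```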
|
||||
|
||||
**`wild-app-fetch`** - Download app templates to cache
|
||||
- Options: `--update` to overwrite existing
|
||||
- **Usage**: `wild-app-fetch <app-name> [--update]`
|
||||
|
||||
**`wild-app-add`** - Configure app from cache
|
||||
**`wild-app-add`** - Configure app from repository
|
||||
- Processes manifest.yaml with configuration
|
||||
- Generates required secrets automatically
|
||||
- Options: `--update` to overwrite existing app files
|
||||
- **Usage**: `wild-app-add <app-name> [--update]`
|
||||
- Options: `--force` to overwrite existing app files
|
||||
- **Usage**: `wild-app-add <app-name> [--force]`
|
||||
|
||||
**`wild-app-deploy`** - Deploy application to cluster
|
||||
- Creates namespaces, handles dependencies
|
||||
@@ -165,8 +162,10 @@ Wild Cloud provides 34+ command-line tools (all prefixed with `wild-`) for manag
|
||||
|
||||
### 🔍 Utilities & Helpers
|
||||
|
||||
**`wild-health`** - System health checks
|
||||
- Basic health monitoring for components
|
||||
**`wild-health`** - Comprehensive infrastructure validation
|
||||
- Validates core components (MetalLB, Traefik, CoreDNS)
|
||||
- Checks installed services (cert-manager, ExternalDNS, Kubernetes Dashboard)
|
||||
- Tests DNS resolution, routing, certificates, and storage systems
|
||||
- **Usage**: `wild-health`
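If `wild-health` reports problems, the same areas can be spot-checked manually with standard `kubectl` commands, for example:

```bash
kubectl get nodes
kubectl get pods -n cert-manager
kubectl get clusterissuers
kubectl get certificates --all-namespaces
```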
|
||||
|
||||
**`wild-talos-schema`** - Talos schema management
|
||||
@@ -211,7 +210,6 @@ wild-setup-services # Resume full setup if needed
|
||||
### Application Management
|
||||
```bash
|
||||
wild-apps-list # See available apps
|
||||
wild-app-fetch ghost # Download app templates
|
||||
wild-app-add ghost # Configure app
|
||||
wild-app-deploy ghost # Deploy to cluster
|
||||
wild-app-doctor ghost # Troubleshoot issues
|
||||
@@ -251,14 +249,14 @@ wild-health # Check system health
|
||||
|
||||
### App Deployment Pipeline
|
||||
1. `wild-apps-list` → discover applications
|
||||
2. `wild-app-fetch` → cache app templates
|
||||
3. `wild-app-add` → prepare configuration
|
||||
4. `wild-app-deploy` → deploy to cluster
|
||||
2. `wild-app-add` → configure and prepare application
|
||||
3. `wild-app-deploy` → deploy to cluster
|
||||
|
||||
### Node Management Flow
|
||||
1. `wild-cluster-config-generate` → base configurations
|
||||
2. `wild-node-detect` → discover hardware
|
||||
3. `wild-cluster-node-patch-generate` → node-specific configs
|
||||
4. `wild-cluster-node-up` → apply configurations
|
||||
2. `wild-node-setup <node-name>` → atomic node operations (detect → patch → deploy)
|
||||
- Internally uses `wild-node-detect` for hardware discovery
|
||||
- Generates node-specific patches and final configurations
|
||||
- Deploys configuration to target node
|
||||
|
||||
All scripts are designed to work together as a cohesive Infrastructure as Code system for personal Kubernetes deployments.
|
||||
@@ -153,7 +153,6 @@ wild-setup-services # Install core services
|
||||
wild-apps-list
|
||||
|
||||
# Deploy a blog
|
||||
wild-app-fetch ghost
|
||||
wild-app-add ghost
|
||||
wild-app-deploy ghost
|
||||
|
||||
@@ -215,10 +214,9 @@ storage: 10Gi
|
||||
### Application Lifecycle
|
||||
|
||||
1. **Discovery**: `wild-apps-list` - Browse available apps
|
||||
2. **Fetching**: `wild-app-fetch app-name` - Download templates
|
||||
3. **Configuration**: `wild-app-add app-name` - Process and configure
|
||||
4. **Deployment**: `wild-app-deploy app-name` - Deploy to cluster
|
||||
5. **Operations**: `wild-app-doctor app-name` - Monitor and troubleshoot
|
||||
2. **Configuration**: `wild-app-add app-name` - Configure and prepare application
|
||||
3. **Deployment**: `wild-app-deploy app-name` - Deploy to cluster
|
||||
4. **Operations**: `wild-app-doctor app-name` - Monitor and troubleshoot
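Run end to end, the lifecycle for a single app (Immich used purely as an example) is:

```bash
wild-apps-list
wild-app-add immich
wild-app-deploy immich
wild-app-doctor immich
```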
|
||||
|
||||
## Available Applications
|
||||
|
||||
@@ -259,7 +257,6 @@ wild-setup-services # Deploy cluster services only
|
||||
### Application Management
|
||||
```bash
|
||||
wild-apps-list # List available applications
|
||||
wild-app-fetch <app> # Download app templates
|
||||
wild-app-add <app> # Configure application
|
||||
wild-app-deploy <app> # Deploy to cluster
|
||||
wild-app-delete <app> # Remove application
|
||||
@@ -317,7 +314,6 @@ wild-app-backup <app> # Backup specific application
|
||||
### Personal Blog/Website
|
||||
```bash
|
||||
# Deploy Ghost blog with custom domain
|
||||
wild-app-fetch ghost
|
||||
wild-config-set apps.ghost.domain "blog.yourdomain.com"
|
||||
wild-app-add ghost
|
||||
wild-app-deploy ghost
|
||||
@@ -326,7 +322,6 @@ wild-app-deploy ghost
|
||||
### Photo Management
|
||||
```bash
|
||||
# Deploy Immich for photo backup and management
|
||||
wild-app-fetch immich postgresql
|
||||
wild-app-add postgresql immich
|
||||
wild-app-deploy postgresql immich
|
||||
```
|
||||
@@ -334,7 +329,6 @@ wild-app-deploy postgresql immich
|
||||
### Development Environment
|
||||
```bash
|
||||
# Set up Git hosting and container registry
|
||||
wild-app-fetch gitea docker-registry
|
||||
wild-app-add gitea docker-registry
|
||||
wild-app-deploy gitea docker-registry
|
||||
```
|
||||
@@ -342,7 +336,6 @@ wild-app-deploy gitea docker-registry
|
||||
### AI/ML Workloads
|
||||
```bash
|
||||
# Deploy vLLM for local AI inference
|
||||
wild-app-fetch vllm
|
||||
wild-config-set apps.vllm.model "Qwen/Qwen2.5-7B-Instruct"
|
||||
wild-app-add vllm
|
||||
wild-app-deploy vllm
|
||||
|
||||
@@ -227,11 +227,10 @@ cluster:
|
||||
### From Repository to Deployment
|
||||
|
||||
1. **Template Storage**: Templates stored in repository with placeholder variables
|
||||
2. **Template Fetching**: `wild-app-fetch` copies templates to user cache
|
||||
3. **Configuration Merge**: `wild-app-add` merges app defaults with user config
|
||||
4. **Template Compilation**: gomplate processes templates with user data
|
||||
5. **Manifest Generation**: Final Kubernetes manifests created in user directory
|
||||
6. **Deployment**: `wild-app-deploy` applies manifests to cluster
|
||||
2. **Configuration Merge**: `wild-app-add` reads templates directly from repository and merges app defaults with user config
|
||||
3. **Template Compilation**: gomplate processes templates with user data
|
||||
4. **Manifest Generation**: Final Kubernetes manifests created in user directory
|
||||
5. **Deployment**: `wild-app-deploy` applies manifests to cluster
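Because `wild-app-add` compiles the app's Kustomize files, the deploy step amounts to applying a compiled kustomization; done by hand it would look roughly like this (the directory layout is an assumption, not a documented path):

```bash
kubectl apply -k apps/ghost/kustomize
```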
|
||||
|
||||
### Template Variables
|
||||
|
||||
@@ -344,7 +343,6 @@ wild-setup # Deploy infrastructure
|
||||
**Daily Operations**:
|
||||
```bash
|
||||
wild-apps-list # Browse available apps
|
||||
wild-app-fetch ghost # Download app templates
|
||||
wild-app-add ghost # Configure app
|
||||
wild-app-deploy ghost # Deploy to cluster
|
||||
```
|
||||
|
||||
@@ -86,30 +86,22 @@ network:
|
||||
- Creates cluster secrets using `talosctl gen config`
|
||||
- Establishes foundation for all node configurations
|
||||
|
||||
#### 2. Hardware Detection
|
||||
**Script**: `wild-node-detect`
|
||||
#### 2. Node Setup (Atomic Operations)
|
||||
**Script**: `wild-node-setup <node-name> [options]`
|
||||
|
||||
Interactive process for each node:
|
||||
- Boots nodes into maintenance mode via PXE
|
||||
- Detects network interfaces and storage devices
|
||||
- Returns JSON specification of hardware capabilities
|
||||
- Records node-specific configuration data
|
||||
**Complete Node Lifecycle Management**:
|
||||
- **Hardware Detection**: Discovers network interfaces and storage devices
|
||||
- **Configuration Generation**: Creates node-specific patches and final configs
|
||||
- **Deployment**: Applies Talos configuration to the node
|
||||
|
||||
#### 3. Node-Specific Configuration
|
||||
**Script**: `wild-cluster-node-patch-generate`
|
||||
**Options**:
|
||||
- `--detect`: Force hardware re-detection
|
||||
- `--no-deploy`: Generate configuration only, skip deployment
|
||||
|
||||
- Generates patches for individual nodes
|
||||
- Uses templates with detected hardware specifications
|
||||
- Creates node-specific machine configurations
|
||||
- Handles IP addresses, interfaces, and disk layout
|
||||
|
||||
#### 4. Node Deployment
|
||||
**Script**: `wild-cluster-node-up`
|
||||
|
||||
- Applies Talos configurations to nodes
|
||||
- Supports `--insecure` for maintenance mode
|
||||
- Generates final configs from base + patches
|
||||
- Deploys both control plane and worker nodes
|
||||
**Integration with Cluster Setup**:
|
||||
- `wild-setup-cluster` automatically calls `wild-node-setup` for each node
|
||||
- Individual node failures don't break cluster setup
|
||||
- Clear retry instructions for failed nodes
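In practice this means a failed node can be retried on its own once the underlying issue is fixed, without re-running the whole cluster setup:

```bash
wild-node-setup worker-1
```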
|
||||
|
||||
### Cluster Architecture
|
||||
|
||||
@@ -363,8 +355,9 @@ wild-setup-services # Cluster services only
|
||||
### Individual Operations
|
||||
```bash
|
||||
wild-cluster-config-generate # Generate base configs
|
||||
wild-node-detect <ip> # Hardware detection
|
||||
wild-cluster-node-up <node> # Deploy single node
|
||||
wild-node-setup <node-name> # Complete node setup (detect → configure → deploy)
|
||||
wild-node-setup <node-name> --detect # Force hardware re-detection
|
||||
wild-node-setup <node-name> --no-deploy # Configuration only
|
||||
wild-dashboard-token # Get dashboard access
|
||||
wild-health # System health check
|
||||
```
|
||||
|
||||
@@ -220,8 +220,7 @@ This approach prevents naming conflicts between apps and makes secret keys more
|
||||
Apps in Wild Cloud are managed by operators using a set of commands run from their Wild Cloud home directory.
|
||||
|
||||
- `wild-apps-list`: Lists all available apps.
|
||||
- `wild-app-fetch <app-name>`: Fetches the latest app files from the Wild Cloud repository and stores them in your Wild Cloud cache.
|
||||
- `wild-app-add <app-name>`: Adds the app manifest to your Wild Cloud home `apps` directory, updates missing values in `config.yaml` and `secrets.yaml` with the app's default configurations, and compiles the app's Kustomize files.
|
||||
- `wild-app-add <app-name>`: Reads the app from the Wild Cloud repository, adds the app manifest to your Wild Cloud home `apps` directory, updates missing values in `config.yaml` and `secrets.yaml` with the app's default configurations, and compiles the app's Kustomize files.
|
||||
- `wild-app-deploy <app-name>`: Deploys the app to your Wild Cloud.
|
||||
|
||||
## Contributing
|
||||
|
||||
@@ -14,16 +14,10 @@ To list all available apps:
|
||||
wild-apps-list
|
||||
```
|
||||
|
||||
To fetch an app template to cache:
|
||||
To configure an app (reads directly from repository):
|
||||
|
||||
```bash
|
||||
wild-app-fetch <app>
|
||||
```
|
||||
|
||||
To apply your configuration to a cached app (automatically fetches if not cached):
|
||||
|
||||
```bash
|
||||
wild-app-config <app>
|
||||
wild-app-add <app>
|
||||
```
|
||||
|
||||
To deploy a configured app to Kubernetes:
|
||||
|
||||
@@ -11,9 +11,9 @@ import (
|
||||
// Config represents the main configuration structure
|
||||
type Config struct {
|
||||
Wildcloud struct {
|
||||
Repository string `yaml:"repository" json:"repository"`
|
||||
CurrentPhase string `yaml:"currentPhase" json:"currentPhase"`
|
||||
CompletedPhases []string `yaml:"completedPhases" json:"completedPhases"`
|
||||
Repository string `yaml:"repository" json:"repository"`
|
||||
CurrentPhase string `yaml:"currentPhase" json:"currentPhase"`
|
||||
CompletedPhases []string `yaml:"completedPhases" json:"completedPhases"`
|
||||
} `yaml:"wildcloud" json:"wildcloud"`
|
||||
Server struct {
|
||||
Port int `yaml:"port" json:"port"`
|
||||
@@ -89,6 +89,6 @@ func (c *Config) IsEmpty() bool {
|
||||
|
||||
// Check if any essential fields are empty
|
||||
return c.Cloud.Domain == "" ||
|
||||
c.Cloud.DNS.IP == "" ||
|
||||
c.Cluster.Nodes.Talos.Version == ""
|
||||
c.Cloud.DNS.IP == "" ||
|
||||
c.Cluster.Nodes.Talos.Version == ""
|
||||
}
|
||||
@@ -41,23 +41,39 @@ NC='\033[0m' # No Color
|
||||
|
||||
# Print functions for consistent output formatting
|
||||
print_header() {
|
||||
echo -e "\n${BLUE} $1 ===${NC}\n"
|
||||
echo -e "\n${BLUE}=== $1 ===${NC}\n"
|
||||
}
|
||||
|
||||
print_info() {
|
||||
echo -e "${BLUE}ℹ️ ${NC} $1"
|
||||
if [ "$1" = "-n" ]; then
|
||||
echo -ne "${BLUE}ℹ️ ${NC} $2"
|
||||
else
|
||||
echo -e "${BLUE}ℹ️ ${NC} $1"
|
||||
fi
|
||||
}
|
||||
|
||||
print_warning() {
|
||||
echo -e "${YELLOW}⚠️ ${NC} $1"
|
||||
if [ "$1" = "-n" ]; then
|
||||
echo -ne "${YELLOW}⚠️ ${NC} $2"
|
||||
else
|
||||
echo -e "${YELLOW}⚠️ ${NC} $1"
|
||||
fi
|
||||
}
|
||||
|
||||
print_success() {
|
||||
echo -e "${GREEN}✅ ${NC} $1"
|
||||
if [ "$1" = "-n" ]; then
|
||||
echo -ne "${GREEN}✅${NC} $2"
|
||||
else
|
||||
echo -e "${GREEN}✅${NC} $1"
|
||||
fi
|
||||
}
|
||||
|
||||
print_error() {
|
||||
echo -e "${RED}❌ ${NC} $1"
|
||||
if [ "$1" = "-n" ]; then
|
||||
echo -ne "${RED}❌${NC} $2"
|
||||
else
|
||||
echo -e "${RED}❌${NC} $1"
|
||||
fi
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
#!/bin/bash
|
||||
|
||||
ARCH=arm64
|
||||
|
||||
# Install kubectl
|
||||
if ! command -v kubectl &> /dev/null; then
|
||||
echo "Error: kubectl is not installed. Installing."
|
||||
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
|
||||
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl.sha256"
|
||||
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/${ARCH}/kubectl"
|
||||
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/${ARCH}/kubectl.sha256"
|
||||
echo "$(cat kubectl.sha256) kubectl" | sha256sum --check
|
||||
sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
|
||||
fi
|
||||
@@ -13,7 +15,7 @@ fi
|
||||
if command -v gomplate &> /dev/null; then
|
||||
echo "gomplate is already installed."
|
||||
else
|
||||
curl -sSL https://github.com/hairyhenderson/gomplate/releases/latest/download/gomplate_linux-amd64 -o $HOME/.local/bin/gomplate
|
||||
curl -sSL https://github.com/hairyhenderson/gomplate/releases/latest/download/gomplate_linux-${ARCH} -o $HOME/.local/bin/gomplate
|
||||
chmod +x $HOME/.local/bin/gomplate
|
||||
echo "gomplate installed successfully."
|
||||
fi
|
||||
@@ -32,7 +34,7 @@ if command -v yq &> /dev/null; then
|
||||
echo "yq is already installed."
|
||||
else
|
||||
VERSION=v4.45.4
|
||||
BINARY=yq_linux_amd64
|
||||
BINARY=yq_linux_${ARCH}
|
||||
wget https://github.com/mikefarah/yq/releases/download/${VERSION}/${BINARY}.tar.gz -O - | tar xz
|
||||
mv ${BINARY} $HOME/.local/bin/yq
|
||||
chmod +x $HOME/.local/bin/yq
|
||||
|
||||
@@ -4,8 +4,9 @@ print_info "Collecting cert-manager configuration..."
|
||||
|
||||
prompt_if_unset_config "cloud.domain" "Enter main domain name" "example.com"
|
||||
domain=$(wild-config "cloud.domain")
|
||||
baseDomain=$(wild-config "cloud.baseDomain")
|
||||
prompt_if_unset_config "cloud.internalDomain" "Enter internal domain name" "local.${domain}"
|
||||
prompt_if_unset_config "operator.email" "Enter operator email address (for Let's Encrypt)" ""
|
||||
prompt_if_unset_config "cluster.certManager.cloudflare.domain" "Enter Cloudflare domain (for DNS challenges)" "${domain}"
|
||||
prompt_if_unset_config "cluster.certManager.cloudflare.zoneID" "Enter Cloudflare zone ID (for DNS challenges - improves reliability)" ""
|
||||
prompt_if_unset_secret "cloudflare.token" "Enter Cloudflare API token (for DNS challenges)" ""
|
||||
prompt_if_unset_config "cluster.certManager.cloudflare.domain" "Enter Cloudflare domain" "${baseDomain}"
|
||||
prompt_if_unset_config "cluster.certManager.cloudflare.zoneID" "Enter Cloudflare zone ID" ""
|
||||
prompt_if_unset_secret "cloudflare.token" "Enter Cloudflare API token" ""
|
||||
|
||||
@@ -16,6 +16,10 @@ CERT_MANAGER_DIR="${CLUSTER_SETUP_DIR}/cert-manager"
|
||||
|
||||
print_header "Setting up cert-manager"
|
||||
|
||||
#######################
|
||||
# # Dependencies
|
||||
#######################
|
||||
|
||||
# Check Traefik dependency
|
||||
print_info "Verifying Traefik is ready (required for cert-manager)..."
|
||||
kubectl wait --for=condition=Available deployment/traefik -n traefik --timeout=60s 2>/dev/null || {
|
||||
@@ -23,45 +27,36 @@ kubectl wait --for=condition=Available deployment/traefik -n traefik --timeout=6
|
||||
print_info "Note: cert-manager may not work properly without Traefik"
|
||||
}
|
||||
|
||||
# Templates should already be compiled by wild-cluster-services-configure
|
||||
print_info "Using pre-compiled cert-manager templates..."
|
||||
if [ ! -d "${CERT_MANAGER_DIR}/kustomize" ]; then
|
||||
print_error "Compiled templates not found. Run 'wild-cluster-services-configure' first."
|
||||
print_error "Compiled templates not found. This script should not be run directly. Run with 'wild setup cluster-services cert-manager' instead."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
print_info "Setting up cert-manager..."
|
||||
# Validate DNS resolution using temporary test pod
|
||||
print_info "Validating DNS resolution for ACME challenges..."
|
||||
domain=$(wild-config cluster.certManager.cloudflare.domain)
|
||||
print_info "Testing DNS resolution for domain: $domain"
|
||||
|
||||
# Install cert-manager using the official installation method
|
||||
# This installs CRDs, controllers, and webhook components
|
||||
print_info "Installing cert-manager components..."
|
||||
# Using stable URL for cert-manager installation
|
||||
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml || \
|
||||
kubectl apply -f https://github.com/jetstack/cert-manager/releases/download/v1.13.1/cert-manager.yaml
|
||||
|
||||
# Wait for cert-manager to be ready
|
||||
print_info "Waiting for cert-manager to be ready..."
|
||||
kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=120s
|
||||
kubectl wait --for=condition=Available deployment/cert-manager-cainjector -n cert-manager --timeout=120s
|
||||
kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s
|
||||
|
||||
# Ensure webhook is fully operational
|
||||
print_info "Verifying cert-manager webhook is fully operational..."
|
||||
until kubectl get validatingwebhookconfigurations cert-manager-webhook &>/dev/null; do
|
||||
print_info "Waiting for cert-manager webhook to register..."
|
||||
sleep 5
|
||||
done
|
||||
|
||||
# Test webhook connectivity before proceeding
|
||||
print_info "Testing webhook connectivity..."
|
||||
kubectl auth can-i create certificates.cert-manager.io --as=system:serviceaccount:cert-manager:cert-manager
|
||||
# Create temporary pod with DNS utilities (in default namespace since cert-manager doesn't exist yet)
|
||||
kubectl run dns-test --image=busybox:1.35 --rm -i --restart=Never -- \
|
||||
nslookup -type=SOA "$domain" 1.1.1.1 &>/dev/null && \
|
||||
print_success "DNS resolution working for ACME challenges" || \
|
||||
print_warning "DNS resolution issues may affect ACME challenges"
|
||||
|
||||
|
||||
# Setup Cloudflare API token for DNS01 challenges
|
||||
print_info "Creating Cloudflare API token secret..."
|
||||
########################
|
||||
# Cloudflare DNS setup
|
||||
########################
|
||||
|
||||
# API token secret setup
|
||||
print_info "Reading Cloudflare API token secret..."
|
||||
CLOUDFLARE_API_TOKEN=$(wild-secret cloudflare.token) || exit 1
|
||||
if [ -z "$CLOUDFLARE_API_TOKEN" ]; then
|
||||
print_error "Cloudflare API token not found. Please create it with 'wild secret create cloudflare.token'."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Validate Cloudflare API token permissions
|
||||
# Validate token
|
||||
print_info "Validating Cloudflare API token permissions..."
|
||||
validate_cloudflare_token() {
|
||||
local token="$1"
|
||||
@@ -91,11 +86,40 @@ validate_cloudflare_token "$CLOUDFLARE_API_TOKEN" || {
|
||||
print_info "Required permissions: Zone - Zone - Read, Zone - DNS - Edit"
|
||||
exit 1
|
||||
}
|
||||
|
||||
########################
|
||||
# Kubernetes components
|
||||
########################
|
||||
|
||||
print_info "Installing cert-manager components..."
|
||||
# Using stable URL for cert-manager installation
|
||||
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.1/cert-manager.yaml || \
|
||||
kubectl apply -f https://github.com/jetstack/cert-manager/releases/download/v1.13.1/cert-manager.yaml
|
||||
|
||||
# Wait for cert-manager to be ready
|
||||
print_info "Waiting for cert-manager to be ready..."
|
||||
kubectl wait --for=condition=Available deployment/cert-manager -n cert-manager --timeout=120s
|
||||
kubectl wait --for=condition=Available deployment/cert-manager-cainjector -n cert-manager --timeout=120s
|
||||
kubectl wait --for=condition=Available deployment/cert-manager-webhook -n cert-manager --timeout=120s
|
||||
|
||||
# Now that cert-manager namespace exists, create the Cloudflare API token secret
|
||||
print_info "Creating Cloudflare API token secret..."
|
||||
kubectl create secret generic cloudflare-api-token \
|
||||
--namespace cert-manager \
|
||||
--from-literal=api-token="${CLOUDFLARE_API_TOKEN}" \
|
||||
--dry-run=client -o yaml | kubectl apply -f -
|
||||
|
||||
# Ensure webhook is fully operational
|
||||
print_info "Verifying cert-manager webhook is fully operational..."
|
||||
until kubectl get validatingwebhookconfigurations cert-manager-webhook &>/dev/null; do
|
||||
print_info "Waiting for cert-manager webhook to register..."
|
||||
sleep 5
|
||||
done
|
||||
|
||||
# Test webhook connectivity before proceeding
|
||||
print_info "Testing webhook connectivity..."
|
||||
kubectl auth can-i create certificates.cert-manager.io --as=system:serviceaccount:cert-manager:cert-manager
|
||||
|
||||
# Configure cert-manager to use external DNS for challenge verification
|
||||
print_info "Configuring cert-manager to use external DNS servers..."
|
||||
kubectl patch deployment cert-manager -n cert-manager --patch '
|
||||
@@ -119,6 +143,10 @@ spec:
|
||||
print_info "Waiting for cert-manager to restart with new DNS configuration..."
|
||||
kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
|
||||
|
||||
########################
|
||||
# Create issuers and certificates
|
||||
########################
|
||||
|
||||
# Apply Let's Encrypt issuers and certificates using kustomize
|
||||
print_info "Creating Let's Encrypt issuers and certificates..."
|
||||
kubectl apply -k ${CERT_MANAGER_DIR}/kustomize
|
||||
@@ -128,27 +156,217 @@ print_info "Waiting for Let's Encrypt issuers to be ready..."
|
||||
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-prod --timeout=60s || print_warning "Production issuer not ready, proceeding anyway..."
|
||||
kubectl wait --for=condition=Ready clusterissuer/letsencrypt-staging --timeout=60s || print_warning "Staging issuer not ready, proceeding anyway..."
|
||||
|
||||
# Validate DNS resolution using temporary test pod
|
||||
print_info "Validating DNS resolution for ACME challenges..."
|
||||
domain=$(wild-config cluster.certManager.cloudflare.domain)
|
||||
print_info "Testing DNS resolution for domain: $domain"
|
||||
# Give cert-manager a moment to process the certificates
|
||||
sleep 5
|
||||
|
||||
# Create temporary pod with DNS utilities
|
||||
kubectl run dns-test --image=busybox:1.35 --rm -i --restart=Never -n cert-manager -- \
|
||||
nslookup -type=SOA "$domain" 1.1.1.1 &>/dev/null && \
|
||||
print_success "DNS resolution working for ACME challenges" || \
|
||||
print_warning "DNS resolution issues may affect ACME challenges"
|
||||
######################################
|
||||
# Fix stuck certificates and cleanup
|
||||
######################################
|
||||
|
||||
print_info "Wildcard certificate creation initiated. This may take some time to complete depending on DNS propagation."
|
||||
needs_restart=false
|
||||
|
||||
# STEP 1: Fix certificates stuck with 404 errors FIRST (before cleaning up orders)
|
||||
print_info "Checking for certificates with failed issuance attempts..."
|
||||
stuck_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | \
|
||||
jq -r '.items[] | select(.status.conditions[]? | select(.type=="Issuing" and .status=="False" and (.message | contains("404")))) | "\(.metadata.namespace) \(.metadata.name)"')
|
||||
|
||||
if [ -n "$stuck_certs" ]; then
|
||||
print_warning "Found certificates stuck with non-existent orders, recreating them..."
|
||||
echo "$stuck_certs" | while read ns name; do
|
||||
print_info "Recreating certificate $ns/$name..."
|
||||
# Get just the spec
|
||||
cert_spec=$(kubectl get certificate "$name" -n "$ns" -o json | jq '.spec')
|
||||
# Delete the certificate
|
||||
kubectl delete certificate "$name" -n "$ns"
|
||||
# Recreate with clean state
|
||||
echo "{\"apiVersion\":\"cert-manager.io/v1\",\"kind\":\"Certificate\",\"metadata\":{\"name\":\"$name\",\"namespace\":\"$ns\"},\"spec\":$cert_spec}" | kubectl apply -f -
|
||||
done
|
||||
needs_restart=true
|
||||
# Give cert-manager time to process the recreated certificates
|
||||
sleep 5
|
||||
else
|
||||
print_success "No certificates stuck with failed orders"
|
||||
fi
|
||||
|
||||
# STEP 2: Clean up orphaned orders (after fixing certificates)
|
||||
print_info "Checking for orphaned ACME orders..."
|
||||
|
||||
# Check logs for 404 errors
|
||||
orphaned_orders=$(kubectl logs -n cert-manager deployment/cert-manager --tail=200 2>/dev/null | \
|
||||
grep -E "failed to retrieve the ACME order.*404" 2>/dev/null | \
|
||||
sed -n 's/.*resource_name="\([^"]*\)".*/\1/p' | \
|
||||
sort -u || true)
|
||||
|
||||
if [ -n "$orphaned_orders" ]; then
|
||||
print_warning "Found orphaned ACME orders from logs"
|
||||
for order in $orphaned_orders; do
|
||||
print_info "Deleting orphaned order: $order"
|
||||
# Find and delete the order in whatever namespace it exists
|
||||
orders_found=$(kubectl get orders --all-namespaces 2>/dev/null | grep "$order" 2>/dev/null || true)
|
||||
if [ -n "$orders_found" ]; then
|
||||
echo "$orders_found" | while read ns name rest; do
|
||||
kubectl delete order "$name" -n "$ns" 2>/dev/null || true
|
||||
done
|
||||
fi
|
||||
done
|
||||
needs_restart=true
|
||||
else
|
||||
print_success "No orphaned orders found in logs"
|
||||
fi
|
||||
|
||||
# Check for errored state orders
|
||||
errored_orders=$(kubectl get orders --all-namespaces -o json 2>/dev/null | \
|
||||
jq -r '.items[] | select(.status.state == "errored") | "\(.metadata.namespace) \(.metadata.name)"')
|
||||
|
||||
if [ -n "$errored_orders" ]; then
|
||||
print_warning "Found errored ACME orders"
|
||||
echo "$errored_orders" | while read ns name; do
|
||||
print_info "Deleting errored order: $ns/$name"
|
||||
kubectl delete order "$name" -n "$ns" 2>/dev/null || true
|
||||
done
|
||||
needs_restart=true
|
||||
else
|
||||
print_success "No errored orders found"
|
||||
fi
|
||||
|
||||
# STEP 3: Clean up bad challenges
|
||||
print_info "Checking for stuck ACME challenges..."
|
||||
|
||||
# Delete expired, invalid, or errored challenges
|
||||
bad_challenges=$(kubectl get challenges --all-namespaces -o json 2>/dev/null | \
|
||||
jq -r '.items[] | select(.status.state == "expired" or .status.state == "invalid" or .status.state == "errored") | "\(.metadata.namespace) \(.metadata.name) \(.status.state)"')
|
||||
|
||||
if [ -n "$bad_challenges" ]; then
|
||||
print_warning "Found stuck ACME challenges"
|
||||
echo "$bad_challenges" | while read ns name state; do
|
||||
print_info "Deleting $state challenge: $ns/$name"
|
||||
kubectl delete challenge "$name" -n "$ns" 2>/dev/null || true
|
||||
done
|
||||
needs_restart=true
|
||||
else
|
||||
print_success "No stuck challenges found"
|
||||
fi
|
||||
|
||||
# Delete very old challenges (over 1 hour) - only if they exist
|
||||
all_challenges=$(kubectl get challenges --all-namespaces -o json 2>/dev/null | jq '.items | length' || echo 0)
|
||||
if [ "$all_challenges" -gt 0 ]; then
|
||||
old_challenges=$(kubectl get challenges --all-namespaces -o json 2>/dev/null | \
|
||||
jq -r --arg cutoff "$(date -u -d '1 hour ago' '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null || date -u -v-1H '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null)" \
|
||||
'.items[] | select(.metadata.creationTimestamp < $cutoff) | "\(.metadata.namespace) \(.metadata.name)"')
|
||||
|
||||
if [ -n "$old_challenges" ]; then
|
||||
print_warning "Found old challenges (over 1 hour)"
|
||||
echo "$old_challenges" | while read ns name; do
|
||||
print_info "Deleting old challenge: $ns/$name"
|
||||
kubectl delete challenge "$name" -n "$ns" 2>/dev/null || true
|
||||
done
|
||||
needs_restart=true
|
||||
fi
|
||||
fi
|
||||
|
||||
# STEP 4: Check for DNS errors
|
||||
dns_errors=$(kubectl logs -n cert-manager deployment/cert-manager --tail=50 2>/dev/null | \
|
||||
grep "Could not route to /client/v4/zones/dns_records" | wc -l | tr -d '\n' || echo "0")
|
||||
dns_errors=${dns_errors:-0}
|
||||
|
||||
if [ "$dns_errors" -gt 0 ]; then
|
||||
print_warning "Cert-manager has DNS record cleanup errors"
|
||||
needs_restart=true
|
||||
fi
|
||||
|
||||
# STEP 5: Single restart if anything needs cleaning
|
||||
if [ "$needs_restart" = true ]; then
|
||||
print_info "Restarting cert-manager once to clear all internal state..."
|
||||
kubectl rollout restart deployment cert-manager -n cert-manager
|
||||
kubectl rollout status deployment/cert-manager -n cert-manager --timeout=120s
|
||||
# Give cert-manager time to reinitialize
|
||||
sleep 10
|
||||
else
|
||||
print_success "No restart needed - cert-manager state is clean"
|
||||
fi
|
||||
|
||||
|
||||
##################################
|
||||
# Handle certificate renewal
|
||||
##################################
|
||||
|
||||
# Check for expired or near-expiry certificates and trigger renewal
|
||||
print_info "Checking certificate expiration status..."
|
||||
current_date=$(date +%s)
|
||||
|
||||
# Track if we found any issues
|
||||
found_expired=false
|
||||
found_expiring_soon=false
|
||||
all_certs_valid=true
|
||||
|
||||
# Process certificates and collect their status
|
||||
while IFS= read -r line; do
|
||||
ns=$(echo "$line" | awk '{print $1}')
|
||||
name=$(echo "$line" | awk '{print $2}')
|
||||
secret=$(echo "$line" | awk '{print $3}')
|
||||
expiry=$(echo "$line" | awk '{print $4}')
|
||||
|
||||
if [ "$expiry" != "unknown" ] && [ "$expiry" != "null" ] && [ "$expiry" != "" ]; then
|
||||
expiry_ts=$(date -d "$expiry" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$expiry" +%s 2>/dev/null || echo 0)
|
||||
if [ "$expiry_ts" -gt 0 ]; then
|
||||
days_until_expiry=$(( (expiry_ts - current_date) / 86400 ))
|
||||
|
||||
if [ "$days_until_expiry" -lt 0 ]; then
|
||||
print_warning "Certificate $ns/$name has EXPIRED (expired ${days_until_expiry#-} days ago)"
|
||||
if [ -n "$secret" ] && [ "$secret" != "unknown" ] && [ "$secret" != "null" ]; then
|
||||
print_info "Deleting secret $secret to trigger renewal..."
|
||||
kubectl delete secret "$secret" -n "$ns" 2>/dev/null || true
|
||||
found_expired=true
|
||||
all_certs_valid=false
|
||||
fi
|
||||
elif [ "$days_until_expiry" -lt 7 ]; then
|
||||
print_warning "Certificate $ns/$name expires in $days_until_expiry days"
|
||||
if [ "$days_until_expiry" -lt 3 ]; then
|
||||
# Force renewal for certificates expiring very soon
|
||||
if [ -n "$secret" ] && [ "$secret" != "unknown" ] && [ "$secret" != "null" ]; then
|
||||
print_info "Forcing renewal by deleting secret $secret..."
|
||||
kubectl delete secret "$secret" -n "$ns" 2>/dev/null || true
|
||||
found_expiring_soon=true
|
||||
all_certs_valid=false
|
||||
fi
|
||||
else
|
||||
print_info "Will renew automatically when closer to expiry"
|
||||
fi
|
||||
elif [ "$days_until_expiry" -lt 30 ]; then
|
||||
print_info "Certificate $ns/$name expires in $days_until_expiry days (renewal not needed yet)"
|
||||
else
|
||||
print_success "Certificate $ns/$name is valid for $days_until_expiry days"
|
||||
fi
|
||||
fi
|
||||
else
|
||||
# Certificate has no expiry (being issued)
|
||||
print_info "Certificate $ns/$name is currently being issued..."
|
||||
fi
|
||||
done < <(kubectl get certificates --all-namespaces -o json 2>/dev/null | jq -r '.items[] | "\(.metadata.namespace) \(.metadata.name) \(.spec.secretName) \(.status.notAfter // "unknown")"')
|
||||
|
||||
if [ "$all_certs_valid" = true ]; then
|
||||
print_success "All certificates are valid - no renewals needed"
|
||||
fi
|
||||
|
||||
|
||||
#########################
|
||||
# Final checks
|
||||
#########################
|
||||
|
||||
# Wait for the certificates to be issued (with a timeout)
|
||||
print_info "Waiting for wildcard certificates to be ready (this may take several minutes)..."
|
||||
kubectl wait --for=condition=Ready certificate wildcard-internal-wild-cloud -n cert-manager --timeout=300s || true
|
||||
kubectl wait --for=condition=Ready certificate wildcard-wild-cloud -n cert-manager --timeout=300s || true
|
||||
|
||||
# Final health check
|
||||
print_info "Performing final cert-manager health check..."
|
||||
failed_certs=$(kubectl get certificates --all-namespaces -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Ready" and .status!="True")) | "\(.metadata.namespace)/\(.metadata.name)"' | wc -l)
|
||||
if [ "$failed_certs" -gt 0 ]; then
|
||||
print_warning "Found $failed_certs certificates not in Ready state"
|
||||
print_info "Check certificate status with: kubectl get certificates --all-namespaces"
|
||||
print_info "Check cert-manager logs with: kubectl logs -n cert-manager deployment/cert-manager"
|
||||
else
|
||||
print_success "All certificates are in Ready state"
|
||||
fi
|
||||
|
||||
print_success "cert-manager setup complete!"
|
||||
echo ""
|
||||
print_info "To verify the installation:"
|
||||
print_info " kubectl get pods -n cert-manager"
|
||||
print_info " kubectl get clusterissuers"
|
||||
print_info " kubectl get certificates -n cert-manager"
|
||||
|
||||
@@ -13,7 +13,6 @@ spec:
|
||||
# DNS-01 solver for wildcard certificates
|
||||
- dns01:
|
||||
cloudflare:
|
||||
email: {{ .operator.email }}
|
||||
apiTokenSecretRef:
|
||||
name: cloudflare-api-token
|
||||
key: api-token
|
||||
|
||||
@@ -13,7 +13,6 @@ spec:
|
||||
# DNS-01 solver for wildcard certificates
|
||||
- dns01:
|
||||
cloudflare:
|
||||
email: {{ .operator.email }}
|
||||
apiTokenSecretRef:
|
||||
name: cloudflare-api-token
|
||||
key: api-token
|
||||
|
||||
23
setup/home-scaffold/.envrc
Normal file
@@ -0,0 +1,23 @@
|
||||
# Set the WC_HOME environment variable to this script's directory.
|
||||
# This variable is used consistently across the Wild Config scripts.
|
||||
export WC_HOME="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
|
||||
|
||||
# Add bin to path first so wild-config is available
|
||||
# export PATH="$WC_HOME/bin:$PATH"
|
||||
|
||||
export KUBECONFIG=$WC_HOME/.kubeconfig
|
||||
export TALOSCONFIG=$WC_HOME/setup/cluster-nodes/generated/talosconfig
|
||||
|
||||
# Backup configuration.
|
||||
if wild-config cloud.backup.root --check; then
|
||||
export RESTIC_REPOSITORY="$(wild-config cloud.backup.root)"
|
||||
else
|
||||
echo "WARNING: Could not get cloud backup root."
|
||||
fi
|
||||
|
||||
if wild-secret cloud.backupPassword --check; then
|
||||
export RESTIC_PASSWORD="$(wild-secret cloud.backupPassword)"
|
||||
else
|
||||
echo "WARNING: Could not get cloud backup secret."
|
||||
fi
|
||||
|
||||
2
setup/home-scaffold/.gitignore
vendored
@@ -1,6 +1,6 @@
|
||||
.wildcloud
|
||||
secrets.yaml
|
||||
.bots/*/sessions
|
||||
backup/
|
||||
.working
|
||||
setup/cluster-nodes/generated/talosconfig
|
||||
.kubeconfig
|
||||
|
||||
@@ -16,8 +16,7 @@ wild-setup

```bash
wild-apps-list
wild-app-fetch <app>
wild-app-config <app>
wild-app-add <app>
wild-app-deploy <app>
# Optional: Check in app templates.
```
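The optional "check in app templates" step is plain git; a sketch, assuming the compiled manifests land under apps/<app> in the Wild Cloud home:

```bash
git add apps/<app>
git commit -m "Add compiled manifests for <app>"
```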
@@ -20,16 +20,10 @@ The Wild Cloud app workflow consists of three steps:

### Commands

To fetch an app template to cache:
To configure an app (reads directly from repository):

```bash
wild-app-fetch <app>
```

To apply your configuration to a cached app (automatically fetches if not cached):

```bash
wild-app-config <app>
wild-app-add <app>
```

To deploy a configured app to Kubernetes:
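A sketch of that deploy step with a basic verification; the assumption that each app runs in a namespace matching its name is mine, not stated in this diff:

```bash
wild-app-deploy <app>
kubectl get pods -n <app>       # assumes the app deploys into a namespace named after it
kubectl get ingress -n <app>
```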
@@ -1,92 +0,0 @@
#!/bin/bash

# Set the WC_HOME environment variable to this script's directory.
# This variable is used consistently across the Wild Config scripts.
export WC_HOME="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"

# Add bin to path first so wild-config is available
export PATH="$WC_HOME/bin:$PATH"

# Install kubectl
if ! command -v kubectl &> /dev/null; then
    echo "Installing kubectl"
    curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
    curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl.sha256"
    echo "$(cat kubectl.sha256)  kubectl" | sha256sum --check
    sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
    rm kubectl kubectl.sha256
    echo "kubectl installed successfully."
fi

# Install talosctl
if ! command -v talosctl &> /dev/null; then
    echo "Installing talosctl"
    curl -sL https://talos.dev/install | sh
    if [ $? -ne 0 ]; then
        echo "Error installing talosctl. Please check the installation script."
        exit 1
    fi
    echo "talosctl installed successfully."
fi

# Check if gomplate is installed
if ! command -v gomplate &> /dev/null; then
    echo "Installing gomplate"
    curl -sSL https://github.com/hairyhenderson/gomplate/releases/latest/download/gomplate_linux-amd64 -o $HOME/.local/bin/gomplate
    chmod +x $HOME/.local/bin/gomplate
    echo "gomplate installed successfully."
fi

# Install kustomize
if ! command -v kustomize &> /dev/null; then
    echo "Installing kustomize"
    curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash
    mv kustomize $HOME/.local/bin/
    echo "kustomize installed successfully."
fi

## Install yq
if ! command -v yq &> /dev/null; then
    echo "Installing yq"
    VERSION=v4.45.4
    BINARY=yq_linux_amd64
    wget https://github.com/mikefarah/yq/releases/download/${VERSION}/${BINARY}.tar.gz -O - | tar xz
    mv ${BINARY} $HOME/.local/bin/yq
    chmod +x $HOME/.local/bin/yq
    rm yq.1
    echo "yq installed successfully."
fi

KUBECONFIG=~/.kube/config
export KUBECONFIG

# Use cluster name as both talos and kubectl context name
CLUSTER_NAME=$(wild-config cluster.name)
if [ -z "${CLUSTER_NAME}" ] || [ "${CLUSTER_NAME}" = "null" ]; then
    echo "Error: cluster.name not set in config.yaml"
else
    KUBE_CONTEXT="admin@${CLUSTER_NAME}"
    CURRENT_KUBE_CONTEXT=$(kubectl config current-context)
    if [ "${CURRENT_KUBE_CONTEXT}" != "${KUBE_CONTEXT}" ]; then
        if kubectl config get-contexts | grep -q "${KUBE_CONTEXT}"; then
            echo "Switching to kubernetes context ${KUBE_CONTEXT}"
        else
            echo "WARNING: Context ${KUBE_CONTEXT} does not exist."
            # kubectl config set-context "${KUBE_CONTEXT}" --cluster="${CLUSTER_NAME}" --user=admin
        fi
    fi
fi

# Backup configuration.
if `wild-config cloud.backup.root --check`; then
    export RESTIC_REPOSITORY="$(wild-config cloud.backup.root)"
else
    echo "WARNING: Could not get cloud backup root."
fi

if `wild-secret cloud.backupPassword --check`; then
    export RESTIC_PASSWORD="$(wild-secret cloud.backupPassword)"
else
    echo "WARNING: Could not get cloud backup secret."
fi
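After running an installer like the script removed above (or whatever replaces it), a quick sanity check of the tools it manages might look like this:

```bash
kubectl version --client
talosctl version --client
gomplate --version
kustomize version
yq --version
```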
@@ -6,7 +6,6 @@ This directory is a test Wild Cloud home for debugging scripts and commands.

```bash
cd test/test-cloud
wild-app-fetch <app-name>
wild-app-add <app-name>
wild-app-deploy <app-name>
# etc.
```