- infra/ROLLBACK.md: comprehensive rollback runbook with ECS, Docker Compose, database migration, blue-green, and emergency rollback procedures
- infra/scripts/rollback.sh: enhanced ECS rollback with validation, logging, health verification, and per-service rollback support
- infra/scripts/rollback-compose.sh: Docker Compose rollback for local/staging
- infra/scripts/rollback-migration.sh: Drizzle migration rollback with AWS Secrets Manager integration
- infra/scripts/test-rollback.sh: automated test suite (51 tests)
- Updated infra/README.md to reference ROLLBACK.md

Co-Authored-By: Paperclip <noreply@paperclip.ing>
256 lines
7.0 KiB
Bash
Executable File
256 lines
7.0 KiB
Bash
Executable File
#!/bin/bash
set -euo pipefail

# ShieldAI ECS Rollback Script
# Usage: ./rollback.sh <environment> <service|all> [--verify]
#
# Environments: staging, production
# Services:     api, darkwatch, spamshield, voiceprint, all
#
# Examples:
#   ./rollback.sh staging api              # Rollback single service
#   ./rollback.sh production all           # Rollback all services
#   ./rollback.sh production all --verify  # Rollback with post-verification

# ─── Configuration ───────────────────────────────────────────────
# Positional arguments; each falls back to a default when omitted.
ENVIRONMENT="${1:-staging}"
SERVICE="${2:-all}"
VERIFY="${3:-false}"

# ECS cluster name; per-service names are built from it as "<cluster>-<service>".
CLUSTER="shieldai-${ENVIRONMENT}"
# Space-separated list of the services this script knows how to roll back.
SERVICES_LIST="api darkwatch spamshield voiceprint"
# Overall exit status; set to 1 by any step that fails.
EXIT_CODE=0
TIMESTAMP=$(date -u '+%Y-%m-%d %H:%M:%S UTC')
# The environment name has not been validated yet at this point, so anything
# outside [A-Za-z0-9._-] is replaced before it becomes part of the file name;
# otherwise a value such as "foo/bar" would yield an unwritable log path and
# the "Invalid environment" error could never be logged. Spaces and colons in
# the timestamp are likewise replaced with underscores.
LOG_FILE="/tmp/shieldai-rollback-${ENVIRONMENT//[^A-Za-z0-9._-]/_}-${TIMESTAMP//[: ]/_}.log"

# ─── Helpers ─────────────────────────────────────────────────────
#######################################
# Print one timestamped log line and append it to the log file.
# Globals:   LOG_FILE (read) - file the line is appended to
# Arguments: $1   - level label (e.g. INFO, WARN, ERROR)
#            $2.. - message; the remaining arguments are joined via "$*"
# Outputs:   "[HH:MM:SS] [LEVEL] message" (UTC time) on stdout and in LOG_FILE
# Returns:   0, even when the log file cannot be written
#######################################
log() {
  local level="$1"
  shift
  local msg="$*"
  # Logging is deliberately best-effort: under `set -e` + `pipefail` a failing
  # tee (e.g. unwritable log file) would otherwise abort the whole rollback.
  # tee still copies the line to stdout and reports its own error on stderr.
  printf '[%s] [%s] %s\n' "$(date -u '+%H:%M:%S')" "$level" "$msg" \
    | tee -a "$LOG_FILE" || true
}
# Level-specific wrappers around log(): each forwards all of its arguments
# as the message, with the level label fixed to INFO, WARN or ERROR.
log_info() {
  log "INFO" "$@"
}

log_warn() {
  log "WARN" "$@"
}

log_error() {
  log "ERROR" "$@"
}

# ─── Validation ──────────────────────────────────────────────────
#######################################
# Abort unless the requested environment is one of the supported ones.
# Globals:   ENVIRONMENT (read)
# Outputs:   an error line via log_error when the value is rejected
# Returns:   0 for "staging" or "production"; otherwise exits the script
#            with status 1
#######################################
validate_environment() {
  case "$ENVIRONMENT" in
    staging | production) ;;
    *)
      log_error "Invalid environment: $ENVIRONMENT (expected: staging, production)"
      exit 1
      ;;
  esac
}
#######################################
# Abort unless the requested service is "all" or a known service name.
# Globals:   SERVICE (read), SERVICES_LIST (read, space-separated names)
# Outputs:   an error line via log_error when the value is rejected
# Returns:   0 when the service is accepted; otherwise exits the script
#            with status 1
#######################################
validate_service() {
  local known

  if [[ "$SERVICE" == "all" ]]; then
    return 0
  fi

  # Compare literally against each known name. A grep-based check would treat
  # the argument as a regular expression (so ".*" or "a.i" would be accepted)
  # and would also accept multi-word values such as "api darkwatch".
  # Word splitting of SERVICES_LIST is intentional here.
  for known in $SERVICES_LIST; do
    if [[ "$SERVICE" == "$known" ]]; then
      return 0
    fi
  done

  log_error "Invalid service: $SERVICE (expected: api, darkwatch, spamshield, voiceprint, all)"
  exit 1
}
#######################################
# Verify required command-line tools exist and make sure a region is set.
# Globals:   AWS_DEFAULT_REGION (read; exported with a default when empty)
#            AWS_REGION (read, optional)
# Outputs:   log_error listing every missing tool, or log_info on success
# Returns:   0 on success; exits the script with status 1 if any of
#            aws, jq or curl cannot be found
#######################################
check_prerequisites() {
  local cmd
  local missing=()

  for cmd in aws jq curl; do
    if ! command -v "$cmd" &>/dev/null; then
      missing+=("$cmd")
    fi
  done

  if [[ ${#missing[@]} -gt 0 ]]; then
    log_error "Missing prerequisites: ${missing[*]}"
    exit 1
  fi

  # Keep an explicitly set AWS_DEFAULT_REGION. When it is empty, fall back to
  # AWS_REGION before the hard-coded default so the region logged here (and
  # used elsewhere in this script) matches the one the AWS CLI will pick up.
  # NOTE(review): AWS CLI v2 is documented to honour AWS_REGION - confirm for
  # the CLI version installed on the deploy hosts.
  export AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-${AWS_REGION:-us-east-1}}"

  log_info "Prerequisites OK (region: $AWS_DEFAULT_REGION)"
}

# ─── Rollback Logic ──────────────────────────────────────────────
#######################################
# Print the service name(s) selected for this run.
# Globals:   SERVICE (read), SERVICES_LIST (read)
# Outputs:   the full space-separated SERVICES_LIST when SERVICE is "all",
#            otherwise SERVICE itself, on one line of stdout
#######################################
get_target_services() {
  case "$SERVICE" in
    all) printf '%s\n' "$SERVICES_LIST" ;;
    *) printf '%s\n' "$SERVICE" ;;
  esac
}
#######################################
# Roll one ECS service back and wait for it to report stable.
# Globals:   CLUSTER (read), LOG_FILE (appended to),
#            EXIT_CODE (set to 1 on any failure)
# Arguments: $1 - short service name (e.g. "api"); the ECS service name is
#                 "${CLUSTER}-$1"
# Outputs:   progress through log_info / log_warn / log_error; stdout and
#            stderr of the update call and stderr of the wait call go to
#            LOG_FILE
# Returns:   0 when the rollback was initiated and the service stabilized,
#            1 otherwise
#######################################
rollback_service() {
  local svc="$1"
  local service_name="${CLUSTER}-${svc}"

  log_info "Rolling back $service_name..."

  # Record the task definition in use before the rollback. This lookup is
  # informational only, so a failure is reported as UNKNOWN, not as an error.
  local current_task_def
  current_task_def=$(aws ecs describe-services \
    --cluster "$CLUSTER" \
    --services "$service_name" \
    --query 'services[0].taskDefinition' \
    --output text 2>/dev/null) || current_task_def="UNKNOWN"

  log_info "Current task definition: $current_task_def"

  # Execute rollback. The CLI's response JSON is sent to the log file together
  # with its stderr, so it is kept with the rest of the run's output instead of
  # being dumped to (or paged on) the operator's terminal.
  # NOTE(review): I could not find `--rollback` among the documented options of
  # `aws ecs update-service` - confirm against the installed AWS CLI. If it is
  # unsupported, this call fails on argument parsing and every rollback is
  # reported as "failed to initiate".
  if aws ecs update-service \
    --cluster "$CLUSTER" \
    --service "$service_name" \
    --rollback \
    --no-cli-auto-prompt >>"$LOG_FILE" 2>&1; then
    log_info "Rollback initiated for $service_name"
  else
    log_error "Rollback failed to initiate for $service_name"
    EXIT_CODE=1
    return 1
  fi

  # Wait for stabilization (max 5 minutes)
  # NOTE(review): likewise, `--timeout` does not appear among the documented
  # options of `aws ecs wait services-stable` - confirm, since an unknown
  # option would make this step always report a stabilization failure.
  log_info "Waiting for $service_name to stabilize (timeout: 300s)..."
  if aws ecs wait services-stable \
    --cluster "$CLUSTER" \
    --services "$service_name" \
    --timeout 300 2>>"$LOG_FILE"; then
    log_info "$service_name stabilized successfully"
  else
    log_warn "$service_name stabilization timed out or failed"
    EXIT_CODE=1
    return 1
  fi

  # Fetch task definition and task counts in ONE call so the three values in
  # the summary describe the same moment (and two API round-trips are saved).
  # With --output text the three fields arrive whitespace-separated on a line.
  local summary new_task_def running_count desired_count
  summary=$(aws ecs describe-services \
    --cluster "$CLUSTER" \
    --services "$service_name" \
    --query 'services[0].[taskDefinition,runningCount,desiredCount]' \
    --output text 2>/dev/null) || summary=""
  read -r new_task_def running_count desired_count <<<"$summary"

  # Missing fields fall back to UNKNOWN / 0 when the lookup failed.
  log_info "Rollback complete: $service_name -> ${new_task_def:-UNKNOWN} (${running_count:-0}/${desired_count:-0} running)"

  return 0
}

# ─── Health Verification ─────────────────────────────────────────
#######################################
# Probe the environment's load balancer health endpoint for one service.
# Globals:   CLUSTER (read), AWS_DEFAULT_REGION (read)
# Arguments: $1 - service name; used only in the log messages
# Outputs:   progress and result through log_info / log_warn
# Returns:   0 when GET <alb>/health answers HTTP 200, 1 otherwise
#
# NOTE(review): every service is probed through the same "/health" URL, so
# the result is not service-specific. The hostname is derived purely from the
# cluster name and region - confirm it resolves to the real load balancer.
#######################################
verify_health() {
  local svc="$1"
  local alb_dns="https://${CLUSTER}-alb.${AWS_DEFAULT_REGION}.elb.amazonaws.com"

  log_info "Verifying health for $svc (ALB: $alb_dns)..."

  # -w prints the status code even when the transfer fails (as "000"), so the
  # fallback must REPLACE the captured value; echoing a second "000" inside
  # the substitution would produce "000000" in the log.
  local http_code
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 10 \
    --max-time 30 \
    "$alb_dns/health" 2>/dev/null) || http_code="000"

  if [[ "$http_code" == "200" ]]; then
    log_info "Health check PASSED: $svc (HTTP $http_code)"
    return 0
  fi

  log_warn "Health check FAILED: $svc (HTTP $http_code)"
  return 1
}
#######################################
# Run verify_health for every targeted service and tally the results.
# Globals:   EXIT_CODE (set to 1 when at least one check fails)
# Outputs:   a header, a pass/fail summary and, on failure, a warning
#            through log_info / log_warn
# Returns:   0 (failures are reported through EXIT_CODE, not the status)
#######################################
verify_all_services() {
  log_info "=== Post-Rollback Health Verification ==="

  local svc
  local passed=0
  local failed=0

  # Word splitting of the service list is intentional.
  for svc in $(get_target_services); do
    # Counters use $(( )) assignments: `((n++))` returns status 1 when n is 0,
    # which would abort the script under `set -e` on the first result.
    if verify_health "$svc"; then
      passed=$((passed + 1))
    else
      failed=$((failed + 1))
    fi
  done

  log_info "Verification complete: $passed passed, $failed failed"

  if [[ $failed -gt 0 ]]; then
    log_warn "Some services failed health verification"
    EXIT_CODE=1
  fi
}

# ─── Main Execution ──────────────────────────────────────────────
#######################################
# Script entry point: validate inputs, roll back each targeted service,
# optionally verify health, and exit with the overall status.
# Globals:   ENVIRONMENT, SERVICE, CLUSTER, VERIFY, TIMESTAMP, LOG_FILE (read)
#            EXIT_CODE (read; written by the rollback / verification steps)
# Arguments: none are read here; the run is configured through the globals
# Outputs:   run banner, per-service progress and a summary via the log
#            helpers
# Returns:   never returns - exits 0 on full success, non-zero when any
#            rollback failed or post-rollback verification failed
#######################################
main() {
  log_info "=== ShieldAI Rollback ==="
  log_info "Environment: $ENVIRONMENT"
  log_info "Service(s): $SERVICE"
  log_info "Cluster: $CLUSTER"
  log_info "Verify: $VERIFY"
  log_info "Timestamp: $TIMESTAMP"
  log_info "Log file: $LOG_FILE"
  log_info "=========================="

  # Validate inputs (each of these exits the script on failure)
  validate_environment
  validate_service
  check_prerequisites

  # Execute rollback for each target service
  local svc
  local rolled_back=0
  local failed=0

  # Word splitting of the service list is intentional.
  for svc in $(get_target_services); do
    # Counters use $(( )) assignments: `((n++))` returns status 1 when n is 0,
    # which would abort the script under `set -e` after the first service.
    if rollback_service "$svc"; then
      rolled_back=$((rolled_back + 1))
    else
      failed=$((failed + 1))
    fi
  done

  log_info "=== Rollback Summary ==="
  log_info "Rolled back: $rolled_back services"
  log_info "Failed: $failed services"

  # Post-rollback verification
  if [[ "$VERIFY" == "--verify" || "$VERIFY" == "true" ]]; then
    verify_all_services
  fi

  if [[ $failed -gt 0 ]]; then
    log_error "Rollback completed with $failed failure(s)"
    log_info "Full log: $LOG_FILE"
    exit 1
  fi

  # All rollbacks succeeded, but a later step (health verification) may still
  # have flagged a failure through EXIT_CODE; do not report success then.
  if [[ $EXIT_CODE -ne 0 ]]; then
    log_error "Rollback completed but post-rollback verification failed"
    log_info "Full log: $LOG_FILE"
    exit "$EXIT_CODE"
  fi

  log_info "Rollback completed successfully"
  log_info "Full log: $LOG_FILE"
  exit 0
}
# Run the script. The command-line arguments are forwarded to main, although
# main itself works from the globals assigned at the top of the file.
main "$@"