Files
ShieldAI/infra/scripts/rollback.sh
Michael Freno bce4787802 Add rollback procedure documentation and testing scripts (FRE-4808)
- infra/ROLLBACK.md: comprehensive rollback runbook with ECS, Docker Compose,
  database migration, blue-green, and emergency rollback procedures
- infra/scripts/rollback.sh: enhanced ECS rollback with validation, logging,
  health verification, and per-service rollback support
- infra/scripts/rollback-compose.sh: Docker Compose rollback for local/staging
- infra/scripts/rollback-migration.sh: Drizzle migration rollback with
  AWS Secrets Manager integration
- infra/scripts/test-rollback.sh: automated test suite (51 tests)
- Updated infra/README.md to reference ROLLBACK.md

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-05-09 06:27:31 -04:00

256 lines
7.0 KiB
Bash
Executable File

#!/bin/bash
set -euo pipefail
# ShieldAI ECS Rollback Script
# Usage: ./rollback.sh <environment> <service|all> [--verify]
#
# Environments: staging, production
# Services: api, darkwatch, spamshield, voiceprint, all
#
# Examples:
# ./rollback.sh staging api # Rollback single service
# ./rollback.sh production all # Rollback all services
# ./rollback.sh production all --verify # Rollback with post-verification
# ─── Configuration ───────────────────────────────────────────────
ENVIRONMENT="${1:-staging}"
SERVICE="${2:-all}"
VERIFY="${3:-false}"
CLUSTER="shieldai-${ENVIRONMENT}"
SERVICES_LIST="api darkwatch spamshield voiceprint"
EXIT_CODE=0
TIMESTAMP=$(date -u '+%Y-%m-%d %H:%M:%S UTC')
LOG_FILE="/tmp/shieldai-rollback-${ENVIRONMENT}-${TIMESTAMP//[: ]/_}.log"
# ─── Helpers ─────────────────────────────────────────────────────
log() {
local level="$1"
shift
local msg="$*"
echo "[$(date -u '+%H:%M:%S')] [$level] $msg" | tee -a "$LOG_FILE"
}
log_info() { log "INFO" "$@"; }
log_warn() { log "WARN" "$@"; }
log_error() { log "ERROR" "$@"; }
# ─── Validation ──────────────────────────────────────────────────
validate_environment() {
if [[ "$ENVIRONMENT" != "staging" && "$ENVIRONMENT" != "production" ]]; then
log_error "Invalid environment: $ENVIRONMENT (expected: staging, production)"
exit 1
fi
}
validate_service() {
if [[ "$SERVICE" == "all" ]]; then
return 0
fi
if ! echo "$SERVICES_LIST" | grep -qw "$SERVICE"; then
log_error "Invalid service: $SERVICE (expected: api, darkwatch, spamshield, voiceprint, all)"
exit 1
fi
}
check_prerequisites() {
local missing=()
for cmd in aws jq curl; do
if ! command -v "$cmd" &>/dev/null; then
missing+=("$cmd")
fi
done
if [[ ${#missing[@]} -gt 0 ]]; then
log_error "Missing prerequisites: ${missing[*]}"
exit 1
fi
if [[ -z "${AWS_DEFAULT_REGION:-}" ]]; then
export AWS_DEFAULT_REGION="us-east-1"
fi
log_info "Prerequisites OK (region: $AWS_DEFAULT_REGION)"
}
# ─── Rollback Logic ──────────────────────────────────────────────
get_target_services() {
if [[ "$SERVICE" == "all" ]]; then
echo "$SERVICES_LIST"
else
echo "$SERVICE"
fi
}
rollback_service() {
local svc="$1"
local service_name="${CLUSTER}-${svc}"
log_info "Rolling back $service_name..."
# Check current deployment status
local current_task_def
current_task_def=$(aws ecs describe-services \
--cluster "$CLUSTER" \
--services "$service_name" \
--query 'services[0].taskDefinition' \
--output text 2>/dev/null || echo "UNKNOWN")
log_info "Current task definition: $current_task_def"
# Execute rollback
if aws ecs update-service \
--cluster "$CLUSTER" \
--service "$service_name" \
--rollback \
--no-cli-auto-prompt 2>>"$LOG_FILE"; then
log_info "Rollback initiated for $service_name"
else
log_error "Rollback failed to initiate for $service_name"
EXIT_CODE=1
return 1
fi
# Wait for stabilization (max 5 minutes)
log_info "Waiting for $service_name to stabilize (timeout: 300s)..."
if aws ecs wait services-stable \
--cluster "$CLUSTER" \
--services "$service_name" \
--timeout 300 2>>"$LOG_FILE"; then
log_info "$service_name stabilized successfully"
else
log_warn "$service_name stabilization timed out or failed"
EXIT_CODE=1
return 1
fi
# Get new task definition after rollback
local new_task_def
new_task_def=$(aws ecs describe-services \
--cluster "$CLUSTER" \
--services "$service_name" \
--query 'services[0].taskDefinition' \
--output text 2>/dev/null || echo "UNKNOWN")
local running_count
running_count=$(aws ecs describe-services \
--cluster "$CLUSTER" \
--services "$service_name" \
--query 'services[0].runningCount' \
--output text 2>/dev/null || echo "0")
local desired_count
desired_count=$(aws ecs describe-services \
--cluster "$CLUSTER" \
--services "$service_name" \
--query 'services[0].desiredCount' \
--output text 2>/dev/null || echo "0")
log_info "Rollback complete: $service_name -> $new_task_def ($running_count/$desired_count running)"
return 0
}
# ─── Health Verification ─────────────────────────────────────────
verify_health() {
local svc="$1"
local port
port=$(case "$svc" in
api) echo 3000 ;;
darkwatch) echo 3001 ;;
spamshield) echo 3002 ;;
voiceprint) echo 3003 ;;
*) echo 3000 ;;
esac)
local alb_dns="https://${CLUSTER}-alb.${AWS_DEFAULT_REGION}.elb.amazonaws.com"
log_info "Verifying health for $svc (ALB: $alb_dns)..."
local http_code
http_code=$(curl -s -o /dev/null -w "%{http_code}" \
--connect-timeout 10 \
--max-time 30 \
"$alb_dns/health" 2>/dev/null || echo "000")
if [[ "$http_code" == "200" ]]; then
log_info "Health check PASSED: $svc (HTTP $http_code)"
return 0
else
log_warn "Health check FAILED: $svc (HTTP $http_code)"
return 1
fi
}
verify_all_services() {
log_info "=== Post-Rollback Health Verification ==="
local passed=0
local failed=0
for svc in $(get_target_services); do
if verify_health "$svc"; then
((passed++))
else
((failed++))
fi
done
log_info "Verification complete: $passed passed, $failed failed"
if [[ $failed -gt 0 ]]; then
log_warn "Some services failed health verification"
EXIT_CODE=1
fi
}
# ─── Main Execution ──────────────────────────────────────────────
main() {
log_info "=== ShieldAI Rollback ==="
log_info "Environment: $ENVIRONMENT"
log_info "Service(s): $SERVICE"
log_info "Cluster: $CLUSTER"
log_info "Verify: $VERIFY"
log_info "Timestamp: $TIMESTAMP"
log_info "Log file: $LOG_FILE"
log_info "=========================="
# Validate inputs
validate_environment
validate_service
check_prerequisites
# Execute rollback for each target service
local rolled_back=0
local failed=0
for svc in $(get_target_services); do
if rollback_service "$svc"; then
((rolled_back++))
else
((failed++))
fi
done
log_info "=== Rollback Summary ==="
log_info "Rolled back: $rolled_back services"
log_info "Failed: $failed services"
# Post-rollback verification
if [[ "$VERIFY" == "--verify" ]] || [[ "$VERIFY" == "true" ]]; then
verify_all_services
fi
if [[ $failed -gt 0 ]]; then
log_error "Rollback completed with $failed failure(s)"
log_info "Full log: $LOG_FILE"
exit "$EXIT_CODE"
fi
log_info "Rollback completed successfully"
log_info "Full log: $LOG_FILE"
exit 0
}
main "$@"