#!/bin/bash
set -euo pipefail

# ShieldAI ECS Rollback Script
#
# Usage: ./rollback.sh [environment] [service] [--verify]
#
# Environments: staging (default), production
# Services:     api, darkwatch, spamshield, voiceprint, all (default)
#
# Examples:
#   ./rollback.sh staging api              # Rollback single service
#   ./rollback.sh production all           # Rollback all services
#   ./rollback.sh production all --verify  # Rollback with post-verification
#
# Exit status: 0 when every rollback (and, when requested, every health
# check) succeeded; 1 otherwise.

# ─── Configuration ───────────────────────────────────────────────

ENVIRONMENT="${1:-staging}"
SERVICE="${2:-all}"
VERIFY="${3:-false}"

CLUSTER="shieldai-${ENVIRONMENT}"
SERVICES_LIST="api darkwatch spamshield voiceprint"
STABILIZE_TIMEOUT_SECONDS=300
EXIT_CODE=0

TIMESTAMP=$(date -u '+%Y-%m-%d %H:%M:%S UTC')
LOG_FILE="/tmp/shieldai-rollback-${ENVIRONMENT}-${TIMESTAMP//[: ]/_}.log"

# ─── Helpers ─────────────────────────────────────────────────────

# Make sure LOG_FILE can be appended to. If it cannot, fall back to
# /dev/null so that a logging problem never aborts a rollback half-way
# (with pipefail, a failing `tee` would otherwise terminate the script).
ensure_log_file() {
  if ! { true >>"$LOG_FILE"; } 2>/dev/null; then
    printf 'WARN: cannot write to %s; file logging disabled\n' "$LOG_FILE" >&2
    LOG_FILE="/dev/null"
  fi
}

# Print a timestamped message to stdout and append it to LOG_FILE.
# Arguments: $1 - level tag (INFO/WARN/ERROR), remaining args - message
log() {
  local level="$1"
  shift
  local msg="$*"
  printf '[%s] [%s] %s\n' "$(date -u '+%H:%M:%S')" "$level" "$msg" \
    | tee -a "$LOG_FILE"
}

log_info() { log "INFO" "$@"; }
log_warn() { log "WARN" "$@"; }
log_error() { log "ERROR" "$@"; }

# ─── Validation ──────────────────────────────────────────────────

# Exit 1 unless ENVIRONMENT is one of the supported environments.
validate_environment() {
  if [[ "$ENVIRONMENT" != "staging" && "$ENVIRONMENT" != "production" ]]; then
    log_error "Invalid environment: $ENVIRONMENT (expected: staging, production)"
    exit 1
  fi
}

# Exit 1 unless SERVICE is "all" or exactly one entry of SERVICES_LIST.
# The comparison is an exact string match so that regex metacharacters or
# multi-word values cannot slip through.
validate_service() {
  if [[ "$SERVICE" == "all" ]]; then
    return 0
  fi

  local candidate
  for candidate in $SERVICES_LIST; do
    if [[ "$candidate" == "$SERVICE" ]]; then
      return 0
    fi
  done

  log_error "Invalid service: $SERVICE (expected: api, darkwatch, spamshield, voiceprint, all)"
  exit 1
}

# Exit 1 if any required CLI tool is missing; default AWS_DEFAULT_REGION
# to us-east-1 when the caller did not set it.
check_prerequisites() {
  local missing=()
  local cmd
  for cmd in aws jq curl; do
    if ! command -v "$cmd" &>/dev/null; then
      missing+=("$cmd")
    fi
  done

  if [[ ${#missing[@]} -gt 0 ]]; then
    log_error "Missing prerequisites: ${missing[*]}"
    exit 1
  fi

  if [[ -z "${AWS_DEFAULT_REGION:-}" ]]; then
    export AWS_DEFAULT_REGION="us-east-1"
  fi

  log_info "Prerequisites OK (region: $AWS_DEFAULT_REGION)"
}

# ─── Rollback Logic ──────────────────────────────────────────────

# Print the space-separated list of services selected by SERVICE.
get_target_services() {
  if [[ "$SERVICE" == "all" ]]; then
    echo "$SERVICES_LIST"
  else
    echo "$SERVICE"
  fi
}

# Derive the "family:revision" identifier that precedes the given task
# definition.
# Arguments: $1 - task definition ARN or "family:revision"
# Outputs:   "family:<revision-1>" on stdout
# Returns:   1 when the input has no numeric revision or is already at
#            revision 1 (nothing earlier to go back to)
previous_task_definition() {
  local current="$1"
  local family_rev="${current##*/}" # strip ARN prefix up to the last '/'
  local family="${family_rev%:*}"
  local revision="${family_rev##*:}"

  if [[ "$family_rev" != *:* || -z "$family" || ! "$revision" =~ ^[0-9]+$ ]]; then
    return 1
  fi
  # 10# forces base 10 so a value such as "08" is not parsed as octal.
  if ((10#$revision <= 1)); then
    return 1
  fi

  printf '%s:%d\n' "$family" "$((10#$revision - 1))"
}

# Block until the service reports steady state.
# Arguments: $1 - ECS service name
# Returns:   0 when stable, non-zero on waiter failure or timeout
wait_for_stable() {
  local service_name="$1"
  local -a waiter=(
    aws ecs wait services-stable
    --cluster "$CLUSTER"
    --services "$service_name"
  )

  # The ECS waiter takes no timeout option of its own, so bound it
  # externally when coreutils `timeout` is available; otherwise the
  # waiter's built-in polling limit applies.
  if command -v timeout &>/dev/null; then
    timeout "$STABILIZE_TIMEOUT_SECONDS" "${waiter[@]}"
  else
    "${waiter[@]}"
  fi
}

# Roll one service back to its previous task definition revision and wait
# for it to stabilize.
# Globals:   CLUSTER, LOG_FILE (read); EXIT_CODE (set to 1 on failure)
# Arguments: $1 - short service name (e.g. "api")
# Returns:   0 on success, 1 on any failure
rollback_service() {
  local svc="$1"
  local service_name="${CLUSTER}-${svc}"

  log_info "Rolling back $service_name..."

  # Check current deployment status
  local current_task_def
  current_task_def=$(aws ecs describe-services \
    --cluster "$CLUSTER" \
    --services "$service_name" \
    --query 'services[0].taskDefinition' \
    --output text 2>/dev/null || echo "UNKNOWN")

  log_info "Current task definition: $current_task_def"

  # `aws ecs update-service` has no rollback switch, so the rollback is
  # expressed as a redeploy of the preceding task definition revision.
  # NOTE(review): this assumes revision N-1 is the last known-good one and
  # is still ACTIVE — confirm this matches the release process.
  local target_task_def
  if ! target_task_def=$(previous_task_definition "$current_task_def"); then
    log_error "Cannot determine previous task definition for $service_name (current: $current_task_def)"
    EXIT_CODE=1
    return 1
  fi
  log_info "Rollback target task definition: $target_task_def"

  # Execute rollback (CLI JSON response and errors go to the log file)
  if aws ecs update-service \
    --cluster "$CLUSTER" \
    --service "$service_name" \
    --task-definition "$target_task_def" \
    --no-cli-auto-prompt >>"$LOG_FILE" 2>&1; then
    log_info "Rollback initiated for $service_name"
  else
    log_error "Rollback failed to initiate for $service_name"
    EXIT_CODE=1
    return 1
  fi

  # Wait for stabilization (max 5 minutes)
  log_info "Waiting for $service_name to stabilize (timeout: ${STABILIZE_TIMEOUT_SECONDS}s)..."

  if wait_for_stable "$service_name" 2>>"$LOG_FILE"; then
    log_info "$service_name stabilized successfully"
  else
    log_warn "$service_name stabilization timed out or failed"
    EXIT_CODE=1
    return 1
  fi

  # Fetch task definition and task counts after rollback in a single call;
  # text output of a list is one whitespace-separated line.
  local new_task_def="UNKNOWN"
  local running_count="0"
  local desired_count="0"
  local summary
  if summary=$(aws ecs describe-services \
    --cluster "$CLUSTER" \
    --services "$service_name" \
    --query 'services[0].[taskDefinition,runningCount,desiredCount]' \
    --output text 2>/dev/null); then
    read -r new_task_def running_count desired_count <<<"$summary" || true
    new_task_def="${new_task_def:-UNKNOWN}"
    running_count="${running_count:-0}"
    desired_count="${desired_count:-0}"
  fi

  log_info "Rollback complete: $service_name -> $new_task_def ($running_count/$desired_count running)"
  return 0
}

# ─── Health Verification ─────────────────────────────────────────

# Probe the ALB health endpoint on behalf of one service.
# Arguments: $1 - short service name
# Returns:   0 when the endpoint answers HTTP 200, 1 otherwise
verify_health() {
  local svc="$1"

  # NOTE(review): the per-service port is computed but never used — every
  # service currently probes the same "$alb_dns/health" URL. Confirm
  # whether the port (or a per-service path) should be part of the URL.
  # shellcheck disable=SC2034
  local port
  case "$svc" in
    api) port=3000 ;;
    darkwatch) port=3001 ;;
    spamshield) port=3002 ;;
    voiceprint) port=3003 ;;
    *) port=3000 ;;
  esac

  local alb_dns="https://${CLUSTER}-alb.${AWS_DEFAULT_REGION}.elb.amazonaws.com"

  log_info "Verifying health for $svc (ALB: $alb_dns)..."

  # On failure curl has already printed "000" via -w, so the fallback is
  # assigned outside the substitution instead of being appended to it.
  local http_code
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 10 \
    --max-time 30 \
    "$alb_dns/health" 2>/dev/null) || http_code="000"

  if [[ "$http_code" == "200" ]]; then
    log_info "Health check PASSED: $svc (HTTP $http_code)"
    return 0
  else
    log_warn "Health check FAILED: $svc (HTTP $http_code)"
    return 1
  fi
}

# Run verify_health for every target service and record the outcome.
# Globals: EXIT_CODE (set to 1 when at least one check fails)
verify_all_services() {
  log_info "=== Post-Rollback Health Verification ==="

  local passed=0
  local failed=0
  local svc
  local -a targets
  read -r -a targets <<<"$(get_target_services)"

  for svc in "${targets[@]}"; do
    # Plain assignment instead of ((x++)): the latter returns status 1
    # when the old value is 0, which would abort the script under set -e.
    if verify_health "$svc"; then
      passed=$((passed + 1))
    else
      failed=$((failed + 1))
    fi
  done

  log_info "Verification complete: $passed passed, $failed failed"

  if [[ $failed -gt 0 ]]; then
    log_warn "Some services failed health verification"
    EXIT_CODE=1
  fi
}

# ─── Main Execution ──────────────────────────────────────────────

# Entry point: validate inputs, roll back each target service, optionally
# verify health, then exit with the aggregated status.
main() {
  ensure_log_file

  log_info "=== ShieldAI Rollback ==="
  log_info "Environment: $ENVIRONMENT"
  log_info "Service(s): $SERVICE"
  log_info "Cluster: $CLUSTER"
  log_info "Verify: $VERIFY"
  log_info "Timestamp: $TIMESTAMP"
  log_info "Log file: $LOG_FILE"
  log_info "=========================="

  # Validate inputs
  validate_environment
  validate_service
  check_prerequisites

  # Execute rollback for each target service
  local rolled_back=0
  local failed=0
  local svc
  local -a targets
  read -r -a targets <<<"$(get_target_services)"

  for svc in "${targets[@]}"; do
    if rollback_service "$svc"; then
      rolled_back=$((rolled_back + 1))
    else
      failed=$((failed + 1))
    fi
  done

  log_info "=== Rollback Summary ==="
  log_info "Rolled back: $rolled_back services"
  log_info "Failed: $failed services"

  # Post-rollback verification
  if [[ "$VERIFY" == "--verify" ]] || [[ "$VERIFY" == "true" ]]; then
    verify_all_services
  fi

  if [[ $failed -gt 0 ]]; then
    log_error "Rollback completed with $failed failure(s)"
    log_info "Full log: $LOG_FILE"
    exit "$EXIT_CODE"
  fi

  # Rollbacks succeeded, but a failed health verification must still be
  # reflected in the exit status.
  if [[ $EXIT_CODE -ne 0 ]]; then
    log_error "Rollback succeeded but health verification failed"
    log_info "Full log: $LOG_FILE"
    exit "$EXIT_CODE"
  fi

  log_info "Rollback completed successfully"
  log_info "Full log: $LOG_FILE"
  exit 0
}

main "$@"