Add rollback procedure documentation and testing scripts (FRE-4808)
- infra/ROLLBACK.md: comprehensive rollback runbook with ECS, Docker Compose, database migration, blue-green, and emergency rollback procedures - infra/scripts/rollback.sh: enhanced ECS rollback with validation, logging, health verification, and per-service rollback support - infra/scripts/rollback-compose.sh: Docker Compose rollback for local/staging - infra/scripts/rollback-migration.sh: Drizzle migration rollback with AWS Secrets Manager integration - infra/scripts/test-rollback.sh: automated test suite (51 tests) - Updated infra/README.md to reference ROLLBACK.md Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -1,32 +1,255 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
ENVIRONMENT=${1:-staging}
|
||||
SERVICE=${2:-all}
|
||||
# ShieldAI ECS Rollback Script
|
||||
# Usage: ./rollback.sh <environment> <service|all> [--verify]
|
||||
#
|
||||
# Environments: staging, production
|
||||
# Services: api, darkwatch, spamshield, voiceprint, all
|
||||
#
|
||||
# Examples:
|
||||
# ./rollback.sh staging api # Rollback single service
|
||||
# ./rollback.sh production all # Rollback all services
|
||||
# ./rollback.sh production all --verify # Rollback with post-verification
|
||||
|
||||
# ─── Configuration ───────────────────────────────────────────────
|
||||
ENVIRONMENT="${1:-staging}"
|
||||
SERVICE="${2:-all}"
|
||||
VERIFY="${3:-false}"
|
||||
|
||||
CLUSTER="shieldai-${ENVIRONMENT}"
|
||||
SERVICES_LIST="api darkwatch spamshield voiceprint"
|
||||
EXIT_CODE=0
|
||||
TIMESTAMP=$(date -u '+%Y-%m-%d %H:%M:%S UTC')
|
||||
LOG_FILE="/tmp/shieldai-rollback-${ENVIRONMENT}-${TIMESTAMP//[: ]/_}.log"
|
||||
|
||||
echo "Rolling back services in cluster: $CLUSTER"
|
||||
# ─── Helpers ─────────────────────────────────────────────────────
|
||||
log() {
|
||||
local level="$1"
|
||||
shift
|
||||
local msg="$*"
|
||||
echo "[$(date -u '+%H:%M:%S')] [$level] $msg" | tee -a "$LOG_FILE"
|
||||
}
|
||||
|
||||
SERVICES="api darkwatch spamshield voiceprint"
|
||||
if [ "$SERVICE" != "all" ]; then
|
||||
SERVICES="$SERVICE"
|
||||
fi
|
||||
log_info() { log "INFO" "$@"; }
|
||||
log_warn() { log "WARN" "$@"; }
|
||||
log_error() { log "ERROR" "$@"; }
|
||||
|
||||
for svc in $SERVICES; do
|
||||
echo "Rolling back $svc..."
|
||||
aws ecs update-service \
|
||||
# ─── Validation ──────────────────────────────────────────────────
|
||||
validate_environment() {
|
||||
if [[ "$ENVIRONMENT" != "staging" && "$ENVIRONMENT" != "production" ]]; then
|
||||
log_error "Invalid environment: $ENVIRONMENT (expected: staging, production)"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
validate_service() {
|
||||
if [[ "$SERVICE" == "all" ]]; then
|
||||
return 0
|
||||
fi
|
||||
if ! echo "$SERVICES_LIST" | grep -qw "$SERVICE"; then
|
||||
log_error "Invalid service: $SERVICE (expected: api, darkwatch, spamshield, voiceprint, all)"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
check_prerequisites() {
|
||||
local missing=()
|
||||
|
||||
for cmd in aws jq curl; do
|
||||
if ! command -v "$cmd" &>/dev/null; then
|
||||
missing+=("$cmd")
|
||||
fi
|
||||
done
|
||||
|
||||
if [[ ${#missing[@]} -gt 0 ]]; then
|
||||
log_error "Missing prerequisites: ${missing[*]}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ -z "${AWS_DEFAULT_REGION:-}" ]]; then
|
||||
export AWS_DEFAULT_REGION="us-east-1"
|
||||
fi
|
||||
|
||||
log_info "Prerequisites OK (region: $AWS_DEFAULT_REGION)"
|
||||
}
|
||||
|
||||
# ─── Rollback Logic ──────────────────────────────────────────────
|
||||
get_target_services() {
|
||||
if [[ "$SERVICE" == "all" ]]; then
|
||||
echo "$SERVICES_LIST"
|
||||
else
|
||||
echo "$SERVICE"
|
||||
fi
|
||||
}
|
||||
|
||||
rollback_service() {
|
||||
local svc="$1"
|
||||
local service_name="${CLUSTER}-${svc}"
|
||||
|
||||
log_info "Rolling back $service_name..."
|
||||
|
||||
# Check current deployment status
|
||||
local current_task_def
|
||||
current_task_def=$(aws ecs describe-services \
|
||||
--cluster "$CLUSTER" \
|
||||
--service "${CLUSTER}-${svc}" \
|
||||
--services "$service_name" \
|
||||
--query 'services[0].taskDefinition' \
|
||||
--output text 2>/dev/null || echo "UNKNOWN")
|
||||
|
||||
log_info "Current task definition: $current_task_def"
|
||||
|
||||
# Execute rollback
|
||||
if aws ecs update-service \
|
||||
--cluster "$CLUSTER" \
|
||||
--service "$service_name" \
|
||||
--rollback \
|
||||
--no-cli-auto-prompt
|
||||
--no-cli-auto-prompt 2>>"$LOG_FILE"; then
|
||||
log_info "Rollback initiated for $service_name"
|
||||
else
|
||||
log_error "Rollback failed to initiate for $service_name"
|
||||
EXIT_CODE=1
|
||||
return 1
|
||||
fi
|
||||
|
||||
echo "Waiting for $svc to stabilize..."
|
||||
aws ecs wait services-stable \
|
||||
# Wait for stabilization (max 5 minutes)
|
||||
log_info "Waiting for $service_name to stabilize (timeout: 300s)..."
|
||||
if aws ecs wait services-stable \
|
||||
--cluster "$CLUSTER" \
|
||||
--services "${CLUSTER}-${svc}"
|
||||
--services "$service_name" \
|
||||
--timeout 300 2>>"$LOG_FILE"; then
|
||||
log_info "$service_name stabilized successfully"
|
||||
else
|
||||
log_warn "$service_name stabilization timed out or failed"
|
||||
EXIT_CODE=1
|
||||
return 1
|
||||
fi
|
||||
|
||||
echo "$svc rolled back successfully"
|
||||
done
|
||||
# Get new task definition after rollback
|
||||
local new_task_def
|
||||
new_task_def=$(aws ecs describe-services \
|
||||
--cluster "$CLUSTER" \
|
||||
--services "$service_name" \
|
||||
--query 'services[0].taskDefinition' \
|
||||
--output text 2>/dev/null || echo "UNKNOWN")
|
||||
|
||||
echo "Rollback complete for $SERVICES"
|
||||
local running_count
|
||||
running_count=$(aws ecs describe-services \
|
||||
--cluster "$CLUSTER" \
|
||||
--services "$service_name" \
|
||||
--query 'services[0].runningCount' \
|
||||
--output text 2>/dev/null || echo "0")
|
||||
|
||||
local desired_count
|
||||
desired_count=$(aws ecs describe-services \
|
||||
--cluster "$CLUSTER" \
|
||||
--services "$service_name" \
|
||||
--query 'services[0].desiredCount' \
|
||||
--output text 2>/dev/null || echo "0")
|
||||
|
||||
log_info "Rollback complete: $service_name -> $new_task_def ($running_count/$desired_count running)"
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
# ─── Health Verification ─────────────────────────────────────────
|
||||
verify_health() {
|
||||
local svc="$1"
|
||||
local port
|
||||
port=$(case "$svc" in
|
||||
api) echo 3000 ;;
|
||||
darkwatch) echo 3001 ;;
|
||||
spamshield) echo 3002 ;;
|
||||
voiceprint) echo 3003 ;;
|
||||
*) echo 3000 ;;
|
||||
esac)
|
||||
|
||||
local alb_dns="https://${CLUSTER}-alb.${AWS_DEFAULT_REGION}.elb.amazonaws.com"
|
||||
|
||||
log_info "Verifying health for $svc (ALB: $alb_dns)..."
|
||||
|
||||
local http_code
|
||||
http_code=$(curl -s -o /dev/null -w "%{http_code}" \
|
||||
--connect-timeout 10 \
|
||||
--max-time 30 \
|
||||
"$alb_dns/health" 2>/dev/null || echo "000")
|
||||
|
||||
if [[ "$http_code" == "200" ]]; then
|
||||
log_info "Health check PASSED: $svc (HTTP $http_code)"
|
||||
return 0
|
||||
else
|
||||
log_warn "Health check FAILED: $svc (HTTP $http_code)"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
verify_all_services() {
|
||||
log_info "=== Post-Rollback Health Verification ==="
|
||||
local passed=0
|
||||
local failed=0
|
||||
|
||||
for svc in $(get_target_services); do
|
||||
if verify_health "$svc"; then
|
||||
((passed++))
|
||||
else
|
||||
((failed++))
|
||||
fi
|
||||
done
|
||||
|
||||
log_info "Verification complete: $passed passed, $failed failed"
|
||||
|
||||
if [[ $failed -gt 0 ]]; then
|
||||
log_warn "Some services failed health verification"
|
||||
EXIT_CODE=1
|
||||
fi
|
||||
}
|
||||
|
||||
# ─── Main Execution ──────────────────────────────────────────────
|
||||
main() {
|
||||
log_info "=== ShieldAI Rollback ==="
|
||||
log_info "Environment: $ENVIRONMENT"
|
||||
log_info "Service(s): $SERVICE"
|
||||
log_info "Cluster: $CLUSTER"
|
||||
log_info "Verify: $VERIFY"
|
||||
log_info "Timestamp: $TIMESTAMP"
|
||||
log_info "Log file: $LOG_FILE"
|
||||
log_info "=========================="
|
||||
|
||||
# Validate inputs
|
||||
validate_environment
|
||||
validate_service
|
||||
check_prerequisites
|
||||
|
||||
# Execute rollback for each target service
|
||||
local rolled_back=0
|
||||
local failed=0
|
||||
|
||||
for svc in $(get_target_services); do
|
||||
if rollback_service "$svc"; then
|
||||
((rolled_back++))
|
||||
else
|
||||
((failed++))
|
||||
fi
|
||||
done
|
||||
|
||||
log_info "=== Rollback Summary ==="
|
||||
log_info "Rolled back: $rolled_back services"
|
||||
log_info "Failed: $failed services"
|
||||
|
||||
# Post-rollback verification
|
||||
if [[ "$VERIFY" == "--verify" ]] || [[ "$VERIFY" == "true" ]]; then
|
||||
verify_all_services
|
||||
fi
|
||||
|
||||
if [[ $failed -gt 0 ]]; then
|
||||
log_error "Rollback completed with $failed failure(s)"
|
||||
log_info "Full log: $LOG_FILE"
|
||||
exit "$EXIT_CODE"
|
||||
fi
|
||||
|
||||
log_info "Rollback completed successfully"
|
||||
log_info "Full log: $LOG_FILE"
|
||||
exit 0
|
||||
}
|
||||
|
||||
main "$@"
|
||||
|
||||
Reference in New Issue
Block a user