automatic failover

2026-05-31 02:01:37 -04:00
parent 925e37938b
commit d2ef124369
2 changed files with 103 additions and 58 deletions
--- a/README.md
+++ b/README.md
@@ -122,6 +122,10 @@ prompts:
 > tasks, only the first two models are used. The third model is only touched when
 > a third concurrent task starts. Freed model slots are reused before new ones
 > are allocated.
+>
+> **Automatic failover**: if a provider/API call fails (rate limit, HTTP 503, outage, etc.),
+> the task automatically switches to the next model in the list, and that attempt is not
+> counted as a task failure. Each model is tried once before the task is marked as failed.

 The keys mirror the nested structure of `RalpiConfig` in `src/types.ts`.

--- a/src/executor.ts
+++ b/src/executor.ts
@@ -54,6 +54,10 @@ class ModelRoundRobin {
 		this.freeSlots = [];
 	}

+	get length(): number {
+		return this.models.length;
+	}
+
 	assign(taskId: string): unknown {
 		let index: number;
 		if (this.freeSlots.length > 0) {
@@ -385,8 +389,8 @@ export async function executeBatch(
 			projectDir,
 			undefined,
 			model,
+			roundRobin,
 		);
-		roundRobin?.release(task.id);
 	}
 }

@@ -483,7 +487,8 @@ async function executeBatchParallel(
 				projectDir,
 				sharedState,
 				assignedModel,
-			).finally(() => roundRobin?.release(task.id)),
+				roundRobin,
+			),
 		});

 		// Limit concurrency
@@ -514,10 +519,24 @@ async function executeTask(
 	projectDir: string = project.sourceDir,
 	parallelState?: ParallelWidgetState,
 	assignedModel?: unknown,
+	roundRobin?: ModelRoundRobin | null,
 ): Promise<void> {
 	const maxRetries = config.execution.maxRetries;
-	let retries = 0;

+	// Model failover: when a provider/API is down, cycle through available models.
+	// result.success === false always means an agent-session failure (API error,
+	// provider unreachable, etc.), not a task-work error.
+	const maxModelAttempts = roundRobin ? roundRobin.length : 1;
+	let modelAttempt = 0;
+	let currentModel: unknown = assignedModel ?? config.model;
+
+	while (modelAttempt < maxModelAttempts) {
+		// Get the next model from round-robin (on first try, use the pre-assigned model)
+		if (modelAttempt > 0 && roundRobin) {
+			currentModel = roundRobin.assign(task.id);
+		}
+
+		let retries = 0;
 		while (retries <= maxRetries) {
 			try {
 				// Mark as in progress
@@ -538,7 +557,7 @@ async function executeTask(
 					sendChatMessage,
 					projectDir,
 					parallelState,
-				assignedModel,
+					currentModel,
 				);

 				if (result.success) {
@@ -558,10 +577,23 @@ async function executeTask(
 						result.commitMessages,
 						result.commitSummary,
 					);
+					roundRobin?.release(task.id);
 					return;
 				}

-			// Task failed, check if we should retry
+				// Agent session failed (provider error).
+				// If we have more models, cycle immediately — don't waste retries.
+				if (roundRobin && modelAttempt < maxModelAttempts - 1) {
+					roundRobin.release(task.id);
+					modelAttempt++;
+					ctx.ui.notify(
+						`Task ${task.id}: model failed, trying next (${modelAttempt + 1}/${maxModelAttempts}): ${result.error}`,
+						"warning",
+					);
+					break; // exit retry loop, cycle to next model
+				}
+
+				// No more models — use normal retry logic
 				if (retries < maxRetries) {
 					retries = progress.incrementRetry(task.id);
 					ctx.ui.notify(
@@ -578,11 +610,20 @@ async function executeTask(
 					throw new Error(`Task ${task.id} failed: ${result.error}`);
 				}
 			} catch (error) {
+				roundRobin?.release(task.id);
 				const errorMsg = error instanceof Error ? error.message : String(error);
 				progress.markFailed(task.id, errorMsg);
 				throw error;
 			}
 		}
+
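+		// Reached via the model-cycling break. NOTE(review): modelAttempt was already incremented before that break, so this second increment appears to skip a model — confirm.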
+		modelAttempt++;
+	}
+
+	// All models exhausted
+	progress.markFailed(task.id, "All configured models exhausted");
+	throw new Error(`Task ${task.id} failed: all configured models exhausted`);
 }

 // ─── Save Reflection to File ────────────────────────────────────────────────