automatic failover

This commit is contained in:
2026-05-31 02:01:37 -04:00
parent 925e37938b
commit d2ef124369
2 changed files with 103 additions and 58 deletions

View File

@@ -122,6 +122,10 @@ prompts:
> tasks, only the first two models are used. The third model is only touched when > tasks, only the first two models are used. The third model is only touched when
> a third concurrent task starts. Freed model slots are reused before new ones > a third concurrent task starts. Freed model slots are reused before new ones
> are allocated. > are allocated.
>
> **Automatic failover**: if a model's provider/API is unreachable or returns an
> error (rate limit, 503, etc.), the task automatically cycles to the next model
> in the list without counting it as a task failure. Once no further models
> remain, the normal retry logic applies before the task is marked as failed.
The keys mirror the nested structure of `RalpiConfig` in `src/types.ts`. The keys mirror the nested structure of `RalpiConfig` in `src/types.ts`.

View File

@@ -54,6 +54,10 @@ class ModelRoundRobin {
this.freeSlots = []; this.freeSlots = [];
} }
/** Count of entries in this round-robin's `models` collection. */
get length(): number {
  const { models } = this;
  return models.length;
}
assign(taskId: string): unknown { assign(taskId: string): unknown {
let index: number; let index: number;
if (this.freeSlots.length > 0) { if (this.freeSlots.length > 0) {
@@ -385,8 +389,8 @@ export async function executeBatch(
projectDir, projectDir,
undefined, undefined,
model, model,
roundRobin,
); );
roundRobin?.release(task.id);
} }
} }
@@ -483,7 +487,8 @@ async function executeBatchParallel(
projectDir, projectDir,
sharedState, sharedState,
assignedModel, assignedModel,
).finally(() => roundRobin?.release(task.id)), roundRobin,
),
}); });
// Limit concurrency // Limit concurrency
@@ -514,10 +519,24 @@ async function executeTask(
projectDir: string = project.sourceDir, projectDir: string = project.sourceDir,
parallelState?: ParallelWidgetState, parallelState?: ParallelWidgetState,
assignedModel?: unknown, assignedModel?: unknown,
roundRobin?: ModelRoundRobin | null,
): Promise<void> { ): Promise<void> {
const maxRetries = config.execution.maxRetries; const maxRetries = config.execution.maxRetries;
let retries = 0;
// Model failover: when a provider/API is down, cycle through available models.
// result.success === false always means an agent-session failure (API error,
// provider unreachable, etc.), not a task-work error.
const maxModelAttempts = roundRobin ? roundRobin.length : 1;
let modelAttempt = 0;
let currentModel: unknown = assignedModel ?? config.model;
while (modelAttempt < maxModelAttempts) {
// Get the next model from round-robin (on first try, use the pre-assigned model)
if (modelAttempt > 0 && roundRobin) {
currentModel = roundRobin.assign(task.id);
}
let retries = 0;
while (retries <= maxRetries) { while (retries <= maxRetries) {
try { try {
// Mark as in progress // Mark as in progress
@@ -538,7 +557,7 @@ async function executeTask(
sendChatMessage, sendChatMessage,
projectDir, projectDir,
parallelState, parallelState,
assignedModel, currentModel,
); );
if (result.success) { if (result.success) {
@@ -558,10 +577,23 @@ async function executeTask(
result.commitMessages, result.commitMessages,
result.commitSummary, result.commitSummary,
); );
roundRobin?.release(task.id);
return; return;
} }
// Task failed, check if we should retry // Agent session failed (provider error).
// If we have more models, cycle immediately — don't waste retries.
if (roundRobin && modelAttempt < maxModelAttempts - 1) {
roundRobin.release(task.id);
modelAttempt++;
ctx.ui.notify(
`Task ${task.id}: model failed, trying next (${modelAttempt + 1}/${maxModelAttempts}): ${result.error}`,
"warning",
);
break; // exit retry loop, cycle to next model
}
// No more models — use normal retry logic
if (retries < maxRetries) { if (retries < maxRetries) {
retries = progress.incrementRetry(task.id); retries = progress.incrementRetry(task.id);
ctx.ui.notify( ctx.ui.notify(
@@ -578,11 +610,20 @@ async function executeTask(
throw new Error(`Task ${task.id} failed: ${result.error}`); throw new Error(`Task ${task.id} failed: ${result.error}`);
} }
} catch (error) { } catch (error) {
roundRobin?.release(task.id);
const errorMsg = error instanceof Error ? error.message : String(error); const errorMsg = error instanceof Error ? error.message : String(error);
progress.markFailed(task.id, errorMsg); progress.markFailed(task.id, errorMsg);
throw error; throw error;
} }
} }
// If we broke out (model cycling), continue the outer loop
modelAttempt++;
}
// All models exhausted
progress.markFailed(task.id, "All configured models exhausted");
throw new Error(`Task ${task.id} failed: all configured models exhausted`);
} }
// ─── Save Reflection to File ──────────────────────────────────────────────── // ─── Save Reflection to File ────────────────────────────────────────────────