automatic failover

This commit is contained in:
2026-05-31 02:01:37 -04:00
parent 925e37938b
commit d2ef124369
2 changed files with 103 additions and 58 deletions

View File

@@ -122,6 +122,10 @@ prompts:
> tasks, only the first two models are used. The third model is only touched when
> a third concurrent task starts. Freed model slots are reused before new ones
> are allocated.
>
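> **Automatic failover**: if a model's provider/API call fails (rate limit, 503,
> provider unreachable, etc.), the task automatically switches to the next model
> in the list. The switch does not count as a task failure and does not consume
> a retry. Each model is tried at most once; when no further model is left, the
> normal retry limit applies before the task is marked as failed.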
The keys mirror the nested structure of `RalpiConfig` in `src/types.ts`.

View File

@@ -54,6 +54,10 @@ class ModelRoundRobin {
this.freeSlots = [];
}
get length(): number {
return this.models.length;
}
assign(taskId: string): unknown {
let index: number;
if (this.freeSlots.length > 0) {
@@ -385,8 +389,8 @@ export async function executeBatch(
projectDir,
undefined,
model,
roundRobin,
);
roundRobin?.release(task.id);
}
}
@@ -483,7 +487,8 @@ async function executeBatchParallel(
projectDir,
sharedState,
assignedModel,
).finally(() => roundRobin?.release(task.id)),
roundRobin,
),
});
// Limit concurrency
@@ -514,10 +519,24 @@ async function executeTask(
projectDir: string = project.sourceDir,
parallelState?: ParallelWidgetState,
assignedModel?: unknown,
roundRobin?: ModelRoundRobin | null,
): Promise<void> {
const maxRetries = config.execution.maxRetries;
let retries = 0;
// Model failover: when a provider/API is down, cycle through available models.
// result.success === false always means an agent-session failure (API error,
// provider unreachable, etc.), not a task-work error.
const maxModelAttempts = roundRobin ? roundRobin.length : 1;
let modelAttempt = 0;
let currentModel: unknown = assignedModel ?? config.model;
while (modelAttempt < maxModelAttempts) {
// Get the next model from round-robin (on first try, use the pre-assigned model)
if (modelAttempt > 0 && roundRobin) {
currentModel = roundRobin.assign(task.id);
}
let retries = 0;
while (retries <= maxRetries) {
try {
// Mark as in progress
@@ -538,7 +557,7 @@ async function executeTask(
sendChatMessage,
projectDir,
parallelState,
assignedModel,
currentModel,
);
if (result.success) {
@@ -558,10 +577,23 @@ async function executeTask(
result.commitMessages,
result.commitSummary,
);
roundRobin?.release(task.id);
return;
}
// Task failed, check if we should retry
// Agent session failed (provider error).
// If we have more models, cycle immediately — don't waste retries.
if (roundRobin && modelAttempt < maxModelAttempts - 1) {
roundRobin.release(task.id);
modelAttempt++;
ctx.ui.notify(
`Task ${task.id}: model failed, trying next (${modelAttempt + 1}/${maxModelAttempts}): ${result.error}`,
"warning",
);
break; // exit retry loop, cycle to next model
}
// No more models — use normal retry logic
if (retries < maxRetries) {
retries = progress.incrementRetry(task.id);
ctx.ui.notify(
@@ -578,11 +610,20 @@ async function executeTask(
throw new Error(`Task ${task.id} failed: ${result.error}`);
}
} catch (error) {
roundRobin?.release(task.id);
const errorMsg = error instanceof Error ? error.message : String(error);
progress.markFailed(task.id, errorMsg);
throw error;
}
}
// If we broke out (model cycling), continue the outer loop
modelAttempt++;
}
// All models exhausted
progress.markFailed(task.id, "All configured models exhausted");
throw new Error(`Task ${task.id} failed: all configured models exhausted`);
}
// ─── Save Reflection to File ────────────────────────────────────────────────