automatic failover
This commit is contained in:
@@ -122,6 +122,10 @@ prompts:
|
||||
> tasks, only the first two models are used. The third model is only touched when
|
||||
> a third concurrent task starts. Freed model slots are reused before new ones
|
||||
> are allocated.
|
||||
>
|
||||
> **Automatic failover**: if a provider/API is unreachable (rate limit, 503, etc.),
|
||||
> the task automatically cycles to the next model in the list without counting it
|
||||
> as a task failure. Each model is tried once; only when the last model in the list also fails does the normal retry logic apply, after which the task is marked as failed.
|
||||
|
||||
The keys mirror the nested structure of `RalpiConfig` in `src/types.ts`.
|
||||
|
||||
|
||||
@@ -54,6 +54,10 @@ class ModelRoundRobin {
|
||||
this.freeSlots = [];
|
||||
}
|
||||
|
||||
get length(): number {
|
||||
return this.models.length;
|
||||
}
|
||||
|
||||
assign(taskId: string): unknown {
|
||||
let index: number;
|
||||
if (this.freeSlots.length > 0) {
|
||||
@@ -385,8 +389,8 @@ export async function executeBatch(
|
||||
projectDir,
|
||||
undefined,
|
||||
model,
|
||||
roundRobin,
|
||||
);
|
||||
roundRobin?.release(task.id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -483,7 +487,8 @@ async function executeBatchParallel(
|
||||
projectDir,
|
||||
sharedState,
|
||||
assignedModel,
|
||||
).finally(() => roundRobin?.release(task.id)),
|
||||
roundRobin,
|
||||
),
|
||||
});
|
||||
|
||||
// Limit concurrency
|
||||
@@ -514,10 +519,24 @@ async function executeTask(
|
||||
projectDir: string = project.sourceDir,
|
||||
parallelState?: ParallelWidgetState,
|
||||
assignedModel?: unknown,
|
||||
roundRobin?: ModelRoundRobin | null,
|
||||
): Promise<void> {
|
||||
const maxRetries = config.execution.maxRetries;
|
||||
let retries = 0;
|
||||
|
||||
// Model failover: when a provider/API is down, cycle through available models.
|
||||
// result.success === false always means an agent-session failure (API error,
|
||||
// provider unreachable, etc.), not a task-work error.
|
||||
const maxModelAttempts = roundRobin ? roundRobin.length : 1;
|
||||
let modelAttempt = 0;
|
||||
let currentModel: unknown = assignedModel ?? config.model;
|
||||
|
||||
while (modelAttempt < maxModelAttempts) {
|
||||
// Get the next model from round-robin (on first try, use the pre-assigned model)
|
||||
if (modelAttempt > 0 && roundRobin) {
|
||||
currentModel = roundRobin.assign(task.id);
|
||||
}
|
||||
|
||||
let retries = 0;
|
||||
while (retries <= maxRetries) {
|
||||
try {
|
||||
// Mark as in progress
|
||||
@@ -538,7 +557,7 @@ async function executeTask(
|
||||
sendChatMessage,
|
||||
projectDir,
|
||||
parallelState,
|
||||
assignedModel,
|
||||
currentModel,
|
||||
);
|
||||
|
||||
if (result.success) {
|
||||
@@ -558,10 +577,23 @@ async function executeTask(
|
||||
result.commitMessages,
|
||||
result.commitSummary,
|
||||
);
|
||||
roundRobin?.release(task.id);
|
||||
return;
|
||||
}
|
||||
|
||||
// Task failed, check if we should retry
|
||||
// Agent session failed (provider error).
|
||||
// If we have more models, cycle immediately — don't waste retries.
|
||||
if (roundRobin && modelAttempt < maxModelAttempts - 1) {
|
||||
roundRobin.release(task.id);
|
||||
modelAttempt++;
|
||||
ctx.ui.notify(
|
||||
`Task ${task.id}: model failed, trying next (${modelAttempt + 1}/${maxModelAttempts}): ${result.error}`,
|
||||
"warning",
|
||||
);
|
||||
break; // exit retry loop, cycle to next model
|
||||
}
|
||||
|
||||
// No more models — use normal retry logic
|
||||
if (retries < maxRetries) {
|
||||
retries = progress.incrementRetry(task.id);
|
||||
ctx.ui.notify(
|
||||
@@ -578,11 +610,20 @@ async function executeTask(
|
||||
throw new Error(`Task ${task.id} failed: ${result.error}`);
|
||||
}
|
||||
} catch (error) {
|
||||
roundRobin?.release(task.id);
|
||||
const errorMsg = error instanceof Error ? error.message : String(error);
|
||||
progress.markFailed(task.id, errorMsg);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
// If we broke out (model cycling), continue the outer loop
|
||||
modelAttempt++;
|
||||
}
|
||||
|
||||
// All models exhausted
|
||||
progress.markFailed(task.id, "All configured models exhausted");
|
||||
throw new Error(`Task ${task.id} failed: all configured models exhausted`);
|
||||
}
|
||||
|
||||
// ─── Save Reflection to File ────────────────────────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user