automatic failover
This commit is contained in:
@@ -122,6 +122,10 @@ prompts:
|
|||||||
> tasks, only the first two models are used. The third model is only touched when
|
> tasks, only the first two models are used. The third model is only touched when
|
||||||
> a third concurrent task starts. Freed model slots are reused before new ones
|
> a third concurrent task starts. Freed model slots are reused before new ones
|
||||||
> are allocated.
|
> are allocated.
|
||||||
|
>
|
||||||
|
> **Automatic failover**: if a provider/API is unreachable (rate limit, 503, etc.),
|
||||||
|
> the task automatically cycles to the next model in the list without counting it
|
||||||
|
> as a task failure. Each model is tried once before the task is marked as failed.
|
||||||
|
|
||||||
The keys mirror the nested structure of `RalpiConfig` in `src/types.ts`.
|
The keys mirror the nested structure of `RalpiConfig` in `src/types.ts`.
|
||||||
|
|
||||||
|
|||||||
157
src/executor.ts
157
src/executor.ts
@@ -54,6 +54,10 @@ class ModelRoundRobin {
|
|||||||
this.freeSlots = [];
|
this.freeSlots = [];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
get length(): number {
|
||||||
|
return this.models.length;
|
||||||
|
}
|
||||||
|
|
||||||
assign(taskId: string): unknown {
|
assign(taskId: string): unknown {
|
||||||
let index: number;
|
let index: number;
|
||||||
if (this.freeSlots.length > 0) {
|
if (this.freeSlots.length > 0) {
|
||||||
@@ -385,8 +389,8 @@ export async function executeBatch(
|
|||||||
projectDir,
|
projectDir,
|
||||||
undefined,
|
undefined,
|
||||||
model,
|
model,
|
||||||
|
roundRobin,
|
||||||
);
|
);
|
||||||
roundRobin?.release(task.id);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -483,7 +487,8 @@ async function executeBatchParallel(
|
|||||||
projectDir,
|
projectDir,
|
||||||
sharedState,
|
sharedState,
|
||||||
assignedModel,
|
assignedModel,
|
||||||
).finally(() => roundRobin?.release(task.id)),
|
roundRobin,
|
||||||
|
),
|
||||||
});
|
});
|
||||||
|
|
||||||
// Limit concurrency
|
// Limit concurrency
|
||||||
@@ -514,75 +519,111 @@ async function executeTask(
|
|||||||
projectDir: string = project.sourceDir,
|
projectDir: string = project.sourceDir,
|
||||||
parallelState?: ParallelWidgetState,
|
parallelState?: ParallelWidgetState,
|
||||||
assignedModel?: unknown,
|
assignedModel?: unknown,
|
||||||
|
roundRobin?: ModelRoundRobin | null,
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
const maxRetries = config.execution.maxRetries;
|
const maxRetries = config.execution.maxRetries;
|
||||||
let retries = 0;
|
|
||||||
|
|
||||||
while (retries <= maxRetries) {
|
// Model failover: when a provider/API is down, cycle through available models.
|
||||||
try {
|
// result.success === false always means an agent-session failure (API error,
|
||||||
// Mark as in progress
|
// provider unreachable, etc.), not a task-work error.
|
||||||
progress.markInProgress(task.id);
|
const maxModelAttempts = roundRobin ? roundRobin.length : 1;
|
||||||
|
let modelAttempt = 0;
|
||||||
|
let currentModel: unknown = assignedModel ?? config.model;
|
||||||
|
|
||||||
// Get dependency reflections
|
while (modelAttempt < maxModelAttempts) {
|
||||||
const depReflections = progress.getDependencyReflections(
|
// Get the next model from round-robin (on first try, use the pre-assigned model)
|
||||||
task.dependencies || [],
|
if (modelAttempt > 0 && roundRobin) {
|
||||||
);
|
currentModel = roundRobin.assign(task.id);
|
||||||
|
}
|
||||||
|
|
||||||
// Run the task
|
let retries = 0;
|
||||||
const result = await runTask(
|
while (retries <= maxRetries) {
|
||||||
task,
|
try {
|
||||||
project,
|
// Mark as in progress
|
||||||
config,
|
progress.markInProgress(task.id);
|
||||||
depReflections,
|
|
||||||
ctx,
|
|
||||||
sendChatMessage,
|
|
||||||
projectDir,
|
|
||||||
parallelState,
|
|
||||||
assignedModel,
|
|
||||||
);
|
|
||||||
|
|
||||||
if (result.success) {
|
// Get dependency reflections
|
||||||
// Save reflection
|
const depReflections = progress.getDependencyReflections(
|
||||||
if (result.reflection) {
|
task.dependencies || [],
|
||||||
saveReflectionToFile(projectDir, config, result.reflection);
|
);
|
||||||
|
|
||||||
|
// Run the task
|
||||||
|
const result = await runTask(
|
||||||
|
task,
|
||||||
|
project,
|
||||||
|
config,
|
||||||
|
depReflections,
|
||||||
|
ctx,
|
||||||
|
sendChatMessage,
|
||||||
|
projectDir,
|
||||||
|
parallelState,
|
||||||
|
currentModel,
|
||||||
|
);
|
||||||
|
|
||||||
|
if (result.success) {
|
||||||
|
// Save reflection
|
||||||
|
if (result.reflection) {
|
||||||
|
saveReflectionToFile(projectDir, config, result.reflection);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mark completed with all metadata
|
||||||
|
progress.markCompleted(
|
||||||
|
task.id,
|
||||||
|
result.durationMs,
|
||||||
|
result.reflection,
|
||||||
|
result.toolUsage,
|
||||||
|
result.sessionFile,
|
||||||
|
result.outputPreview,
|
||||||
|
result.commitMessages,
|
||||||
|
result.commitSummary,
|
||||||
|
);
|
||||||
|
roundRobin?.release(task.id);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mark completed with all metadata
|
// Agent session failed (provider error).
|
||||||
progress.markCompleted(
|
// If we have more models, cycle immediately — don't waste retries.
|
||||||
task.id,
|
if (roundRobin && modelAttempt < maxModelAttempts - 1) {
|
||||||
result.durationMs,
|
roundRobin.release(task.id);
|
||||||
result.reflection,
|
modelAttempt++;
|
||||||
result.toolUsage,
|
ctx.ui.notify(
|
||||||
result.sessionFile,
|
`Task ${task.id}: model failed, trying next (${modelAttempt + 1}/${maxModelAttempts}): ${result.error}`,
|
||||||
result.outputPreview,
|
"warning",
|
||||||
result.commitMessages,
|
);
|
||||||
result.commitSummary,
|
break; // exit retry loop, cycle to next model
|
||||||
);
|
}
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Task failed, check if we should retry
|
// No more models — use normal retry logic
|
||||||
if (retries < maxRetries) {
|
if (retries < maxRetries) {
|
||||||
retries = progress.incrementRetry(task.id);
|
retries = progress.incrementRetry(task.id);
|
||||||
ctx.ui.notify(
|
ctx.ui.notify(
|
||||||
`Retrying task ${task.id} (${retries}/${maxRetries}): ${result.error}`,
|
`Retrying task ${task.id} (${retries}/${maxRetries}): ${result.error}`,
|
||||||
"warning",
|
"warning",
|
||||||
);
|
);
|
||||||
|
|
||||||
// Exponential backoff
|
// Exponential backoff
|
||||||
const delay = config.execution.retryDelayMs * 2 ** (retries - 1);
|
const delay = config.execution.retryDelayMs * 2 ** (retries - 1);
|
||||||
await sleep(delay);
|
await sleep(delay);
|
||||||
} else {
|
} else {
|
||||||
// Max retries exceeded
|
// Max retries exceeded
|
||||||
progress.markFailed(task.id, result.error || "Unknown error");
|
progress.markFailed(task.id, result.error || "Unknown error");
|
||||||
throw new Error(`Task ${task.id} failed: ${result.error}`);
|
throw new Error(`Task ${task.id} failed: ${result.error}`);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
roundRobin?.release(task.id);
|
||||||
|
const errorMsg = error instanceof Error ? error.message : String(error);
|
||||||
|
progress.markFailed(task.id, errorMsg);
|
||||||
|
throw error;
|
||||||
}
|
}
|
||||||
} catch (error) {
|
|
||||||
const errorMsg = error instanceof Error ? error.message : String(error);
|
|
||||||
progress.markFailed(task.id, errorMsg);
|
|
||||||
throw error;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If we broke out (model cycling), continue the outer loop
|
||||||
|
modelAttempt++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// All models exhausted
|
||||||
|
progress.markFailed(task.id, "All configured models exhausted");
|
||||||
|
throw new Error(`Task ${task.id} failed: all configured models exhausted`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ─── Save Reflection to File ────────────────────────────────────────────────
|
// ─── Save Reflection to File ────────────────────────────────────────────────
|
||||||
|
|||||||
Reference in New Issue
Block a user