Files
plant-disease-id/apps/web/scripts/retry-wiki.ts
2026-06-05 20:30:28 -04:00

72 lines
2.4 KiB
JavaScript

#!/usr/bin/env node
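/**
* Retry Wikipedia pages that previously got rate-limited (HTTP 429).
*
* Pages are fetched one at a time with a randomised 5–7 s pause before each
* request. Fetched wikitext is cached in `.scraper-cache` next to this script;
* a page that fails again is logged and skipped.
*/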
import "dotenv/config";
import { closeDb } from "../src/lib/db/index";
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs";
import { resolve, dirname } from "path";
import { fileURLToPath } from "url";
const __filedir = dirname(fileURLToPath(import.meta.url));
/**
 * Look up a cache entry on disk.
 *
 * Entries live in the `.scraper-cache` directory beside this script, one file
 * per key, named after the URI-encoded key with a `.json` suffix. The file
 * content is returned verbatim as UTF-8 text; it is not parsed.
 *
 * @param k - Cache key.
 * @returns The stored text, or `null` when no file exists for the key.
 */
function cacheGet(k: string): string | null {
  const entryPath = resolve(__filedir, ".scraper-cache", `${encodeURIComponent(k)}.json`);
  if (!existsSync(entryPath)) {
    return null;
  }
  return readFileSync(entryPath, "utf-8");
}
/**
 * Store a cache entry on disk, overwriting any previous value for the key.
 *
 * The `.scraper-cache` directory beside this script is created on first use.
 * The value is written as-is (UTF-8) to a file named after the URI-encoded key
 * with a `.json` suffix.
 *
 * @param k - Cache key.
 * @param v - Text to store.
 */
function cacheSet(k: string, v: string) {
  const cacheDir = resolve(__filedir, ".scraper-cache");
  const dirExists = existsSync(cacheDir);
  if (!dirExists) {
    mkdirSync(cacheDir, { recursive: true });
  }
  const entryPath = resolve(cacheDir, `${encodeURIComponent(k)}.json`);
  writeFileSync(entryPath, v, "utf-8");
}
/**
 * Wikipedia page titles to fetch again, in the order they are attempted.
 *
 * Every title has the form `List_of_<crop>_diseases`, so only the crop part is
 * spelled out here and the full title is derived from it.
 */
const PAGES_TO_RETRY = [
  "cranberry",
  "cucurbit",
  "grape",
  "hops",
  "rice",
  "rose",
  "sorghum",
  "soybean",
  "spinach",
  "strawberry",
  "sugarcane",
  "sunflower",
  "sweet_potato",
].map((crop) => `List_of_${crop}_diseases`);
/**
 * Fetch the wikitext of an English Wikipedia page via the MediaWiki `parse`
 * API, going through the on-disk cache first.
 *
 * On a cache miss the page is requested, the response is validated, and the
 * wikitext is cached under the key `wt-<page>` before being returned.
 *
 * @param page - Page title (e.g. `"List_of_rice_diseases"`); it is URI-encoded
 *   before being placed in the request URL.
 * @returns The page's wikitext.
 * @throws Error with message `HTTP <status>` when the response status is not OK,
 *   with the API's `error.info` when the API reports an error, or with a
 *   descriptive message when the payload carries no wikitext string.
 */
async function fetchWT(page: string): Promise<string> {
  const key = `wt-${page}`;
  const cached = cacheGet(key);
  // An empty cached string is falsy, so it is treated as a miss and refetched.
  if (cached) return cached;

  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&prop=wikitext&format=json&formatversion=2`;
  const res = await fetch(url, { headers: { "User-Agent": "PlantDiseaseKB/1.0 (research)" } });
  if (!res.ok) throw new Error(`HTTP ${res.status}`);

  // The payload comes from the network: type every field as optional/unknown
  // and narrow at runtime instead of trusting an asserted shape.
  const data = (await res.json()) as {
    parse?: { wikitext?: unknown };
    error?: { info?: unknown };
  } | null;

  if (data?.error) {
    const info = data.error.info;
    throw new Error(typeof info === "string" && info !== "" ? info : "Wikipedia API returned an error");
  }

  const wikitext = data?.parse?.wikitext;
  if (typeof wikitext !== "string") {
    // Fail before caching so a malformed response is never written to disk.
    throw new Error(`Unexpected API response for ${page}: no wikitext in payload`);
  }

  cacheSet(key, wikitext);
  return wikitext;
}
/**
 * Script entry point: fetch every page in `PAGES_TO_RETRY` sequentially.
 *
 * For each page the title is printed, the script pauses for a randomised
 * 5–7 s, and then the page is fetched. A failure is printed on the same line
 * and does not stop the loop. After the last page the script waits 2 s, prints
 * a summary, and closes the database handle.
 */
async function main() {
  const sleep = (ms: number) => new Promise((done) => setTimeout(done, ms));
  let fetched = 0;

  for (const title of PAGES_TO_RETRY) {
    // No trailing newline: the result (size or error) completes this line.
    process.stdout.write(`📋 ${title}... `);
    try {
      // The pause happens before every attempt, including ones served from cache.
      await sleep(5000 + Math.random() * 2000);
      const wikitext = await fetchWT(title);
      console.log(`${wikitext.length} bytes`);
      fetched += 1;
    } catch (err) {
      console.log(`${err instanceof Error ? err.message : err}`);
    }
  }

  await sleep(2000);
  console.log(`\nDone. ${fetched}/${PAGES_TO_RETRY.length} pages fetched`);
  closeDb();
}

// Anything that escapes main() is reported on stderr.
main().catch(console.error);