#!/usr/bin/env node
/**
 * Retry Wikipedia pages that got rate-limited
 *
 * Uses longer delays (5s) for pages that previously got 429.
 */

import "dotenv/config";
import { closeDb } from "../src/lib/db/index";
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs";
import { resolve, dirname } from "path";
import { fileURLToPath } from "url";

const __filedir = dirname(fileURLToPath(import.meta.url));

/** Directory, next to this script, where fetched wikitext is cached. */
const CACHE_DIR = resolve(__filedir, ".scraper-cache");

/**
 * Maps a cache key to its file inside {@link CACHE_DIR}.
 * The key is URI-encoded so it is safe to use as a single file name.
 */
function cachePath(k: string): string {
  return resolve(CACHE_DIR, encodeURIComponent(k) + ".json");
}

/**
 * Reads a cached value.
 *
 * @param k Cache key.
 * @returns The stored text, or `null` when no cache file exists for the key.
 */
function cacheGet(k: string): string | null {
  const p = cachePath(k);
  return existsSync(p) ? readFileSync(p, "utf-8") : null;
}

/**
 * Stores a value in the cache, creating the cache directory on first use.
 *
 * @param k Cache key.
 * @param v Text to store.
 */
function cacheSet(k: string, v: string): void {
  if (!existsSync(CACHE_DIR)) mkdirSync(CACHE_DIR, { recursive: true });
  writeFileSync(cachePath(k), v, "utf-8");
}

/** Page titles this script attempts to fetch. */
const PAGES_TO_RETRY = [
  "List_of_cranberry_diseases",
  "List_of_cucurbit_diseases",
  "List_of_grape_diseases",
  "List_of_hops_diseases",
  "List_of_rice_diseases",
  "List_of_rose_diseases",
  "List_of_sorghum_diseases",
  "List_of_soybean_diseases",
  "List_of_spinach_diseases",
  "List_of_strawberry_diseases",
  "List_of_sugarcane_diseases",
  "List_of_sunflower_diseases",
  "List_of_sweet_potato_diseases",
];

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => setTimeout(done, ms));
}

/**
 * Fetches the wikitext of a Wikipedia page, using the on-disk cache when possible.
 *
 * @param page Page title as used in the `page` query parameter.
 * @returns The page's wikitext.
 * @throws Error `HTTP <status>` when the response is not OK, the API's own
 *   `error.info` message when the body reports an error, or a descriptive
 *   error when the body carries no string `parse.wikitext`.
 */
async function fetchWT(page: string): Promise<string> {
  const key = `wt-${page}`;
  const cached = cacheGet(key);
  // Compare against null so a cached empty string still counts as a hit.
  if (cached !== null) return cached;

  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&prop=wikitext&format=json&formatversion=2`;
  const r = await fetch(url, { headers: { "User-Agent": "PlantDiseaseKB/1.0 (research)" } });
  if (!r.ok) throw new Error(`HTTP ${r.status}`);

  // Every field is optional here: the body is external input and is checked below.
  const d = (await r.json()) as {
    parse?: { wikitext?: unknown };
    error?: { info: string };
  };
  if (d.error) throw new Error(d.error.info);

  const wikitext = d.parse?.wikitext;
  if (typeof wikitext !== "string") {
    // Fail with a clear message instead of a TypeError, and never cache a non-string.
    throw new Error("Unexpected API response: missing parse.wikitext");
  }

  cacheSet(key, wikitext);
  return wikitext;
}

/**
 * Fetches every page in {@link PAGES_TO_RETRY} one at a time, waiting 5–7
 * seconds before each attempt, and prints a per-page result plus a final tally.
 * A failure on one page is logged and does not stop the remaining pages.
 */
async function main(): Promise<void> {
  let success = 0;
  try {
    for (const page of PAGES_TO_RETRY) {
      process.stdout.write(`📋 ${page}... \n`);
      try {
        // Randomised 5–7 s pause before each attempt (also applies on a cache hit).
        await sleep(5000 + Math.random() * 2000);
        const wt = await fetchWT(page);
        console.log(`✅ ${wt.length} bytes`);
        success++;
      } catch (e: unknown) {
        console.log(`❌ ${e instanceof Error ? e.message : String(e)}`);
      }
    }
    // NOTE(review): 2 s pause before the summary; its purpose is not evident from this file.
    await sleep(2000);
    console.log(`\nDone. ${success}/${PAGES_TO_RETRY.length} pages fetched`);
  } finally {
    // Run the DB cleanup even if something above throws unexpectedly.
    // `await` is harmless if closeDb is synchronous.
    await closeDb();
  }
}

main().catch((e: unknown) => {
  console.error(e);
  // Report the fatal error through a non-zero exit status.
  process.exitCode = 1;
});