72 lines
2.4 KiB
JavaScript
72 lines
2.4 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
 * Retry Wikipedia pages that were rate-limited.
 *
 * Waits 5–7 seconds (5 s plus up to 2 s of random jitter) before each page,
 * because these pages previously got HTTP 429.
 */
|
|
import "dotenv/config";
|
|
import { closeDb } from "../src/lib/db/index";
|
|
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs";
|
|
import { resolve, dirname } from "path";
|
|
import { fileURLToPath } from "url";
|
|
|
|
const __filedir = dirname(fileURLToPath(import.meta.url));
|
|
/**
 * Look up a cached entry on disk.
 *
 * Entries live in the `.scraper-cache` directory under `__filedir`, one file
 * per key, named after the URI-encoded key with a `.json` suffix.
 *
 * @param k Cache key.
 * @returns The file's contents decoded as UTF-8, or `null` when no file exists for the key.
 */
function cacheGet(k: string): string | null {
  const fileName = encodeURIComponent(k) + ".json";
  const cachePath = resolve(__filedir, ".scraper-cache", fileName);
  if (!existsSync(cachePath)) {
    return null;
  }
  return readFileSync(cachePath, "utf-8");
}
|
|
/**
 * Store a value on disk under the given cache key.
 *
 * Creates the `.scraper-cache` directory under `__filedir` when it is missing,
 * then writes the value verbatim (UTF-8, not JSON-encoded despite the `.json`
 * suffix) to a file named after the URI-encoded key.
 *
 * @param k Cache key.
 * @param v Text to store.
 */
function cacheSet(k: string, v: string) {
  const cacheDir = resolve(__filedir, ".scraper-cache");
  const dirExists = existsSync(cacheDir);
  if (!dirExists) {
    mkdirSync(cacheDir, { recursive: true });
  }
  const target = resolve(cacheDir, encodeURIComponent(k) + ".json");
  writeFileSync(target, v, "utf-8");
}
|
|
|
|
/**
 * Wikipedia page titles to fetch again.
 *
 * Every title has the form `List_of_<crop>_diseases`, so the list is built
 * from the crop names alone.
 */
const PAGES_TO_RETRY = [
  "cranberry",
  "cucurbit",
  "grape",
  "hops",
  "rice",
  "rose",
  "sorghum",
  "soybean",
  "spinach",
  "strawberry",
  "sugarcane",
  "sunflower",
  "sweet_potato",
].map((crop) => `List_of_${crop}_diseases`);
|
|
|
|
/**
 * Fetch the wikitext of an English Wikipedia page, using the on-disk cache when possible.
 *
 * A non-empty cached value stored under the key `wt-<page>` is returned without
 * any network request. Otherwise the page is requested through the MediaWiki
 * `action=parse` API (`prop=wikitext`, `formatversion=2`) and the resulting
 * wikitext is cached before being returned.
 *
 * @param page Page title, e.g. `List_of_rice_diseases`; it is URI-encoded into the query string.
 * @returns The page's wikitext.
 * @throws Error when the HTTP status is not OK (message `HTTP <status>`), when
 *   the API response contains an `error` object, or when the response carries
 *   no wikitext string.
 */
async function fetchWT(page: string): Promise<string> {
  const key = `wt-${page}`;
  const cached = cacheGet(key);
  // Truthiness check: an empty cache file counts as a miss and is refetched.
  if (cached) return cached;

  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&prop=wikitext&format=json&formatversion=2`;
  const res = await fetch(url, { headers: { "User-Agent": "PlantDiseaseKB/1.0 (research)" } });
  if (!res.ok) throw new Error(`HTTP ${res.status}`);

  // The body is external JSON, so nothing about its shape is guaranteed:
  // every field is treated as optional until it has been checked.
  const data = (await res.json()) as {
    parse?: { wikitext?: unknown };
    error?: { info: string };
  } | null;
  if (data?.error) throw new Error(data.error.info);

  const wikitext = data?.parse?.wikitext;
  if (typeof wikitext !== "string") {
    // Fail with a descriptive message instead of a TypeError, and never cache
    // a non-string value.
    throw new Error(`No wikitext in API response for "${page}"`);
  }

  cacheSet(key, wikitext);
  return wikitext;
}
|
|
|
|
/**
 * Fetch every page in PAGES_TO_RETRY, one after another, and print a summary.
 *
 * Before each page it waits 5–7 seconds (5 s plus random jitter) and then calls
 * fetchWT. A failure on one page is printed and does not stop the remaining
 * pages. After the loop it waits a further 2 seconds, prints how many pages
 * succeeded, and calls closeDb().
 */
async function main() {
  // Promise-based wrapper around setTimeout.
  const pause = (ms: number) => new Promise((done) => setTimeout(done, ms));
  let fetched = 0;

  for (const title of PAGES_TO_RETRY) {
    // No newline here: the result marker is appended to the same output line.
    process.stdout.write(`📋 ${title}... `);
    try {
      await pause(5000 + Math.random() * 2000);
      const wikitext = await fetchWT(title);
      // The figure printed is the string length of the wikitext.
      console.log(`✅ ${wikitext.length} bytes`);
      fetched += 1;
    } catch (err) {
      const reason = err instanceof Error ? err.message : err;
      console.log(`❌ ${reason}`);
    }
  }

  await pause(2000);
  console.log(`\nDone. ${fetched}/${PAGES_TO_RETRY.length} pages fetched`);
  closeDb();
}
|
|
|
|
// Entry point. An error that escapes main() is logged, and the exit code is set
// to 1 so that shells and CI jobs see the run as failed instead of status 0.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|