establish db
This commit is contained in:
71
apps/web/scripts/retry-wiki.ts
Normal file
71
apps/web/scripts/retry-wiki.ts
Normal file
@@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Retry Wikipedia pages that got rate-limited
|
||||
*
|
||||
* Waits 5-7s (randomized) before each request, since these pages previously got HTTP 429.
|
||||
*/
|
||||
import "dotenv/config";
|
||||
import { closeDb } from "../src/lib/db/index";
|
||||
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs";
|
||||
import { resolve, dirname } from "path";
|
||||
import { fileURLToPath } from "url";
|
||||
|
||||
// Directory containing this script file, derived from the module's own URL.
const __filedir = dirname(fileURLToPath(import.meta.url));
|
||||
/**
 * Look up a cached value on disk.
 *
 * Entries live in the `.scraper-cache` directory next to this script, one
 * file per key, named after the URI-encoded key plus a `.json` suffix.
 *
 * @param k - Cache key.
 * @returns The file contents read as UTF-8, or `null` when no file exists
 *   for the key.
 */
function cacheGet(k: string): string | null {
  const entryName = encodeURIComponent(k) + ".json";
  const entryPath = resolve(__filedir, ".scraper-cache", entryName);
  if (!existsSync(entryPath)) {
    return null;
  }
  return readFileSync(entryPath, "utf-8");
}
|
||||
/**
 * Store a value in the on-disk cache.
 *
 * The value is written as UTF-8 to `.scraper-cache/<URI-encoded key>.json`
 * next to this script, overwriting any existing entry for the same key.
 *
 * @param k - Cache key.
 * @param v - Text to store.
 */
function cacheSet(k: string, v: string): void {
  const cacheDir = resolve(__filedir, ".scraper-cache");
  // With `recursive: true`, mkdirSync succeeds silently when the directory
  // already exists, so no separate (and racy) existence check is needed.
  mkdirSync(cacheDir, { recursive: true });
  writeFileSync(resolve(cacheDir, encodeURIComponent(k) + ".json"), v, "utf-8");
}
|
||||
|
||||
/**
 * Wikipedia page titles to fetch again. Every title follows the pattern
 * `List_of_<crop>_diseases`, so the list is built from the crop slugs.
 */
const PAGES_TO_RETRY = [
  "cranberry",
  "cucurbit",
  "grape",
  "hops",
  "rice",
  "rose",
  "sorghum",
  "soybean",
  "spinach",
  "strawberry",
  "sugarcane",
  "sunflower",
  "sweet_potato",
].map((crop) => `List_of_${crop}_diseases`);
|
||||
|
||||
/**
 * Fetch the wikitext of an English Wikipedia page, using the on-disk cache.
 *
 * A cached entry under the key `wt-<page>` is returned without any network
 * access. Otherwise the page is requested through the MediaWiki
 * `action=parse` API and the wikitext is cached before being returned.
 *
 * @param page - Wikipedia page title (e.g. `List_of_rice_diseases`).
 * @returns The page's wikitext.
 * @throws Error when the HTTP response is not OK (message `HTTP <status>`),
 *   when the API reports an error, or when the response has no wikitext.
 */
async function fetchWT(page: string): Promise<string> {
  const key = `wt-${page}`;
  const cached = cacheGet(key);
  // Truthiness check on purpose: an empty cache entry counts as a miss.
  if (cached) return cached;

  const url = `https://en.wikipedia.org/w/api.php?action=parse&page=${encodeURIComponent(page)}&prop=wikitext&format=json&formatversion=2`;
  const res = await fetch(url, { headers: { "User-Agent": "PlantDiseaseKB/1.0 (research)" } });
  if (!res.ok) throw new Error(`HTTP ${res.status}`);

  // `parse` is absent on error responses, so it is typed as optional and
  // checked below instead of being asserted into existence.
  const data = (await res.json()) as {
    parse?: { wikitext?: unknown };
    error?: { info: string };
  };
  if (data.error) throw new Error(data.error.info);

  const wikitext = data.parse?.wikitext;
  if (typeof wikitext !== "string") {
    // Fail with a clear message rather than caching a non-string value.
    throw new Error(`Unexpected API response for ${page}: missing wikitext`);
  }

  cacheSet(key, wikitext);
  return wikitext;
}
|
||||
|
||||
/**
 * Script entry point: fetch every page in `PAGES_TO_RETRY` sequentially.
 *
 * Before each fetch it waits 5000 ms plus up to 2000 ms of random jitter.
 * A failure for one page is printed and does not stop the remaining pages.
 * After the loop it waits a further 2000 ms, prints a summary line and
 * calls `closeDb()`.
 */
async function main() {
  // Promise-based sleep used for the pauses below.
  const pause = (ms: number) => new Promise((done) => setTimeout(done, ms));

  let fetched = 0;
  for (const title of PAGES_TO_RETRY) {
    // No trailing newline: the result is appended to the same output line.
    process.stdout.write(`📋 ${title}... `);
    try {
      await pause(5000 + Math.random() * 2000);
      const wikitext = await fetchWT(title);
      console.log(`✅ ${wikitext.length} bytes`);
      fetched += 1;
    } catch (err) {
      const reason = err instanceof Error ? err.message : err;
      console.log(`❌ ${reason}`);
    }
  }

  // NOTE(review): purpose of this final 2 s wait is not evident from this file.
  await pause(2000);
  console.log(`\nDone. ${fetched}/${PAGES_TO_RETRY.length} pages fetched`);
  closeDb();
}
|
||||
|
||||
// Run the script. If main() rejects, log the error and mark the process as
// failed so callers (shells, CI) do not see a zero exit status.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user