import "./index.d.ts"; import { url2title, getPageRecord, writeRecord, removeRecord, } from "./analyser/toolkit.ts"; import { requestMetricsRun, checkStatus, retrieveMetrics, } from "./analyser/metrics.ts"; const INPUT_FILE = Deno.args[0] ?? "./pages.txt"; const OUTPUT_PATH = Deno.args[1] ?? "./content"; // results are written here const RECHECK_THRESHOLD = 60 * 60 * 24 * 7 * 1000; // recheck pages older than 1 week const REJECT_THRESHOLD = 262144; // 256KB (duh) const PARALLEL_JOBS = 3; // max YLT jobs const now = Date.now(); const pages = await getPageList(); // all pages const pagesUpdating: string[] = []; // currently running ylt jobs async function getPageList(): Promise { const inputContent = await Deno.readTextFile(INPUT_FILE); return inputContent.split("\n").filter((line) => line.startsWith("http")); } async function updateRecord(runId: string, url: string): Promise { const oldRecord = await getPageRecord(url, OUTPUT_PATH); const metrics = await retrieveMetrics(runId); if (!metrics) { console.error("failed to retrieve results for", url, runId); return false; } // poor mans toISODateString const now = new Date().toISOString().split("T")[0]; const weight = metrics.metrics.contentLength; const ratio = Math.round((metrics.metrics.htmlSize / weight) * 100); if (weight > REJECT_THRESHOLD) { console.log(url, "rejected! Weighs", Math.round(weight / 1024), "kb"); if (oldRecord) { removeRecord(url, OUTPUT_PATH).catch(() => { console.error("Failed to remove old record of rejected url", url); }); } return false; } const record: PageRecord = { title: url2title(url), date: oldRecord === null ? now : oldRecord.date, updated: now, weight, extra: { source: url, ratio, size: Math.round(weight / 1024), }, }; const success = await writeRecord(record, url, OUTPUT_PATH); if (success) { console.log(url, "successfully updated"); } else { console.error(url, "record could not be written!"); } } async function checkPage(url: string) { const record = await getPageRecord(url, OUTPUT_PATH); const lastUpdated = Date.parse(record?.updated || ""); const needsCheck = !record || now - lastUpdated > RECHECK_THRESHOLD; if (!needsCheck) { console.log(url, "is up-to-date"); return true; } const runId = await requestMetricsRun(url); if (!runId) { console.error(url, "updating failed!"); return false; } console.log(url, "new or outdated, runId is", runId); return runId; } function sleep(duration: number) { return new Promise((resolve) => { setTimeout(() => resolve(), duration); }); } async function handleBatch() { if (!pages.length) return; // done, yeah! const batch = pages.splice(0, PARALLEL_JOBS); const jobs = batch.map((url) => checkPage(url)); while (jobs.length) { // take the first job and check // if the check fails, it will be added back to the end of the list const runId = await jobs.shift(); // page is up-to-date or YLT has an error if (runId === true || runId === false) continue; // TODO: handle failures more gracefully const { url, status } = await checkStatus(runId); if (status === "failed") { console.error(url, "YLT analysis failed"); continue; } else if (status === "complete") { console.log(url, "updating record..."); await updateRecord(runId, url); continue; } else { // not done yet, add it back jobs.push(runId); // wait a bit before checking again await sleep(1000); } } handleBatch(); } handleBatch();