You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

137 lines
3.6 KiB
TypeScript

import "./index.d.ts";
import {
url2title,
getPageRecord,
writeRecord,
removeRecord,
} from "./analyser/toolkit.ts";
import {
requestMetricsRun,
checkStatus,
retrieveMetrics,
} from "./analyser/metrics.ts";
// --- Configuration -----------------------------------------------------------
// First CLI argument: text file listing the pages; second: output directory.
const INPUT_FILE = Deno.args[0] ?? "./pages.txt";
const OUTPUT_PATH = Deno.args[1] ?? "./content";
// Records last updated more than one week ago (in milliseconds) are rechecked.
const RECHECK_THRESHOLD = 7 * 24 * 60 * 60 * 1000;
// Pages heavier than 256 KiB (in bytes) are rejected.
const REJECT_THRESHOLD = 256 * 1024;
// Number of pages taken from the list per batch (max parallel YLT jobs).
const PARALLEL_JOBS = 3;

// --- Run state ---------------------------------------------------------------
// Timestamp taken once at start-up; record ages are measured against it.
const now = Date.now();
// Remaining pages to process; batches are spliced off the front of this list.
const pages = await getPageList();
// Meant to hold the URLs of currently running YLT jobs; not referenced in this
// section of the file.
const pagesUpdating: string[] = [];
/**
 * Reads INPUT_FILE and returns the page URLs it lists, one per line.
 *
 * Lines that do not start with "http" (blank lines, notes, ...) are skipped.
 *
 * @returns the URLs in file order
 */
async function getPageList(): Promise<string[]> {
  const inputContent = await Deno.readTextFile(INPUT_FILE);
  return inputContent
    .split("\n")
    // Drop trailing whitespace, including the "\r" left behind by CRLF line
    // endings, so it does not end up as part of the URL.
    .map((line) => line.trimEnd())
    .filter((line) => line.startsWith("http"));
}
/**
 * Retrieves the results of a finished metrics run and stores them as the
 * record for `url`.
 *
 * Pages whose content length exceeds REJECT_THRESHOLD are rejected: no record
 * is written and a previously stored record, if any, is removed.
 *
 * @param runId id of the metrics run to read the results from
 * @param url page the run was requested for
 * @returns `true` when the record was written; `false` when the results could
 *   not be retrieved, the page was rejected, or the record could not be written
 */
async function updateRecord(runId: string, url: string): Promise<boolean> {
  const oldRecord = await getPageRecord(url, OUTPUT_PATH);
  const metrics = await retrieveMetrics(runId);
  if (!metrics) {
    console.error("failed to retrieve results for", url, runId);
    return false;
  }

  const weight = metrics.metrics.contentLength;
  if (weight > REJECT_THRESHOLD) {
    console.log(url, "rejected! Weighs", Math.round(weight / 1024), "kb");
    if (oldRecord) {
      // Wait for the removal so the outcome is known before moving on; a
      // failure is logged but does not abort the run.
      try {
        await removeRecord(url, OUTPUT_PATH);
      } catch {
        console.error("Failed to remove old record of rejected url", url);
      }
    }
    return false;
  }

  // Date part (YYYY-MM-DD) of the current ISO timestamp. Named `today` so it
  // does not shadow the module-level numeric `now`.
  const today = new Date().toISOString().split("T")[0];
  // Share of the HTML in the total weight, in percent. Guard against a zero
  // weight so no NaN/Infinity ends up in the record.
  const ratio = weight > 0
    ? Math.round((metrics.metrics.htmlSize / weight) * 100)
    : 0;

  const record: PageRecord = {
    title: url2title(url),
    // Keep the original creation date when the page is already known.
    date: oldRecord?.date ?? today,
    updated: today,
    weight,
    extra: {
      source: url,
      ratio,
      size: Math.round(weight / 1024),
    },
  };

  const success = await writeRecord(record, url, OUTPUT_PATH);
  if (!success) {
    console.error(url, "record could not be written!");
    return false;
  }
  console.log(url, "successfully updated");
  return true;
}
/**
 * Decides whether `url` needs a fresh analysis and, if so, requests one.
 *
 * A page needs a check when it has no stored record, when the record's
 * `updated` date is missing or cannot be parsed, or when that date is older
 * than RECHECK_THRESHOLD relative to the start-up timestamp `now`.
 *
 * @param url page to check
 * @returns `true` when the stored record is still fresh, `false` when a new
 *   metrics run could not be requested, otherwise the id of the requested run
 */
async function checkPage(url: string) {
  const record = await getPageRecord(url, OUTPUT_PATH);
  const lastUpdated = Date.parse(record?.updated ?? "");
  // Date.parse yields NaN for a missing or malformed date, and every
  // comparison with NaN is false; without the explicit test such a record
  // would be considered up-to-date forever.
  const needsCheck = !record ||
    Number.isNaN(lastUpdated) ||
    now - lastUpdated > RECHECK_THRESHOLD;
  if (!needsCheck) {
    console.log(url, "is up-to-date");
    return true;
  }

  const runId = await requestMetricsRun(url);
  if (!runId) {
    console.error(url, "updating failed!");
    return false;
  }
  console.log(url, "new or outdated, runId is", runId);
  return runId;
}
/**
 * Returns a promise that resolves (with no value) once `duration`
 * milliseconds have elapsed.
 *
 * @param duration delay in milliseconds
 */
function sleep(duration: number): Promise<void> {
  // The explicit <void> lets `resolve` be called without an argument and gives
  // callers a precise return type instead of Promise<unknown>.
  return new Promise<void>((resolve) => {
    setTimeout(() => resolve(), duration);
  });
}
/**
 * Works through the remaining `pages` in batches of PARALLEL_JOBS.
 *
 * For every batch the pages are checked concurrently; for each page that got a
 * new metrics run, the run is polled until it has either failed or completed,
 * and completed runs are written out via `updateRecord`.
 *
 * The returned promise settles only after all batches have been processed, so
 * a caller may await it and will see any error thrown along the way.
 */
async function handleBatch(): Promise<void> {
  // A loop rather than an un-awaited recursive call: later batches stay part
  // of this promise instead of running as detached background work.
  while (pages.length) {
    const batch = pages.splice(0, PARALLEL_JOBS);
    // Start the checks for the whole batch at once and wait for all of them,
    // so no check result (or rejection) is left unobserved.
    const queue = await Promise.all(batch.map((url) => checkPage(url)));

    while (queue.length) {
      const runId = queue.shift();
      // `true`: page is up-to-date, `false`: the run could not be requested.
      if (runId === undefined || typeof runId === "boolean") continue;

      // TODO: handle failures more gracefully
      const { url, status } = await checkStatus(runId);
      if (status === "failed") {
        console.error(url, "YLT analysis failed");
      } else if (status === "complete") {
        console.log(url, "updating record...");
        await updateRecord(runId, url);
      } else {
        // Not done yet: re-queue the run behind the others and wait a bit
        // before polling again.
        queue.push(runId);
        await sleep(1000);
      }
    }
  }
}
// Entry point: start processing the page list. The promise is intentionally
// not awaited here; `void` marks the discard as deliberate.
void handleBatch();