import './index.d.ts'
import { url2title, getPageRecord, writeRecord } from './analyser/toolkit.ts'
import { requestMetricsRun, checkStatus, retrieveMetrics } from './analyser/metrics.ts'

/** Text file listing the pages to analyse, one URL per line (first CLI argument). */
const INPUT_FILE = Deno.args[0] ?? './pages.txt'
/** Location handed to the record helpers (second CLI argument). */
const OUTPUT_PATH = Deno.args[1] ?? './content' // results are written here
/** Maximum age of a record, in milliseconds, before the page is analysed again. */
const RECHECK_THRESHOLD = 60 * 60 * 24 * 7 * 1000 // recheck pages older than 1 week
/** Page weight above which a page is reported as "not allowed in". */
const REJECT_THRESHOLD = 262144 // 256kb

/**
 * Reads INPUT_FILE and returns the lines that look like URLs.
 *
 * Lines that do not start with `http` (blank lines, comments, ...) are
 * dropped. Both LF and CRLF line endings are accepted so that URLs never
 * carry a trailing carriage return.
 *
 * @returns the page URLs, in file order
 */
async function getPageList (): Promise<string[]> {
  const inputContent = await Deno.readTextFile(INPUT_FILE)
  return inputContent.split(/\r?\n/).filter(line => line.startsWith('http'))
}

const now = Date.now()
const pages = await getPageList()

/** Queue of metrics run IDs (not URLs) whose results still have to be collected. */
const pagesToUpdate: string[] = []

// Request a metrics run for every page that is new or stale. The requests run
// concurrently, but we wait for ALL of them before polling starts; otherwise
// the poller could find an empty queue, stop, and miss runs enqueued later.
await Promise.all(pages.map(async (url) => {
  const record = await getPageRecord(url, OUTPUT_PATH)
  const lastUpdated = Date.parse(record?.updated || '')
  // A missing or unparseable `updated` yields NaN; treat it as stale instead
  // of letting the NaN comparison silently mark the page as up-to-date.
  const needsCheck = !record ||
    Number.isNaN(lastUpdated) ||
    (now - lastUpdated) > RECHECK_THRESHOLD

  if (!needsCheck) {
    console.log(url, 'is up-to-date')
    return
  }

  const runId = await requestMetricsRun(url)
  if (runId) {
    console.log(url, 'new or outdated, runId is', runId)
    pagesToUpdate.push(runId)
  }
}))

/**
 * Polls the run at the end of the queue and, once it has finished, writes the
 * updated page record. Reschedules itself every 500 ms until the queue is
 * empty; a run that is neither `failed` nor `complete` stays queued and is
 * checked again on the next tick.
 */
async function updateRecords (): Promise<void> {
  const runId = pagesToUpdate.at(-1)
  if (runId === undefined) return // queue is empty: done, yeah!

  const { url, status } = await checkStatus(runId)

  // TODO: handle failures more gracefully
  if (status === 'failed') {
    pagesToUpdate.pop()
    console.log(url, 'analysis failed')
  } else if (status === 'complete') {
    pagesToUpdate.pop()
    const oldRecord = await getPageRecord(url, OUTPUT_PATH)
    const metrics = await retrieveMetrics(runId)

    if (metrics) {
      // poor man's toISODateString: keep only the YYYY-MM-DD part
      const today = (new Date()).toISOString().split('T')[0]
      const weight = metrics.metrics.contentLength
      // share of the total weight taken by the HTML, as a rounded percentage
      const ratio = Math.round((metrics.metrics.htmlSize / weight) * 100)

      if (weight > REJECT_THRESHOLD) {
        // NOTE(review): the record is still written below even when the page
        // is over the limit; confirm that this is intended.
        console.log(url, 'is not allowed in, weighs', Math.round(weight / 1024), 'kb')
      }

      const record: PageRecord = {
        title: url2title(url),
        // keep the original date for pages that already have a record
        date: oldRecord === null ? today : oldRecord.date,
        updated: today,
        weight,
        extra: {
          source: url,
          ratio,
          size: Math.round(weight / 1024)
        }
      }

      // TODO: check success
      await writeRecord(record, url, OUTPUT_PATH)
      console.log(url, 'updated')
    } else {
      console.error('failed to retrieve results for', url, runId)
    }
  }

  setTimeout(() => updateRecords(), 500) // run again until the list is empty
}

// All runs have been requested at this point; wait a second, then start polling.
setTimeout(() => updateRecords(), 1000)