You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

86 lines
2.6 KiB
TypeScript

import './index.d.ts'
import { url2title, getPageRecord, writeRecord } from './analyser/toolkit.ts'
import { requestMetricsRun, checkStatus, retrieveMetrics } from './analyser/metrics.ts'
// Text file listing the pages to analyse (first CLI argument).
const INPUT_FILE = Deno.args[0] ?? './pages.txt'
// Location the page records are read from and written to (second CLI argument).
const OUTPUT_PATH = Deno.args[1] ?? './content'
// Age in milliseconds (one week) beyond which a page is analysed again.
const RECHECK_THRESHOLD = 7 * 24 * 60 * 60 * 1000
// Page weight limit: 256 KiB, expressed in bytes.
const REJECT_THRESHOLD = 256 * 1024
/**
 * Reads the list of page URLs from `INPUT_FILE`.
 *
 * The file is treated as one entry per line. Both LF and CRLF line endings are
 * accepted and surrounding whitespace is stripped, so a file saved on Windows
 * does not produce URLs with a trailing carriage return. Only entries that
 * start with `http` are kept; blank lines and any other text are ignored.
 *
 * @returns the URLs in the order they appear in the file
 */
async function getPageList (): Promise<string[]> {
  const inputContent = await Deno.readTextFile(INPUT_FILE)
  return inputContent
    .split(/\r?\n/)
    .map(line => line.trim())
    .filter(line => line.startsWith('http'))
}
// Reference instant (ms since the epoch) every record's age is measured against.
const now = Date.now()
const pages = await getPageList()
// Run IDs of requested analyses whose results still have to be collected by updateRecords().
const pagesToUpdate: string[] = []
// forEach() discards the promises returned by an async callback, so the script
// would carry on before any run had been requested. Awaiting Promise.all makes
// sure every outdated page has been queued before polling starts, while the
// requests themselves still run concurrently.
await Promise.all(pages.map(async (url) => {
  const record = await getPageRecord(url, OUTPUT_PATH)
  const lastUpdated = Date.parse(record?.updated ?? '')
  // A missing or unparsable `updated` value parses to NaN, and every comparison
  // with NaN is false, so such records must be flagged explicitly.
  const needsCheck = !record ||
    Number.isNaN(lastUpdated) ||
    (now - lastUpdated) > RECHECK_THRESHOLD
  if (!needsCheck) {
    console.log(url, 'is up-to-date')
    return
  }
  const runId = await requestMetricsRun(url)
  if (runId) {
    console.log(url, 'new or outdated, runId is', runId)
    pagesToUpdate.push(runId)
  }
}))
/**
 * Polls the analysis run at the end of `pagesToUpdate` and, once it has
 * finished, stores its metrics as the page's record.
 *
 * Depending on the status reported by `checkStatus`:
 * - `failed`: the run is removed from the queue and the failure is logged;
 * - `complete`: the run is removed, its metrics are retrieved and a record is
 *   written with `writeRecord` (the original `date` of an existing record is
 *   kept, `updated` is set to today);
 * - anything else: the run stays queued and is checked again on the next round.
 *
 * Every invocation that finds a non-empty queue schedules another invocation
 * 500 ms later; polling stops with the first invocation that finds it empty.
 */
async function updateRecords (): Promise<void> {
  if (pagesToUpdate.length === 0) return // done, yeah!
  const runId = pagesToUpdate.at(-1) || '' // make tsc happy
  const { url, status } = await checkStatus(runId)
  // The queue may grow while the awaits in this function are pending, so the
  // checked run is removed by value: a plain pop() could discard a newer run ID
  // that has never been checked.
  const dequeueRun = (): void => {
    const index = pagesToUpdate.lastIndexOf(runId)
    if (index !== -1) pagesToUpdate.splice(index, 1)
  }
  // TODO: handle failures more gracefully
  if (status === 'failed') {
    dequeueRun()
    console.log(url, 'analysis failed')
  } else if (status === 'complete') {
    dequeueRun()
    const oldRecord = await getPageRecord(url, OUTPUT_PATH)
    const metrics = await retrieveMetrics(runId)
    if (metrics) {
      // Date part (YYYY-MM-DD) of the current ISO timestamp. Named `today` so it
      // does not shadow the numeric module-level `now`.
      const today = (new Date()).toISOString().split('T')[0]
      const weight = metrics.metrics.contentLength
      // HTML size as a percentage of the total weight.
      const ratio = Math.round((metrics.metrics.htmlSize / weight) * 100)
      if (weight > REJECT_THRESHOLD) {
        // Only reported here; the record is still written below.
        console.log(url, 'is not allowed in, weighs', Math.round(weight / 1024), 'kb')
      }
      const record: PageRecord = {
        title: url2title(url),
        date: oldRecord === null ? today : oldRecord.date,
        updated: today,
        weight,
        extra: {
          source: url,
          ratio,
          size: Math.round(weight / 1024)
        }
      }
      // TODO: check success
      await writeRecord(record, url, OUTPUT_PATH)
      console.log(url, 'updated')
    } else {
      console.error('failed to retrieve results for', url, runId)
    }
  }
  setTimeout(() => updateRecords(), 500) // run again until the list is empty
}
// Start the polling loop one second after the script starts; the returned
// promise is deliberately not awaited.
setTimeout(() => { void updateRecords() }, 1000)