UNPKG

web-archive

Version:

A utility that uses archive.is / archive.today to archive a webpage from a given URL.

70 lines (59 loc) 1.98 kB
const UserAgent = require('user-agents');
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');

puppeteer.use(StealthPlugin());

const ARCHIVE_URL = "http://archive.vn";
const DEFAULT_TIMEOUT = 60 * 1000; // One Minute default timeout

// Request URL containing "/wip/<something>": the page is currently being archived.
const WIP_URL_PATTERN = /\/wip\/(.+)/;
// Request URL ending in a five-character path segment: treated as an already archived page.
const ARCHIVED_URL_PATTERN = /\/\/.+\.[^\/]+\/(.{5})\/?$/;

/**
 * Normalizes a rejection reason so callers always receive an Error instance.
 * Existing Error objects are returned untouched to keep their stack and type.
 *
 * @param {*} reason - Value a promise was rejected with.
 * @returns {Error} The original error, or a new Error built from the value.
 */
function toError(reason) {
    return reason instanceof Error ? reason : new Error(String(reason));
}

/**
 * Opens the archive site, fills in the submission form and submits it.
 *
 * @param {object} page - Puppeteer page used to drive the site.
 * @param {string} req_url - URL typed into the `input#url` field.
 * @param {string} archiveSite - Address of the archive site to open.
 * @param {boolean} forceRefresh - When true, a hidden `anyway=1` input is
 *   prepended to the `form#submiturl` form before submitting.
 * @returns {Promise<void>} Settles once the navigation triggered by the submit
 *   has finished (or rejects if any step fails).
 */
async function submitUrl(page, req_url, archiveSite, forceRefresh) {
    await page.goto(archiveSite, { waitUntil: 'networkidle2' });

    if (forceRefresh) {
        await page.evaluate(() => {
            const input = document.createElement('input');
            input.type = "hidden";
            input.id = input.name = 'anyway';
            input.value = '1';
            document.querySelector('form#submiturl').prepend(input);
        });
    }

    await page.type('input#url', req_url);

    // Start waiting for the navigation BEFORE clicking, so a fast navigation
    // cannot finish before the wait has been registered.
    await Promise.all([
        page.waitForNavigation({ waitUntil: 'networkidle2' }),
        page.click('form#submiturl input[type="submit"]'),
    ]);
}

/**
 * Submits a URL to the archive site with a headless browser and resolves with
 * the URL of the resulting archive page.
 *
 * The result is picked up from the requests the page issues: a request to a
 * "/wip/" URL (returned with "/wip" removed) or a request whose URL matches
 * the already-archived pattern.
 *
 * @param {string} req_url - URL of the page to archive.
 * @param {object} [options]
 * @param {boolean} [options.forceRefresh=false] - Submit the form with the
 *   hidden `anyway=1` field.
 * @param {string} [options.userAgent] - User-Agent header for the page;
 *   defaults to a generated one from `user-agents`.
 * @param {string} [options.archiveSite=ARCHIVE_URL] - Archive site to open.
 * @param {number} [options.timeout=DEFAULT_TIMEOUT] - Milliseconds to wait for
 *   a matching request before giving up.
 * @returns {Promise<string>} URL taken from the matching request.
 * @throws {Error} 'Request Timed Out' when no matching request is seen in
 *   time, or the underlying error when driving the browser fails.
 */
async function save(req_url, options={}) {
    const {
        forceRefresh = false,
        userAgent = new UserAgent().toString(),
        archiveSite = ARCHIVE_URL,
        timeout = DEFAULT_TIMEOUT
    } = options;

    const browser = await puppeteer.launch({ headless: true });
    let timer;

    try {
        const page = await browser.newPage();
        await page.setUserAgent(userAgent); // Set a realistic User-Agent
        await page.setRequestInterception(true);

        return await new Promise((resolve, reject) => {
            timer = setTimeout(() => reject(new Error('Request Timed Out')), timeout);

            page.on('request', request => {
                // continue() returns a promise that can reject once the browser
                // is shutting down after we already have our answer; that
                // rejection carries no useful information, so it is ignored
                // instead of surfacing as an unhandled rejection.
                request.continue().catch(() => {});

                const url = request.url();
                if (WIP_URL_PATTERN.test(url)) {
                    // If WIP, URL is archiving
                    resolve(url.replace('/wip', ''));
                } else if (ARCHIVED_URL_PATTERN.test(url)) {
                    // Test if URL already archived
                    resolve(url);
                }
            });

            // A failure while driving the page rejects the result; a clean
            // finish leaves the outcome to the request handler or the timer.
            submitUrl(page, req_url, archiveSite, forceRefresh).catch(reject);
        });
    } catch (err) {
        // Throw if unable to archive
        throw toError(err);
    } finally {
        // Always release the timer and the browser, whatever the outcome.
        clearTimeout(timer);
        await browser.close();
    }
}

module.exports = {save};