UNPKG

wayback.js

Version:

The Wayback Availability JSON API in JavaScript.

199 lines (163 loc) 5.93 kB
/**
 * Small client for the Wayback Machine availability and save endpoints.
 *
 * Snapshots returned by the API are kept in an in-memory cache keyed by URL.
 * A background timer periodically removes expired cache entries; call
 * {@link Wayback#destroy} to stop that timer and empty the cache.
 */
class Wayback {
  #baseApiUrl = 'https://archive.org/wayback/available';
  #baseSaveUrl = 'https://web.archive.org/save/_embed/';
  #connectionTimeoutMs;
  #headers;
  #cache = new Map();
  #cacheTTL;
  #gcInterval;
  #cleanupInterval;

  // Stops the cleanup timer of an instance that was garbage collected
  // without destroy() having been called.
  static #registry = new FinalizationRegistry((cleanupInterval) => {
    clearInterval(cleanupInterval);
  });

  /**
   * @param {object} [options]
   * @param {number} [options.connectionTimeoutMs=9000] - Milliseconds before a request is aborted.
   * @param {number|string} [options.cacheTTL=86400] - Cache lifetime in seconds; values below
   *   one hour are raised to one hour. A falsy or non-numeric value disables the cache.
   * @param {number|string} [options.gcInterval=3600] - Seconds between cache sweeps; values
   *   below one hour (or non-numeric values) become one hour.
   * @param {object} [options.headers] - Headers sent with every request.
   */
  constructor({ connectionTimeoutMs = 9000, cacheTTL = 86400, gcInterval = 3600, headers = undefined } = {}) {
    const minIntervalMs = 3600 * 1000;

    this.#connectionTimeoutMs = connectionTimeoutMs;
    this.#headers = headers;

    const ttlMs = Number.parseInt(cacheTTL, 10) * 1000;
    this.#cacheTTL = (cacheTTL && !Number.isNaN(ttlMs)) ? Math.max(ttlMs, minIntervalMs) : 0;

    // A NaN delay would make the timer fire almost continuously, so fall
    // back to the minimum interval when gcInterval cannot be parsed.
    const gcMs = Number.parseInt(gcInterval, 10) * 1000;
    this.#gcInterval = Number.isNaN(gcMs) ? minIntervalMs : Math.max(gcMs, minIntervalMs);

    // The timer callback must not close over `this`: an active timer is a GC
    // root, so a strong reference would keep the instance alive forever and
    // the finalization callback above could never run.
    const selfRef = new WeakRef(this);
    const timer = setInterval(() => {
      const instance = selfRef.deref();
      if (instance) {
        instance.#cleanupCache();
      } else {
        clearInterval(timer);
      }
    }, this.#gcInterval);
    // Node.js timers expose unref(); browser timers are plain numbers.
    timer?.unref?.();
    this.#cleanupInterval = timer;

    Wayback.#registry.register(this, this.#cleanupInterval, this);
  }

  /**
   * Stops the cache cleanup timer, empties the cache and removes the
   * instance from the finalization registry.
   */
  destroy() {
    clearInterval(this.#cleanupInterval);
    this.#cache.clear();
    Wayback.#registry.unregister(this);
  }

  /**
   * Looks up the closest archived snapshot of a URL.
   *
   * @param {string} url - URL to look up.
   * @param {boolean} [resolveRedirects=true] - Follow redirects first and query the final URL.
   * @returns {Promise<{url: string, timestamp: string}|number|null>} The snapshot when one
   *   is reported; otherwise the HTTP status of the API response; `null` when the request
   *   failed or an error was thrown.
   */
  async isArchived(url, resolveRedirects = true) {
    const cachedEntry = this.#getCache(url);
    if (cachedEntry) {
      return cachedEntry;
    }

    try {
      const targetUrl = resolveRedirects ? await this.getFinalRedirectUrl(url) : url;
      const query = `timestamp=${Wayback.currentTimestamp()}&url=${encodeURIComponent(targetUrl.replace(/^https?:\/\//, ''))}`;
      const response = await this.#fetch(`${this.#baseApiUrl}?${query}`, { method: 'GET' });

      if (!response) {
        return null;
      }

      if (response.ok) {
        const data = await response.json();
        const closest = data.archived_snapshots?.closest || null;

        if (closest?.url && closest?.timestamp) {
          const snapshot = { url: closest.url, timestamp: closest.timestamp };
          // Store under the URL the caller passed in, because that is the key
          // the lookup above uses; also store under the resolved URL so a
          // direct query for the redirect target is served from the cache.
          this.#setCache(url, snapshot);
          if (targetUrl !== url) {
            this.#setCache(targetUrl, snapshot);
          }
          return snapshot;
        }
      }

      return response.status;
    } catch (e) {
      console.error(e);
    }

    return null;
  }

  /**
   * Requests a new capture of a URL through the save endpoint.
   *
   * @param {string} url - URL to archive.
   * @returns {Promise<{url: string, timestamp: string}|number|null>} The snapshot built from
   *   the `Location` response header when present; otherwise the HTTP status; `null` when
   *   the request failed or an error was thrown.
   */
  async saveUrl(url) {
    try {
      const response = await this.#fetch(`${this.#baseSaveUrl}${url}`, { method: 'GET', redirect: 'manual' });

      if (!response) {
        return null;
      }

      if (response.ok || response.status === 301 || response.status === 302) {
        const archiveUrl = response.headers.get('Location');

        if (archiveUrl) {
          // Use the 14-digit timestamp embedded in the archive URL when there is one.
          const timestamp = archiveUrl.match(/(\d{14})/);
          const formattedTimestamp = timestamp ? timestamp[0] : Wayback.currentTimestamp();
          const snapshot = { url: archiveUrl, timestamp: formattedTimestamp };
          this.#setCache(url, snapshot);
          return snapshot;
        }
      }

      return response.status;
    } catch (e) {
      console.error(e);
    }

    return null;
  }

  /**
   * Returns the existing snapshot of a URL when it is recent enough,
   * otherwise requests a new capture.
   *
   * @param {string} url - URL to check.
   * @param {number} [maxAgeDays=30] - Maximum accepted snapshot age in days.
   * @param {boolean} [resolveRedirects=true] - Passed through to {@link Wayback#isArchived}.
   * @returns {Promise<{url: string, timestamp: string}|number|null>} The existing snapshot,
   *   or the result of {@link Wayback#saveUrl}.
   */
  async saveOutdatedUrl(url, maxAgeDays = 30, resolveRedirects = true) {
    const snapshot = await this.isArchived(url, resolveRedirects);

    if (!snapshot || !snapshot.timestamp) {
      return this.saveUrl(url);
    }

    try {
      const { timestamp } = snapshot;
      const lastArchived = new Date(
        `${timestamp.substring(0, 4)}-${timestamp.substring(4, 6)}-${timestamp.substring(6, 8)}`
      );
      const diffDays = (Date.now() - lastArchived.getTime()) / (1000 * 60 * 60 * 24);

      // An unparseable timestamp yields NaN; the age is then unknown, so
      // archive again instead of treating the snapshot as fresh.
      if (Number.isNaN(diffDays) || diffDays > maxAgeDays) {
        return this.saveUrl(url);
      }

      return snapshot;
    } catch (e) {
      console.error(e);
    }

    return this.saveUrl(url);
  }

  /**
   * Follows redirects with a HEAD request and reports where they end.
   *
   * @param {string} url - URL to resolve.
   * @returns {Promise<string>} The URL of the final response, or the input URL when the
   *   request failed or reported no URL.
   */
  async getFinalRedirectUrl(url) {
    try {
      const response = await this.#fetch(url, { method: 'HEAD', redirect: 'follow' });
      return response?.url || url;
    } catch (e) {
      console.error(e);
    }

    return url;
  }

  /**
   * @returns {string} The time one second ago as a 14-digit `YYYYMMDDhhmmss` string (UTC).
   */
  static currentTimestamp() {
    return new Date(Date.now() - 1000).toISOString().replace(/[-:T]/g, '').split('.')[0];
  }

  /**
   * Runs `fetch` with the configured headers and an abort timeout.
   *
   * @param {string} url - Request URL.
   * @param {object} [options] - Extra `fetch` options; `headers` and `signal` are overridden.
   * @returns {Promise<Response|null>} The response, or `null` when the request was aborted
   *   or failed with a `TypeError`. Other errors are rethrown.
   */
  async #fetch(url, options = {}) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), this.#connectionTimeoutMs);

    try {
      return await fetch(url, { ...options, headers: this.#headers, signal: controller.signal });
    } catch (e) {
      if (e.name !== 'AbortError' && e.name !== 'TypeError') {
        throw e;
      }
      console.warn(`${url} - ${e.message}`);
      return null;
    } finally {
      clearTimeout(timeoutId);
    }
  }

  /**
   * @param {string} url - Cache key.
   * @returns {object|undefined} The cached snapshot, or `undefined` when the cache is
   *   disabled, the key is absent, or the entry expired (expired entries are removed).
   */
  #getCache(url) {
    if (!this.#cacheTTL) {
      return undefined;
    }

    const entry = this.#cache.get(url);
    if (!entry) {
      return undefined;
    }

    if ((Date.now() - entry.cacheTimestamp) >= this.#cacheTTL) {
      this.#cache.delete(url);
      return undefined;
    }

    return entry.data;
  }

  /**
   * Stores a snapshot under the given key unless the cache is disabled.
   *
   * @param {string} url - Cache key.
   * @param {object} snapshot - Snapshot to store.
   */
  #setCache(url, snapshot) {
    if (this.#cacheTTL) {
      this.#cache.set(url, { data: snapshot, cacheTimestamp: Date.now() });
    }
  }

  /** Removes every cache entry whose age has reached the configured TTL. */
  #cleanupCache() {
    if (!this.#cacheTTL) {
      return;
    }

    const now = Date.now();
    for (const [url, entry] of this.#cache) {
      if (now - entry.cacheTimestamp >= this.#cacheTTL) {
        this.#cache.delete(url);
      }
    }
  }
}

export default Wayback;
export { Wayback };