UNPKG

@akira108sys/html-rewriter-readability

Version:

A library to extract readable content with the Mozilla/Readability algorithm using Cloudflare HTMLRewriter.

97 lines (96 loc) 4.08 kB
import { MarkdownConverter } from './markdown-converter';
import { Phase1Handler } from './phase1-handler';
import { calculateScoresAndFindBestCandidate } from './phase2-scorer';
import { getParentId } from './utils';

/**
 * Extracts the readable part of an HTML response and renders it as Markdown.
 *
 * Processing happens in three steps:
 *   1. Phase 1 streams the response through an HTMLRewriter whose handlers
 *      (a Phase1Handler) fill `elementStore` and `metadataStore`.
 *   2. Phase 2 scores the stored elements and picks a top candidate plus the
 *      list of element ids to keep.
 *   3. A MarkdownConverter renders the kept elements starting from a root id.
 *
 * All intermediate state lives on the instance and is reset at the start of
 * every Phase 1 run.
 */
export class HtmlRewriterReadability {
    /**
     * @param {string|URL} baseURI - Base URL of the document; handed to the
     *   Markdown converter. Strings are parsed with `new URL()`.
     * @param {object} [options] - Overrides merged on top of
     *   `HtmlRewriterReadability.defaultOptions`.
     * @throws {Error} If `baseURI` is a string that is not a valid URL (the
     *   original parse error is attached as `cause`), or if it is neither a
     *   string nor a URL instance.
     */
    constructor(baseURI, options) {
        if (typeof baseURI === 'string') {
            try {
                this.baseURI = new URL(baseURI);
            } catch (e) {
                console.error("Invalid baseURI provided:", baseURI, e);
                // Keep the underlying parse failure reachable for callers.
                throw new Error(`Invalid baseURI provided: ${baseURI}`, { cause: e });
            }
        } else if (baseURI instanceof URL) {
            this.baseURI = baseURI;
        } else {
            throw new Error("baseURI must be a string or URL object.");
        }

        this.options = { ...HtmlRewriterReadability.defaultOptions, ...options };
        this.elementStore = new Map();
        this.elementsToKeepIdsSet = new Set();
        this.metadataStore = {};
        this.elementCounter = 0;
    }

    /**
     * Clears everything collected by a previous run.
     *
     * `metadataStore` is replaced rather than emptied, so a metadata object
     * already returned from `process()` is not touched by later runs.
     */
    resetState() {
        this.elementStore.clear();
        this.elementsToKeepIdsSet.clear();
        this.metadataStore = {};
        this.elementCounter = 0;
    }

    /**
     * Phase 1: parses the HTML and fills `elementStore` / `metadataStore`.
     *
     * @param {Response} response - The HTML response. A clone is consumed, so
     *   the body of the response passed in is not read here.
     * @returns {Promise<void>}
     */
    async runPhase1(response) {
        if (this.options.debug) {
            console.log("Phase 1: Parsing HTML and gathering initial data...");
        }
        this.resetState();

        const phase1Handler = new Phase1Handler(
            this.elementStore,
            this.metadataStore,
            // Id generator: pre-increment, so the first id handed out is 1.
            () => ++this.elementCounter,
            this.options.debug ?? false,
            this.options.maxElemsToParse,
        );
        const rewriter = new HTMLRewriter()
            .on("*", phase1Handler)
            .onDocument(phase1Handler);

        const responseClone = response.clone();
        // The transformed body is read to completion and its text discarded;
        // only the data the handlers collected along the way is used.
        await rewriter.transform(responseClone).text();

        if (this.options.debug) {
            console.log("Phase 1 Completed. Element count:", this.elementStore.size);
            console.log("Collected Metadata:", this.metadataStore);
        }
    }

    /**
     * Phase 2: scores the elements gathered in Phase 1.
     *
     * @returns {{topCandidateId: *, elementsToKeepIds: Array}} The scorer's
     *   result, returned unchanged.
     */
    runPhase2() {
        if (this.options.debug) {
            console.log("Phase 2: Scoring elements...");
        }

        const result = calculateScoresAndFindBestCandidate(this.elementStore, {
            debug: this.options.debug ?? false,
            nbTopCandidates: this.options.nbTopCandidates,
            charThreshold: this.options.charThreshold,
            allowedVideoRegex: this.options.allowedVideoRegex,
            linkDensityModifier: this.options.linkDensityModifier,
        });

        if (this.options.debug) {
            console.log(`Phase 2 Completed. Top Candidate: ${result.topCandidateId}. Elements to keep: ${result.elementsToKeepIds.length}`);
        }
        return result;
    }

    /**
     * Renders the kept elements as Markdown, starting at `rootElementId`.
     *
     * @param {*} rootElementId - Id of the element to start from.
     * @returns {string} The Markdown, or "" when no root id is given
     *   (`null` or `undefined`).
     */
    convertToMarkdown(rootElementId) {
        // `== null` on purpose: treat a missing (undefined) id like null.
        if (rootElementId == null) {
            if (this.options.debug) {
                console.error("Cannot generate Markdown: Root element ID is null.");
            }
            return "";
        }

        const converter = new MarkdownConverter(
            this.elementStore,
            this.elementsToKeepIdsSet,
            this.baseURI,
            { debug: this.options.debug },
        );
        return converter.convert(rootElementId);
    }

    /**
     * Runs the full pipeline on a response.
     *
     * @param {Response} response - The HTML response to process.
     * @returns {Promise<{markdown: string, metadata: object}|null>} The
     *   Markdown and collected metadata, or `null` when Phase 2 yields no top
     *   candidate or no elements to keep.
     */
    async process(response) {
        await this.runPhase1(response);

        const { topCandidateId, elementsToKeepIds } = this.runPhase2();
        if (!topCandidateId || elementsToKeepIds.length === 0) {
            if (this.options.debug) {
                console.warn("Failed to extract readable content.");
            }
            return null;
        }

        this.elementsToKeepIdsSet = new Set(elementsToKeepIds);

        // Use parent as starting point; fall back to the candidate itself
        // when getParentId yields null/undefined.
        const rootBuildId = getParentId(topCandidateId, this.elementStore) ?? topCandidateId;
        const markdown = this.convertToMarkdown(rootBuildId);

        return { markdown, metadata: this.metadataStore };
    }
}

/**
 * Option defaults applied before user-supplied options.
 *
 * `classesToPreserve` and `keepClasses` are declared here but not read
 * anywhere in this module.
 */
HtmlRewriterReadability.defaultOptions = {
    debug: false,
    maxElemsToParse: 0,
    nbTopCandidates: 5,
    charThreshold: 500,
    classesToPreserve: [],
    keepClasses: false,
    allowedVideoRegex: /(www\.youtube\.com|player\.vimeo\.com)/i,
    linkDensityModifier: 0,
};