UNPKG

@akira108sys/html-rewriter-readability

Version:

A library that extracts readable content using the Mozilla Readability algorithm on top of Cloudflare's HTMLRewriter.

216 lines (215 loc) 10.9 kB
// phase1-handler.ts
import { VOID_ELEMENTS } from './constants'; // Import from constants.ts
import { extractMetadataFromElement, unescapeHtmlEntities } from './utils'; // Import from utils.ts

/** Inline `display: none`, tolerant of spacing and letter case (`display:none`, `DISPLAY : NONE`). */
const DISPLAY_NONE_PATTERN = /display\s*:\s*none/i;

/** Inline `visibility: hidden`, tolerant of spacing and letter case. */
const VISIBILITY_HIDDEN_PATTERN = /visibility\s*:\s*hidden/i;

/**
 * Tags that are skipped by the element handler AND whose direct text must not be
 * credited to the enclosing element (script source, CSS, fallback markup, ...).
 */
const TEXT_SUPPRESSED_TAGS = new Set(["SCRIPT", "STYLE", "NOSCRIPT", "IFRAME"]);

/**
 * First-pass streaming handler: records one entry per element in `elementStore`
 * (tag, attributes, parent link, accumulated text) and collects document metadata
 * (`title`, `lang`, `dir`, plus whatever `extractMetadataFromElement` yields for
 * `<meta>` tags) into `metadataStore`.
 *
 * The object exposes `element`, `text` and `end` callbacks.
 * NOTE(review): presumably it is registered with Cloudflare HTMLRewriter both as an
 * element handler and as a document handler — confirm against the caller.
 */
export class Phase1Handler {
    /**
     * Initialize Phase1Handler.
     * @param elementStore Map to store element information (managed externally)
     * @param metadataStore Object to store metadata (managed externally)
     * @param idGenerator Function to generate element IDs (managed externally)
     * @param debugEnabled When truthy, verbose progress logging is written to the console
     * @param maxElemsToParse Maximum number of start tags to record; 0 or omitted means no limit
     */
    constructor(elementStore, metadataStore, idGenerator, debugEnabled, maxElemsToParse) {
        // Stack of currently nested (open, recorded) element IDs.
        this.elementStack = [];
        // Number of start tags processed so far (skipped tags are counted too).
        this.elementCount = 0;
        // Key: elementId, Value: last non-whitespace chunk appended to that element.
        this.lastTextChunkMap = new Map();
        // For every open skipped element: the length of `elementStack` at the moment
        // it started. Text seen while the stack is at that same length belongs
        // directly to the skipped element and is discarded.
        this.skippedElementDepths = [];
        this.elementStore = elementStore;
        this.metadataStore = metadataStore;
        this.generateElementId = idGenerator;
        this.debugEnabled = debugEnabled;
        this.maxElemsToParse = maxElemsToParse ?? 0;
    }

    /** Get the ID of the currently processing element (top of stack), or null when the stack is empty. */
    getCurrentElementIdFromStack() {
        return this.elementStack.length > 0
            ? this.elementStack[this.elementStack.length - 1]
            : null;
    }

    /**
     * Forget the last chunk recorded for the element currently on top of the stack.
     * Called at every tag boundary so that the duplicate guard in `text()` only ever
     * compares chunks that are adjacent within the same uninterrupted text run.
     */
    resetDuplicateGuardForCurrentElement() {
        const currentId = this.getCurrentElementIdFromStack();
        if (currentId !== null) {
            this.lastTextChunkMap.delete(currentId);
        }
    }

    /**
     * Start discarding text that sits directly inside a skipped element until its
     * end tag is reached.
     * @param element The skipped element (must support `onEndTag`)
     * @param tagName Upper-cased tag name, used for logging only
     */
    trackSkippedElement(element, tagName) {
        this.skippedElementDepths.push(this.elementStack.length);
        try {
            element.onEndTag(() => {
                this.skippedElementDepths.pop();
                // Text following the skipped element starts a new run in the parent.
                this.resetDuplicateGuardForCurrentElement();
            });
        }
        catch (error) {
            // Without an end-tag callback the marker could never be removed and all
            // later text at this depth would be lost, so undo it immediately.
            this.skippedElementDepths.pop();
            console.warn(`Phase1: Failed to register onEndTag for skipped <${tagName}>:`, error);
        }
    }

    /** True when the next text chunk sits directly inside an open skipped element. */
    isTextInsideSkippedElement() {
        const openCount = this.skippedElementDepths.length;
        return openCount > 0
            && this.skippedElementDepths[openCount - 1] === this.elementStack.length;
    }

    /**
     * Process document end: report unclosed elements and release per-document state.
     * @param end Document-end token supplied by the parser (unused)
     */
    end(end) {
        if (this.debugEnabled)
            console.log("Phase 1: HTML parsing finished (End Handler).");
        // Check for unclosed elements if stack is not empty
        if (this.elementStack.length > 0) {
            console.warn(`Phase 1 End: Element stack is not empty: [${this.elementStack.join(', ')}]`);
        }
        // Clear per-document bookkeeping
        this.lastTextChunkMap.clear();
        this.skippedElementDepths.length = 0;
        if (this.debugEnabled)
            console.log("Phase 1: lastTextChunkMap cleared.");
    }

    // --- Element Handler ---
    /**
     * Process element start tag: record the element in `elementStore`, collect
     * metadata, promote lazy-load image attributes, and register an end-tag
     * callback that finalizes the element's text.
     * @param element Element token supplied by the parser
     */
    element(element) {
        // A start tag interrupts the parent's text run, so the text that follows it
        // must not be compared with the chunk that preceded it.
        this.resetDuplicateGuardForCurrentElement();

        const tagName = element.tagName.toUpperCase();
        const suppressesText = TEXT_SUPPRESSED_TAGS.has(tagName);

        if (this.maxElemsToParse > 0 && this.elementCount >= this.maxElemsToParse) {
            if (this.debugEnabled) {
                console.log(`[MAX_ELEMS] Reached max elements to parse (${this.maxElemsToParse}). Stopping.`);
            }
            // Even past the limit, script/style source must not leak into ancestors.
            if (suppressesText)
                this.trackSkippedElement(element, tagName);
            return;
        }
        this.elementCount++;
        if (this.debugEnabled)
            console.log(`Phase1: [START] <${tagName}>`);

        // --- Skip unnecessary elements ---
        if (suppressesText || (tagName === 'LINK' && element.getAttribute('rel') === 'stylesheet')) {
            if (this.debugEnabled)
                console.log(`Phase1: [SKIP] <${tagName}>`);
            // The element is not recorded, but its direct text has to be discarded
            // rather than appended to whatever element is on top of the stack.
            if (suppressesText)
                this.trackSkippedElement(element, tagName);
            return;
        }

        // --- Prepare and store element information ---
        const elementId = this.generateElementId();
        const parentId = this.getCurrentElementIdFromStack(); // Parent is top of stack
        const attributes = {};
        for (const [key, value] of element.attributes) {
            attributes[key.toLowerCase()] = value;
        }
        const inlineStyle = attributes.style;
        const hiddenByInlineStyle = typeof inlineStyle === 'string'
            && (DISPLAY_NONE_PATTERN.test(inlineStyle) || VISIBILITY_HIDDEN_PATTERN.test(inlineStyle));
        const isVisibleBasedOnAttrs = element.getAttribute("hidden") === null
            && !hiddenByInlineStyle
            && element.getAttribute("aria-hidden") !== "true";
        const role = element.getAttribute("role");

        // Store information in elementStore
        this.elementStore.set(elementId, {
            id: elementId,
            parentId: parentId,
            tagName: tagName,
            attributes: attributes,
            textChunks: [], // Chunks collected by text(); joined on end tag
            finalTextContent: "", // Filled in when the end tag is reached
            isVisibleBasedOnAttrs: isVisibleBasedOnAttrs,
            role: role ?? null,
            isDataTableLikely: (tagName === 'TABLE' && attributes.role !== 'presentation' && attributes.datatable !== '0'),
            isCodeBlock: (tagName === 'PRE'),
        });
        // Defensive: make sure no stale entry exists should an ID ever be reused.
        this.lastTextChunkMap.delete(elementId);

        // --- Process specific tags (attribute changes and metadata collection) ---
        if (tagName === 'IMG' || tagName === 'PICTURE' || tagName === 'FIGURE') {
            // Promote lazy-load attributes to the real ones on the output element.
            // NOTE(review): the `attributes` snapshot stored above was taken before
            // this rewrite, so the stored record still carries data-src/data-srcset
            // and no src/srcset — confirm that later phases expect that.
            const src = element.getAttribute('src');
            const srcset = element.getAttribute('srcset');
            const dataSrc = element.getAttribute('data-src');
            const dataSrcset = element.getAttribute('data-srcset');
            if (!src && dataSrc) {
                element.setAttribute('src', dataSrc);
                element.removeAttribute('data-src');
            }
            if (!srcset && dataSrcset) {
                element.setAttribute('srcset', dataSrcset);
                element.removeAttribute('data-srcset');
            }
        }
        else if (tagName === 'META') {
            // First value wins: an already populated metadata key is never overwritten.
            const extractedMeta = extractMetadataFromElement(element);
            for (const key in extractedMeta) {
                if (!this.metadataStore[key]) {
                    this.metadataStore[key] = unescapeHtmlEntities(extractedMeta[key]);
                }
            }
        }
        else if (tagName === 'HTML') {
            this.metadataStore.lang = element.getAttribute('lang') ?? undefined;
            this.metadataStore.dir = element.getAttribute('dir') ?? undefined;
        }

        // --- Register stack and onEndTag (except for void elements) ---
        if (!VOID_ELEMENTS.has(tagName)) {
            // Push current element ID to stack
            this.elementStack.push(elementId);
            try {
                // Register end tag handler
                element.onEndTag(() => {
                    if (this.elementStack.length === 0) {
                        console.error(`Phase1: EndTag Error: Stack empty when processing </${tagName}>`);
                        return;
                    }
                    const finishedElementId = this.elementStack.pop();
                    if (finishedElementId === undefined)
                        return;
                    // Delete from Map when finished
                    this.lastTextChunkMap.delete(finishedElementId);
                    // Text after this end tag starts a new run in the parent.
                    this.resetDuplicateGuardForCurrentElement();
                    const info = this.elementStore.get(finishedElementId);
                    if (info) {
                        // Combine text chunks and store in finalTextContent
                        info.finalTextContent = (info.textChunks ?? []).join('');
                        info.textChunks = undefined; // Remove unnecessary chunk array
                        if (this.debugEnabled)
                            console.log(`Phase1: EndTag </${tagName}>#${finishedElementId}. Final text: "${info.finalTextContent.substring(0, 50)}..."`);
                        if (info.tagName === 'TITLE' && !this.metadataStore.title) {
                            this.metadataStore.title = unescapeHtmlEntities(info.finalTextContent.trim());
                        }
                    }
                    else {
                        console.error(`Phase1: EndTag Error: ElementInfo not found for ID ${finishedElementId} (</${tagName}>)`);
                    }
                });
                if (this.debugEnabled)
                    console.log(`Phase1: [END] <${tagName}>#${elementId}, onEndTag registered`);
            }
            catch (error) {
                console.warn(`Phase1: Failed to register onEndTag for <${tagName}>#${elementId}:`, error);
                // If registration fails, remove from stack and Map
                if (this.elementStack.length > 0 && this.elementStack[this.elementStack.length - 1] === elementId) {
                    this.lastTextChunkMap.delete(elementId);
                    this.elementStack.pop();
                }
                if (this.debugEnabled)
                    console.log(`Phase1: [END] <${tagName}>#${elementId}, onEndTag registration failed`);
            }
        }
        else {
            // --- Process void elements ---
            if (this.debugEnabled)
                console.log(`Phase1: [END] Void element <${tagName}>#${elementId}`);
            // Delete from Map for void elements too
            this.lastTextChunkMap.delete(elementId);
            // Remove textChunks from void element info
            const voidInfo = this.elementStore.get(elementId);
            if (voidInfo)
                voidInfo.textChunks = undefined;
        }
    }

    /**
     * Text chunk processing: append a non-whitespace chunk to the element on top
     * of the stack, unless it directly repeats the previous chunk of the same text
     * run or belongs to a skipped element.
     * @param text Text chunk token supplied by the parser (its `text` property is read)
     */
    text(text) {
        const currentElementId = this.getCurrentElementIdFromStack();
        const newChunk = text.text; // Original text
        const significantText = newChunk.trim().length > 0;
        if (this.debugEnabled)
            console.log(`Phase1 Text Handler: currentElementId=${currentElementId}, hasSignificantText=${significantText}, chunk="${newChunk.substring(0, 50).replace(/\n/g, '\\n')}..."`);

        // Script source, CSS and similar content of skipped elements is not prose.
        if (this.isTextInsideSkippedElement()) {
            if (this.debugEnabled)
                console.log(" -> Ignored because chunk belongs to a skipped element.");
            return;
        }
        if (currentElementId === null) {
            if (this.debugEnabled)
                console.log(" -> Ignored because stack is empty.");
            return;
        }
        if (!significantText) {
            if (this.debugEnabled)
                console.log(" -> Ignored because chunk is whitespace only.");
            return;
        }

        const currentInfo = this.elementStore.get(currentElementId);
        if (!currentInfo) {
            if (this.debugEnabled)
                console.log(` -> Ignored because currentInfo not found for ID: ${currentElementId}`);
            return;
        }
        if (VOID_ELEMENTS.has(currentInfo.tagName)) {
            if (this.debugEnabled)
                console.log(` -> Ignored for void element: ${currentInfo.tagName}#${currentElementId}`);
            return;
        }

        // Duplicate check: drop a chunk identical to the one just appended.
        // NOTE(review): presumably this guards against the same chunk being delivered
        // twice in a row (e.g. the handler being registered at two levels) — confirm.
        const lastChunk = this.lastTextChunkMap.get(currentElementId);
        if (lastChunk === newChunk) {
            if (this.debugEnabled)
                console.log(` -> Skipped duplicate chunk for ${currentInfo.tagName}#${currentElementId}`);
            return;
        }
        if (!currentInfo.textChunks)
            currentInfo.textChunks = [];
        currentInfo.textChunks.push(newChunk);
        if (this.debugEnabled)
            console.log(` -> Added to ${currentInfo.tagName}#${currentElementId}`);
        // Record the last added chunk (whitespace-only chunks never reach this point)
        this.lastTextChunkMap.set(currentElementId, newChunk);
    }
}