@akira108sys/html-rewriter-readability
Version:
A library to extract readable content with Mozilla/Readability algorithm using Cloudflare HTMLRewriter.
185 lines (184 loc) • 8.45 kB
JavaScript
import { CHAR_THRESHOLD, COMMAS_REGEX, DEFAULT_TAGS_TO_SCORE, NB_TOP_CANDIDATES } from './constants';
import { getAncestorIds, getChildrenIds, getClassWeight, getDescendantIds, getElementInfo, getInnerText, getLinkDensity, getParentId, isUnlikelyCandidate } from './utils';
function initializeNodeScore(id, store) {
const info = getElementInfo(id, store);
if (!info || info.readability)
return undefined;
info.readability = { contentScore: 0 };
switch (info.tagName) {
case "ARTICLE":
case "DIV":
info.readability.contentScore += 5;
break;
case "PRE":
case "TD":
case "BLOCKQUOTE":
info.readability.contentScore += 3;
break;
case "ADDRESS": /*...*/
info.readability.contentScore -= 3;
break;
case "H1": /*...*/
info.readability.contentScore -= 5;
break;
}
info.readability.contentScore += getClassWeight(id, store);
return info;
}
// --- Main Function ---
export function calculateScoresAndFindBestCandidate(elementStore, options) {
const { debug, nbTopCandidates = NB_TOP_CANDIDATES, charThreshold = CHAR_THRESHOLD, allowedVideoRegex, linkDensityModifier = 0 } = options;
if (debug)
console.log("Starting Phase 2: Scoring...");
if (debug)
console.log("Using options:", { nbTopCandidates, charThreshold, linkDensityModifier });
const candidates = new Map();
const elementsToScoreIds = [];
// 1. Identify elements to score
for (const id of elementStore.keys()) {
const info = getElementInfo(id, elementStore);
if (!info || !info.isVisibleBasedOnAttrs || isUnlikelyCandidate(id, elementStore))
continue;
if (DEFAULT_TAGS_TO_SCORE.has(info.tagName)) {
elementsToScoreIds.push(id);
}
}
if (debug)
console.log(`Found ${elementsToScoreIds.length} elements to score.`);
// 2. Score elements and propagate to ancestors
for (const elementId of elementsToScoreIds) {
const elementInfo = getElementInfo(elementId, elementStore);
if (!elementInfo || !elementInfo.isVisibleBasedOnAttrs)
continue;
const innerText = getInnerText(elementId, elementStore);
if (innerText.length < 25)
continue;
const ancestorIds = getAncestorIds(elementId, elementStore, 5);
if (ancestorIds.length === 0)
continue;
const contentScore = 1 + innerText.split(COMMAS_REGEX).length + Math.min(Math.floor(innerText.length / 100), 3);
ancestorIds.forEach((ancestorId, level) => {
let ancestorInfo = getElementInfo(ancestorId, elementStore);
if (!ancestorInfo || !ancestorInfo.isVisibleBasedOnAttrs || isUnlikelyCandidate(ancestorId, elementStore))
return;
if (!ancestorInfo.readability) {
ancestorInfo = initializeNodeScore(ancestorId, elementStore);
if (ancestorInfo && (ancestorInfo.readability?.contentScore ?? -1) >= 0) {
candidates.set(ancestorId, ancestorInfo);
}
}
const scoreDivider = level === 0 ? 1 : (level === 1 ? 2 : level * 3);
if (ancestorInfo?.readability) {
ancestorInfo.readability.contentScore += contentScore / scoreDivider;
}
});
}
if (debug)
console.log(`Found ${candidates.size} potential candidates.`);
// 3. Adjust candidate scores
const candidateScores = new Map();
for (const [id, candidateInfo] of candidates.entries()) {
if (candidateInfo.readability) {
const linkDensity = getLinkDensity(id, elementStore);
const finalScore = candidateInfo.readability.contentScore * (1 - linkDensity);
candidateInfo.readability.contentScore = finalScore; // Update
candidateScores.set(id, finalScore);
}
}
// 4. Select top candidate
const sortedCandidates = Array.from(candidateScores.entries()).filter(([, score]) => score > 0).sort((a, b) => b[1] - a[1]);
if (sortedCandidates.length === 0) { /* ... Error handling ... */
return { topCandidateId: null, elementsToKeepIds: [] };
}
let topCandidateId = sortedCandidates[0][0];
let topCandidateScore = sortedCandidates[0][1];
if (debug)
console.log(`Initial Top Candidate: ${getElementInfo(topCandidateId, elementStore)?.tagName}#${topCandidateId} (Score: ${topCandidateScore.toFixed(2)})`);
// 5. Candidate improvement logic (traverse up to parent)
let currentCandidateId = topCandidateId;
let currentScore = topCandidateScore;
let parentId = getParentId(currentCandidateId, elementStore);
while (parentId) {
const parentInfo = getElementInfo(parentId, elementStore);
const parentScore = parentInfo?.readability?.contentScore ?? -1;
if (parentScore < currentScore / 3)
break;
if (parentScore > currentScore) {
currentCandidateId = parentId;
currentScore = parentScore;
parentId = getParentId(currentCandidateId, elementStore);
}
else {
break;
}
}
topCandidateId = currentCandidateId;
topCandidateScore = currentScore;
// (Similarly implement single child element logic)
parentId = getParentId(topCandidateId, elementStore);
let topCandidateInfo = getElementInfo(topCandidateId, elementStore);
while (parentId && topCandidateInfo && topCandidateInfo.tagName !== 'BODY') {
const parentChildren = getChildrenIds(parentId, elementStore);
if (parentChildren.length === 1) {
topCandidateId = parentId;
topCandidateInfo = getElementInfo(topCandidateId, elementStore);
parentId = getParentId(topCandidateId, elementStore);
}
else {
break;
}
}
// 6. Combine sibling elements
const elementsToKeepIdsSet = new Set();
const finalTopCandidateParentId = getParentId(topCandidateId, elementStore);
if (finalTopCandidateParentId && topCandidateId) {
const siblings = getChildrenIds(finalTopCandidateParentId, elementStore);
const siblingScoreThreshold = Math.max(10, topCandidateScore * 0.2);
for (const siblingId of siblings) {
const siblingInfo = getElementInfo(siblingId, elementStore);
if (!siblingInfo || !siblingInfo.isVisibleBasedOnAttrs)
continue;
let append = false;
if (siblingId === topCandidateId)
append = true;
else {
const siblingScore = siblingInfo.readability?.contentScore ?? 0;
const contentBonus = 0; /* ... */
if (siblingScore + contentBonus >= siblingScoreThreshold)
append = true;
else if (siblingInfo.tagName === 'P') { /* ... P tag evaluation ... */ }
}
if (append) {
elementsToKeepIdsSet.add(siblingId);
for (const id of getDescendantIds(siblingId, elementStore)) {
elementsToKeepIdsSet.add(id);
}
}
}
}
else if (topCandidateId) {
elementsToKeepIdsSet.add(topCandidateId);
for (const id of getDescendantIds(topCandidateId, elementStore)) {
elementsToKeepIdsSet.add(id);
}
}
const elementsToKeepIds = Array.from(elementsToKeepIdsSet);
// 7. Check character count
let totalTextLength = 0;
for (const id of elementsToKeepIds) {
const info = getElementInfo(id, elementStore);
if (info?.isVisibleBasedOnAttrs)
totalTextLength += getInnerText(id, elementStore).length;
}
if (debug)
console.log(`Final combined text length (estimated): ${totalTextLength}`);
if (totalTextLength < charThreshold) {
if (debug)
console.log(`Content length ${totalTextLength} is below charThreshold ${charThreshold}. Returning null.`);
return { topCandidateId: null, elementsToKeepIds: [] };
}
// 8. Return results
if (debug)
console.log(`Phase 2 Completed. Final Top Candidate: ${getElementInfo(topCandidateId, elementStore)?.tagName}#${topCandidateId}. Elements to keep: ${elementsToKeepIds.length}`);
return { topCandidateId, elementsToKeepIds };
}