@axync/extract-html-main-content
Version:

116 lines (115 loc) • 4.32 kB
JavaScript
import * as cheerio from "cheerio";
/**
 * Reduces an HTML document to the text of its largest top-level section and
 * renders that text as lightweight Markdown.
 */
export class HtmlMainContentExtractor {
  /**
   * Extracts the main textual content of an HTML document.
   *
   * Non-content elements (scripts, styles, head, ...) and comments are removed,
   * the direct child of `<body>` holding the most text is selected, and that
   * element plus each of its descendants is converted with
   * {@link HtmlMainContentExtractor#convertToMarkdown}.
   *
   * NOTE: every element contributes only its *direct* text, emitted as one
   * piece before the text of its child elements, so inline markup such as
   * `<p>a <b>b</b> c</p>` is emitted as "a  c" followed by "**b**".
   *
   * @param {string} rawHtmlString - HTML to parse; any falsy value yields "".
   * @param {object} [options] - Extraction switches; `null` is treated as `{}`.
   * @param {boolean} [options.tryRemoveHiddenElement=true] - Drop elements hidden
   *   through inline styles, common utility classes or the `hidden` attribute.
   * @param {boolean} [options.includeLinkHref=true] - Render anchors as
   *   `[text](href)` instead of plain text.
   * @returns {string} Trimmed Markdown-like text, or "" when nothing is found.
   */
  extract(rawHtmlString, options = {}) {
    if (!rawHtmlString) {
      return "";
    }
    // The parameter default only covers `undefined`; guard an explicit `null` too.
    const { tryRemoveHiddenElement = true, includeLinkHref = true } = options ?? {};
    const $ = cheerio.load(rawHtmlString);

    $("script, style, noscript, link[rel='stylesheet'], meta, title, head").remove();
    if (tryRemoveHiddenElement) {
      $("[style*='display:none'], [style*='display: none']").remove();
      $("[style*='visibility:hidden'], [style*='visibility: hidden']").remove();
      $(".hidden, .hide, .invisible, .sr-only, .screen-reader-only").remove();
      $("[hidden]").remove();
    }
    // nodeType 8 is a comment node.
    $("*")
      .contents()
      .filter(function () {
        return this.nodeType === 8;
      })
      .remove();

    const body = $("body");
    if (body.length === 0) {
      return "";
    }
    const childNodes = body.children();
    if (childNodes.length === 0) {
      // Body holds no elements: whatever text it has is the content.
      return body.text().trim();
    }

    // Pick the direct child of <body> with the most text; ties keep the earliest.
    let largestNode = childNodes.first();
    let largestNodeSize = 0;
    childNodes.each((_, element) => {
      const childNode = $(element);
      const innerText = childNode.text() || "";
      if (innerText.length > largestNodeSize) {
        largestNode = childNode;
        largestNodeSize = innerText.length;
      }
    });

    // Walk the selected section itself followed by its descendants. Including
    // the root matters: `find("*")` returns descendants only, so text placed
    // directly inside the section would otherwise be lost.
    const elements = [largestNode.get(0), ...largestNode.find("*").toArray()];
    let result = "";
    let blankNodeCount = 0;
    for (const element of elements) {
      const node = $(element);
      const tagName = node.prop("tagName")?.toLowerCase() || "div";
      // nodeType 3 is a text node: keep only text owned directly by this element.
      const directTextContent = node
        .contents()
        .filter(function () {
          return this.nodeType === 3;
        })
        .text()
        .trim();
      if (directTextContent) {
        blankNodeCount = 0;
        const markdownContent = this.convertToMarkdown(tagName, directTextContent, node, includeLinkHref);
        result += `${markdownContent} `;
      } else if (blankNodeCount < 2) {
        // At most two consecutive line breaks for runs of text-less elements.
        result += "\n";
        blankNodeCount++;
      }
    }

    // Safety net: if the walk produced only whitespace, fall back to the
    // section's full text (presumably only reachable when the text is carried
    // by something other than plain text nodes - TODO confirm).
    if (result.trim() === "") {
      const tagName = largestNode.prop("tagName")?.toLowerCase() || "div";
      const textContent = largestNode.text().trim();
      if (textContent) {
        result += this.convertToMarkdown(tagName, textContent, largestNode, includeLinkHref);
      }
    }
    return result.trim();
  }

  /**
   * Wraps a piece of text in the Markdown syntax matching its HTML tag.
   *
   * @param {string} tagName - Lower-case tag name (e.g. "h2", "li", "a").
   * @param {string} textContent - Text to wrap.
   * @param {object} [node] - Wrapped element exposing `attr(name)`; only read
   *   for "a" tags to look up `href`.
   * @param {boolean} [includeLinkHref] - When truthy, anchors with an `href`
   *   become `[text](href)`.
   * @returns {string} The Markdown fragment; unknown tags return the text as is.
   */
  convertToMarkdown(tagName, textContent, node, includeLinkHref) {
    // h1..h6 map to the same number of leading '#' characters.
    const heading = /^h([1-6])$/.exec(tagName);
    if (heading) {
      return `\n${"#".repeat(Number(heading[1]))} ${textContent}\n`;
    }
    switch (tagName) {
      case "p":
        return `\n${textContent}\n`;
      case "strong":
      case "b":
        return `**${textContent}**`;
      case "em":
      case "i":
        return `*${textContent}*`;
      case "code":
        return `\`${textContent}\``;
      case "blockquote":
        return `> ${textContent}`;
      case "li":
        return `\n- ${textContent}\n`;
      case "a": {
        const href = includeLinkHref && node ? node.attr("href") : undefined;
        return href ? `[${textContent}](${href})` : textContent;
      }
      default:
        return textContent;
    }
  }
}
// Single shared extractor instance backing the function-style export below.
const htmlMainContentExtractor = new HtmlMainContentExtractor();

/**
 * Function-style entry point: `extract` pre-bound to the shared instance, so it
 * can be passed around without losing its `this`.
 */
export const extractHtmlMainContent = HtmlMainContentExtractor.prototype.extract.bind(
  htmlMainContentExtractor,
);