UNPKG

@axync/extract-html-main-content

Version:

![Test](https://github.com/AnxinYang/axync/actions/workflows/test.yml/badge.svg)

116 lines (115 loc) 4.32 kB
import * as cheerio from "cheerio";

/**
 * Extracts the dominant text content of an HTML document and renders it as
 * lightweight Markdown-flavoured text.
 */
export class HtmlMainContentExtractor {
  /**
   * Picks the direct child of `<body>` that holds the most text and converts
   * the text found inside it to Markdown-like output.
   *
   * @param {string} rawHtmlString - HTML markup to process. Falsy input yields "".
   * @param {object} [options]
   * @param {boolean} [options.tryRemoveHiddenElement=true] - Drop elements that
   *   look hidden (inline `display:none` / `visibility:hidden`, common "hidden"
   *   utility classes, the `hidden` attribute) before extracting.
   * @param {boolean} [options.includeLinkHref=true] - Render `<a href>` elements
   *   as `[text](href)` instead of plain text.
   * @returns {string} The extracted, trimmed text; "" when nothing is found.
   */
  extract(rawHtmlString, options = {}) {
    if (!rawHtmlString) {
      return "";
    }

    // The `= {}` default only covers `undefined`; `?? {}` also tolerates `null`.
    const { tryRemoveHiddenElement = true, includeLinkHref = true } =
      options ?? {};

    // DOM nodeType values used below.
    const TEXT_NODE = 3;
    const COMMENT_NODE = 8;

    const $ = cheerio.load(rawHtmlString);

    // Non-content markup never contributes to the extracted text.
    $("script, style, noscript, link[rel='stylesheet'], meta, title, head").remove();

    if (tryRemoveHiddenElement) {
      $("[style*='display:none'], [style*='display: none']").remove();
      $("[style*='visibility:hidden'], [style*='visibility: hidden']").remove();
      $(".hidden, .hide, .invisible, .sr-only, .screen-reader-only").remove();
      $("[hidden]").remove();
    }

    // Strip HTML comments everywhere.
    $("*")
      .contents()
      .filter(function () {
        return this.nodeType === COMMENT_NODE;
      })
      .remove();

    const body = $("body");
    if (body.length === 0) {
      return "";
    }

    const childNodes = body.children();
    if (childNodes.length === 0) {
      // No element children: whatever text sits directly in <body> is the content.
      return body.text().trim();
    }

    // The "main" container is the direct child of <body> with the longest text.
    let largestNode = childNodes.first();
    let largestNodeSize = 0;
    childNodes.each((_, element) => {
      const childNode = $(element);
      const innerText = childNode.text() || "";
      if (innerText.length > largestNodeSize) {
        largestNode = childNode;
        largestNodeSize = innerText.length;
      }
    });

    // Text that belongs to `node` itself (its own text nodes, not its descendants').
    const directTextOf = (node) =>
      node
        .contents()
        .filter(function () {
          return this.nodeType === TEXT_NODE;
        })
        .text()
        .trim();

    const tagNameOf = (node) => node.prop("tagName")?.toLowerCase() || "div";

    let result = "";
    let blankNodeCount = 0;

    // `find("*")` yields descendants only, so the container's own direct text
    // must be emitted explicitly or it would be lost whenever a descendant
    // also carries text (e.g. `<p>Hello <b>world</b></p>`).
    const rootText = directTextOf(largestNode);
    if (rootText) {
      const rootMarkdown = this.convertToMarkdown(
        tagNameOf(largestNode),
        rootText,
        largestNode,
        includeLinkHref,
      );
      result += `${rootMarkdown} `;
    }

    largestNode.find("*").each((_, element) => {
      const node = $(element);
      const directTextContent = directTextOf(node);

      if (directTextContent) {
        blankNodeCount = 0;
        const markdownContent = this.convertToMarkdown(
          tagNameOf(node),
          directTextContent,
          node,
          includeLinkHref,
        );
        result += `${markdownContent} `;
      } else if (blankNodeCount < 2) {
        // Text-less elements become line breaks, capped at two in a row.
        result += "\n";
        blankNodeCount++;
      }
    });

    // Safety net: if the walk produced nothing, fall back to the container's
    // full text content.
    if (result.trim() === "") {
      const textContent = largestNode.text().trim();
      if (textContent) {
        result += this.convertToMarkdown(
          tagNameOf(largestNode),
          textContent,
          largestNode,
          includeLinkHref,
        );
      }
    }

    return result.trim();
  }

  /**
   * Wraps a piece of text in the Markdown syntax matching its HTML tag.
   *
   * @param {string} tagName - Lower-case tag name of the source element.
   * @param {string} textContent - Text to wrap.
   * @param {object} [node] - Cheerio wrapper of the element; only consulted
   *   for the `href` attribute of `<a>` elements.
   * @param {boolean} [includeLinkHref] - When truthy, `<a>` elements with an
   *   `href` are rendered as `[text](href)`.
   * @returns {string} The formatted text; unknown tags return the text unchanged.
   */
  convertToMarkdown(tagName, textContent, node, includeLinkHref) {
    // h1..h6 map to one to six leading "#" characters.
    const heading = /^h([1-6])$/.exec(tagName);
    if (heading) {
      return `\n${"#".repeat(Number(heading[1]))} ${textContent}\n`;
    }

    switch (tagName) {
      case "p":
        return `\n${textContent}\n`;
      case "strong":
      case "b":
        return `**${textContent}**`;
      case "em":
      case "i":
        return `*${textContent}*`;
      case "code":
        return `\`${textContent}\``;
      case "blockquote":
        return `> ${textContent}`;
      case "li":
        return `\n- ${textContent}\n`;
      case "a": {
        if (includeLinkHref && node) {
          const href = node.attr("href");
          if (href) {
            return `[${textContent}](${href})`;
          }
        }
        return textContent;
      }
      default:
        return textContent;
    }
  }
}

// Shared instance backing the convenience function export below.
const htmlMainContentExtractor = new HtmlMainContentExtractor();

/**
 * Convenience wrapper around `HtmlMainContentExtractor#extract`, bound to a
 * shared module-level instance.
 */
export const extractHtmlMainContent = htmlMainContentExtractor.extract.bind(htmlMainContentExtractor);