UNPKG

magnitude-extract

Version:

TypeScript DOM cleaning and structuring library

1,460 lines (1,453 loc) 70.8 kB
#!/usr/bin/env node 'use strict'; var cheerio = require('cheerio'); var uuid = require('uuid'); var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null; function _interopNamespace(e) { if (e && e.__esModule) return e; var n = Object.create(null); if (e) { Object.keys(e).forEach(function (k) { if (k !== 'default') { var d = Object.getOwnPropertyDescriptor(e, k); Object.defineProperty(n, k, d.get ? d : { enumerable: true, get: function () { return e[k]; } }); } }); } n.default = e; return Object.freeze(n); } var cheerio__namespace = /*#__PURE__*/_interopNamespace(cheerio); // src/mappings.ts var TAG_TO_ELEMENT_TYPE = { // Headings and titles "h1": "Title" /* TITLE */, "h2": "Title" /* TITLE */, "h3": "Title" /* TITLE */, "h4": "Title" /* TITLE */, "h5": "Title" /* TITLE */, "h6": "Title" /* TITLE */, "title": "Title" /* TITLE */, // Document title // Text content "p": "NarrativeText" /* NARRATIVE_TEXT */, "div": "Text" /* TEXT */, // Will be refined by content analysis "span": "Text" /* TEXT */, "article": "NarrativeText" /* NARRATIVE_TEXT */, "section": "NarrativeText" /* NARRATIVE_TEXT */, "main": "NarrativeText" /* NARRATIVE_TEXT */, "aside": "Text" /* TEXT */, // Specialized text elements "blockquote": "NarrativeText" /* NARRATIVE_TEXT */, "q": "NarrativeText" /* NARRATIVE_TEXT */, "cite": "NarrativeText" /* NARRATIVE_TEXT */, "abbr": "Text" /* TEXT */, "acronym": "Text" /* TEXT */, "dfn": "Text" /* TEXT */, "time": "Text" /* TEXT */, // Lists "li": "ListItem" /* LIST_ITEM */, "ul": "List" /* LIST */, "ol": "List" /* LIST */, "dl": "List" /* LIST */, "dt": "ListItem" /* LIST_ITEM */, "dd": "ListItem" /* LIST_ITEM */, // Tables "table": "Table" /* TABLE */, "thead": "Table" /* TABLE */, "tbody": "Table" /* TABLE */, "tfoot": "Table" /* TABLE */, "tr": "Table" /* TABLE */, "td": "Table" /* TABLE */, "th": "Table" /* TABLE */, "caption": "Caption" /* CAPTION */, "colgroup": "Table" /* TABLE */, "col": "Table" /* TABLE */, // Media elements "img": "Image" /* IMAGE */, "figure": "Figure" /* FIGURE */, "picture": "Picture" /* PICTURE */, "figcaption": "FigureCaption" /* FIGURE_CAPTION */, "video": "Image" /* IMAGE */, // Treat as media element "audio": "Image" /* IMAGE */, // Treat as media element "canvas": "Image" /* IMAGE */, "svg": "Image" /* IMAGE */, // Code elements "code": "CodeSnippet" /* CODE_SNIPPET */, "pre": "CodeSnippet" /* CODE_SNIPPET */, "kbd": "CodeSnippet" /* CODE_SNIPPET */, "samp": "CodeSnippet" /* CODE_SNIPPET */, "var": "CodeSnippet" /* CODE_SNIPPET */, // Navigation (usually filtered) "nav": "Navigation" /* NAVIGATION */, "menu": "Navigation" /* NAVIGATION */, "menuitem": "Navigation" /* NAVIGATION */, // Headers/Footers "header": "Header" /* HEADER */, "footer": "Footer" /* FOOTER */, // Forms - only form itself should be mapped directly, others handled by specialized logic "form": "Form" /* FORM */, "label": "FieldName" /* FIELD_NAME */, "legend": "FieldName" /* FIELD_NAME */, "option": "Value" /* VALUE */, "output": "Value" /* VALUE */, "progress": "Value" /* VALUE */, "meter": "Value" /* VALUE */, // Contact information (removed - caused false positives) // Links "a": "Link" /* LINK */, // Document structure "hr": "PageBreak" /* PAGE_BREAK */, "br": "Text" /* TEXT */, // Line break, usually part of text // Mathematical content "math": "Formula" /* FORMULA */, "mrow": "Formula" /* FORMULA */, "mi": "Formula" /* FORMULA */, "mn": "Formula" /* FORMULA */, "mo": "Formula" /* FORMULA */, "mfrac": "Formula" /* FORMULA */, "msup": "Formula" /* FORMULA */, "msub": "Formula" /* FORMULA */, "msubsup": "Formula" /* FORMULA */, "munder": "Formula" /* FORMULA */, "mover": "Formula" /* FORMULA */, "munderover": "Formula" /* FORMULA */, "msqrt": "Formula" /* FORMULA */, "mroot": "Formula" /* FORMULA */, "mtext": "Formula" /* FORMULA */, "mspace": "Formula" /* FORMULA */, "mstyle": "Formula" /* FORMULA */, "merror": "Formula" /* FORMULA */, "mpadded": "Formula" /* FORMULA */, "mphantom": "Formula" /* FORMULA */, "mfenced": "Formula" /* FORMULA */, "menclose": "Formula" /* FORMULA */, "mtable": "Formula" /* FORMULA */, "mtr": "Formula" /* FORMULA */, "mtd": "Formula" /* FORMULA */, "maligngroup": "Formula" /* FORMULA */, "malignmark": "Formula" /* FORMULA */, "mlabeledtr": "Formula" /* FORMULA */, "maction": "Formula" /* FORMULA */, "semantics": "Formula" /* FORMULA */, "annotation": "Formula" /* FORMULA */, "annotation-xml": "Formula" /* FORMULA */ }; var CSS_CLASS_PATTERNS = [ // Navigation patterns { pattern: /\b(nav|menu|breadcrumb|sidebar|navigation|navbar|menubar)\b/i, elementType: "Navigation" /* NAVIGATION */ }, // Header/Footer patterns { pattern: /\b(header|masthead|banner|site-header|page-header|main-header)\b/i, elementType: "PageHeader" /* PAGE_HEADER */ }, { pattern: /\b(footer|copyright|legal|site-footer|page-footer|main-footer)\b/i, elementType: "PageFooter" /* PAGE_FOOTER */ }, { pattern: /\b(section-header|content-header)\b/i, elementType: "SectionHeader" /* SECTION_HEADER */ }, // Title and heading patterns { pattern: /\b(title|heading|headline|h[1-6]|header-text)\b/i, elementType: "Title" /* TITLE */ }, { pattern: /\b(subtitle|subheading|sub-title|sub-heading)\b/i, elementType: "Subheadline" /* SUB_HEADLINE */ }, // Content patterns { pattern: /\b(content|article|post|story|narrative|text|body|main-content)\b/i, elementType: "NarrativeText" /* NARRATIVE_TEXT */ }, { pattern: /\b(paragraph|para|text-block)\b/i, elementType: "Paragraph" /* PARAGRAPH */ }, { pattern: /\b(abstract|summary|synopsis)\b/i, elementType: "Abstract" /* ABSTRACT */ }, // List patterns { pattern: /\b(list|item|bullet|numbered|ordered|unordered)\b/i, elementType: "ListItem" /* LIST_ITEM */ }, // Form patterns - only match actual form containers, not styling divs { pattern: /\b(form-container|form-wrapper|contact-form|login-form|signup-form)\b/i, elementType: "Form" /* FORM */ }, { pattern: /\b(label|field-name|form-label)\b/i, elementType: "FieldName" /* FIELD_NAME */ }, { pattern: /\b(value|field-value|input-value)\b/i, elementType: "Value" /* VALUE */ }, // Table patterns { pattern: /\b(table|grid|data|tabular|spreadsheet)\b/i, elementType: "Table" /* TABLE */ }, // Media patterns { pattern: /\b(image|img|picture|photo|figure|media)\b/i, elementType: "Image" /* IMAGE */ }, { pattern: /\b(caption|img-caption|figure-caption|photo-caption)\b/i, elementType: "FigureCaption" /* FIGURE_CAPTION */ }, // Code patterns { pattern: /\b(code|highlight|syntax|language-|hljs|prettyprint|source|snippet)\b/i, elementType: "CodeSnippet" /* CODE_SNIPPET */ }, // Address and email patterns removed - caused false positives // Mathematical patterns { pattern: /\b(math|latex|katex|mathjax|formula|equation)\b/i, elementType: "Formula" /* FORMULA */ }, // Footnote patterns { pattern: /\b(footnote|endnote|note|reference)\b/i, elementType: "Footnote" /* FOOTNOTE */ }, // Page number patterns { pattern: /\b(page-number|pagination|page-info)\b/i, elementType: "PageNumber" /* PAGE_NUMBER */ }, // Link patterns { pattern: /\b(link|hyperlink|url|href)\b/i, elementType: "Link" /* LINK */ } ]; var IGNORED_TAGS = /* @__PURE__ */ new Set([ "script", "style", "meta", "link", "noscript", "iframe", // we expand these automatically anyway before processing "object", "embed", "applet" ]); var NAVIGATION_TAGS = /* @__PURE__ */ new Set([ "nav", "menu", "menuitem", "aside" // Often sidebars ]); var INLINE_TAGS = /* @__PURE__ */ new Set([ "a", "strong", "b", "em", "i", "u", "span", "code", "kbd", "samp", "var", "mark", "small", "sub", "sup" ]); // src/cleaner.ts var DOMCleaner = class { constructor(options = {}) { this.options = options; } /** * Clean and normalize HTML content */ clean($) { this.removeIgnoredTags($); if (this.options.skipNavigation) { this.removeNavigationElements($); } if (this.options.skipHeaders) { this.removeHeaders($); } if (this.options.skipFooters) { this.removeFooters($); } if (this.options.skipHeadersAndFooters) { this.removeHeaders($); this.removeFooters($); } if (this.options.skipForms) { this.removeForms($); } this.normalizeWhitespace($); this.removeEmptyElements($); return $; } /** * Remove script, style, and other ignored tags */ removeIgnoredTags($) { IGNORED_TAGS.forEach((tag) => { $(tag).remove(); }); $("*").contents().filter(function() { return this.type === "comment"; }).remove(); } /** * Remove navigation elements */ removeNavigationElements($) { NAVIGATION_TAGS.forEach((tag) => { $(tag).remove(); }); const navSelectors = [ '[class*="nav"]', '[class*="menu"]', '[class*="breadcrumb"]', '[class*="sidebar"]', '[id*="nav"]', '[id*="menu"]', '[role="navigation"]', '[role="menu"]', '[role="menubar"]' ]; navSelectors.forEach((selector) => { $(selector).remove(); }); } /** * Remove header elements */ removeHeaders($) { const headerSelectors = [ "header", '[class*="header"]', '[class*="masthead"]', '[class*="banner"]', '[id*="header"]', '[role="banner"]' ]; headerSelectors.forEach((selector) => { $(selector).remove(); }); } /** * Remove footer elements */ removeFooters($) { const footerSelectors = [ "footer", '[class*="footer"]', '[class*="copyright"]', '[class*="legal"]', '[id*="footer"]', '[role="contentinfo"]' ]; footerSelectors.forEach((selector) => { $(selector).remove(); }); } /** * Remove form elements */ removeForms($) { $("form, input, button, select, textarea").remove(); } /** * Normalize whitespace in text content */ normalizeWhitespace($) { if (this.options.preserveWhitespace) { return; } $("*").contents().filter(function() { return this.type === "text"; }).each((_, node) => { if ("data" in node && node.data) { node.data = node.data.replace(/\\s+/g, " "); } }); } /** * Remove empty elements that don't contribute content */ removeEmptyElements($) { $("*").each((_, element) => { const $el = $(element); const text = $el.text().trim(); const tagName = element.tagName; const hasImages = $el.find("img").length > 0; const hasTables = $el.find("table").length > 0; const hasInputs = $el.find("input, button, select, textarea").length > 0; const isHeading = /^h[1-6]$/i.test(tagName); const isTableCell = /^(td|th)$/i.test(tagName); const isImage = tagName === "img"; const isFormElement = /^(input|button|select|textarea|form|label|fieldset|legend)$/i.test(tagName); const isTable = tagName === "table"; const shouldKeep = text.length >= (this.options.minTextLength || 3) || isHeading || isTableCell || isImage && this.options.extractImages || hasImages && this.options.extractImages || hasTables && this.options.extractTables || isTable && this.options.extractTables || hasInputs && !this.options.skipForms || isFormElement && !this.options.skipForms; if (!shouldKeep && !this.hasSignificantChildren($el)) { $el.remove(); } }); } /** * Check if element has children that should be preserved */ hasSignificantChildren($el) { const significantTags = ["img", "table", "h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "form", "input", "textarea", "select", "button", "td", "th", "tr"]; return significantTags.some((tag) => $el.find(tag).length > 0); } }; var ElementClassifier = class { /** * Classify a DOM element based on its tag, attributes, and content */ classifyElement($el) { const tagName = $el.prop("tagName")?.toLowerCase(); if (!tagName) { return "UncategorizedText" /* UNCATEGORIZED_TEXT */; } const tagType = TAG_TO_ELEMENT_TYPE[tagName]; if (tagType && ["table", "form"].includes(tagName)) { return tagType; } const specializedType = this.classifySpecializedElement($el); if (specializedType !== "UncategorizedText" /* UNCATEGORIZED_TEXT */) { return specializedType; } if (tagType && tagType !== "Text" /* TEXT */) { return tagType; } const classType = this.classifyByCSS($el); if (classType !== "UncategorizedText" /* UNCATEGORIZED_TEXT */) { return classType; } return this.classifyByContent($el); } /** * Classify specialized elements (forms, addresses, emails, code, etc.) */ classifySpecializedElement($el) { if (this.isFormElement($el)) { return this.classifyFormElement($el); } if (this.isCodeElement($el)) { return "CodeSnippet" /* CODE_SNIPPET */; } if (this.isPageNumberElement($el)) { return "PageNumber" /* PAGE_NUMBER */; } if (this.isAbstractElement($el)) { return "Abstract" /* ABSTRACT */; } const headerFooterType = this.classifyHeaderFooter($el); if (headerFooterType !== "UncategorizedText" /* UNCATEGORIZED_TEXT */) { return headerFooterType; } return "UncategorizedText" /* UNCATEGORIZED_TEXT */; } /** * Check if element is a form-related element */ isFormElement($el) { const tagName = $el.prop("tagName")?.toLowerCase(); const formTags = ["form", "input", "textarea", "select", "button", "fieldset", "legend", "label"]; const type = $el.attr("type")?.toLowerCase(); return formTags.includes(tagName || "") || tagName === "input" && ["checkbox", "radio", "submit", "button"].includes(type || ""); } /** * Classify form elements into specific types */ classifyFormElement($el) { const tagName = $el.prop("tagName")?.toLowerCase(); const type = $el.attr("type")?.toLowerCase(); const checked = $el.prop("checked"); switch (tagName) { case "form": return "Form" /* FORM */; case "input": switch (type) { case "checkbox": return checked ? "CheckBoxChecked" /* CHECK_BOX_CHECKED */ : "CheckBoxUnchecked" /* CHECK_BOX_UNCHECKED */; case "radio": return checked ? "RadioButtonChecked" /* RADIO_BUTTON_CHECKED */ : "RadioButtonUnchecked" /* RADIO_BUTTON_UNCHECKED */; default: return "Value" /* VALUE */; } case "label": return "FieldName" /* FIELD_NAME */; case "textarea": case "select": return "Value" /* VALUE */; case "fieldset": return "Text" /* TEXT */; case "legend": return "FieldName" /* FIELD_NAME */; case "button": return "Value" /* VALUE */; default: if (tagName === "form") { return "Form" /* FORM */; } return "Text" /* TEXT */; } } /** * Check if element contains code */ isCodeElement($el) { const tagName = $el.prop("tagName")?.toLowerCase(); const className = $el.attr("class") || ""; const codeTags = ["code", "pre", "kbd", "samp", "var"]; const codeClasses = /\b(code|highlight|syntax|language-|hljs|prettyprint)\b/i; return codeTags.includes(tagName || "") || codeClasses.test(className); } /** * Check if element contains an address */ isAddressElement($el) { const tagName = $el.prop("tagName")?.toLowerCase(); const text = $el.text().trim(); const className = $el.attr("class") || ""; if (tagName === "address") { return true; } const addressClasses = /\b(address|location|postal)\b/i; const formClasses = /\b(form|contact-form|login-form|signup-form)\b/i; if (addressClasses.test(className) && !formClasses.test(className)) { return true; } if (/\bcontact\b/i.test(className) && !/form/i.test(className)) { return true; } const addressPatterns = [ /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl)/i, /\b\d{5}(?:-\d{4})?\b/, // ZIP codes /\b[A-Z]{2}\s+\d{5}\b/ // State + ZIP ]; return addressPatterns.some((pattern) => pattern.test(text)); } /** * Check if element contains an email address */ isEmailAddressElement($el) { const text = $el.text().trim(); const emailPattern = /\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b/; return emailPattern.test(text); } /** * Check if element contains a mathematical formula */ isFormulaElement($el) { const tagName = $el.prop("tagName")?.toLowerCase(); const className = $el.attr("class") || ""; const text = $el.text().trim(); const mathTags = ["math", "mrow", "mi", "mn", "mo", "mfrac", "msup", "msub"]; if (mathTags.includes(tagName || "")) { return true; } const mathClasses = /\b(math|latex|katex|mathjax|formula|equation)\b/i; if (mathClasses.test(className)) { return true; } const mathPatterns = [ /[∑∏∫∂∇∆√∞±≤≥≠≈∈∉⊂⊃∪∩]/, /\$.*\$/, // LaTeX delimiters /\\[a-zA-Z]+\{.*\}/ // LaTeX commands ]; return mathPatterns.some((pattern) => pattern.test(text)); } /** * Check if element is a caption */ isCaptionElement($el) { const tagName = $el.prop("tagName")?.toLowerCase(); const className = $el.attr("class") || ""; const captionTags = ["caption", "figcaption"]; const captionClasses = /\b(caption|figure-caption|img-caption)\b/i; return captionTags.includes(tagName || "") || captionClasses.test(className); } /** * Classify caption elements */ classifyCaptionElement($el) { const tagName = $el.prop("tagName")?.toLowerCase(); if (tagName === "figcaption") { return "FigureCaption" /* FIGURE_CAPTION */; } const parent = $el.parent(); const parentTag = parent.prop("tagName")?.toLowerCase(); if (parentTag === "figure" || parent.find("img").length > 0) { return "FigureCaption" /* FIGURE_CAPTION */; } return "Caption" /* CAPTION */; } /** * Check if element is a footnote */ isFootnoteElement($el) { const className = $el.attr("class") || ""; const id = $el.attr("id") || ""; const text = $el.text().trim(); const footnoteClasses = /\b(footnote|endnote|note)\b/i; const footnoteIds = /\b(fn|footnote|note)-?\d+\b/i; const footnotePatterns = /^\[\d+\]|\(\d+\)|^\d+\./; return footnoteClasses.test(className) || footnoteIds.test(id) || footnotePatterns.test(text); } /** * Check if element contains a page number */ isPageNumberElement($el) { const text = $el.text().trim(); const className = $el.attr("class") || ""; const pageClasses = /\b(page-?number|pagination)\b/i; const pagePatterns = [ /^Page\s+\d+$/i, /^\d+\s+of\s+\d+$/i, /^\d+\s*\/\s*\d+$/, /^-?\s*\d+\s*-?$/ ]; return pageClasses.test(className) || pagePatterns.some((pattern) => pattern.test(text)); } /** * Check if element is an abstract */ isAbstractElement($el) { const className = $el.attr("class") || ""; const id = $el.attr("id") || ""; const text = $el.text().trim(); const abstractClasses = /\b(abstract|summary)\b/i; const abstractIds = /\babstract\b/i; const abstractStart = /^abstract\b/i; return abstractClasses.test(className) || abstractIds.test(id) || abstractStart.test(text); } /** * Classify header and footer elements */ classifyHeaderFooter($el) { const tagName = $el.prop("tagName")?.toLowerCase(); const className = $el.attr("class") || ""; const id = $el.attr("id") || ""; if (tagName === "header") { return this.isPageLevelHeader($el) ? "PageHeader" /* PAGE_HEADER */ : "Header" /* HEADER */; } if (tagName === "footer") { return this.isPageLevelFooter($el) ? "PageFooter" /* PAGE_FOOTER */ : "Footer" /* FOOTER */; } const headerClasses = /\b(header|masthead|banner)\b/i; const footerClasses = /\b(footer|contentinfo)\b/i; const pageHeaderClasses = /\b(page-header|site-header|main-header)\b/i; const pageFooterClasses = /\b(page-footer|site-footer|main-footer)\b/i; const combinedClasses = `${className} ${id}`; if (pageHeaderClasses.test(combinedClasses)) { return "PageHeader" /* PAGE_HEADER */; } if (pageFooterClasses.test(combinedClasses)) { return "PageFooter" /* PAGE_FOOTER */; } if (headerClasses.test(combinedClasses)) { return "Header" /* HEADER */; } if (footerClasses.test(combinedClasses)) { return "Footer" /* FOOTER */; } return "UncategorizedText" /* UNCATEGORIZED_TEXT */; } /** * Check if header is page-level */ isPageLevelHeader($el) { const parent = $el.parent(); const parentTag = parent.prop("tagName")?.toLowerCase(); return ["body", "main", "html"].includes(parentTag || "") || parent.hasClass("page") || parent.hasClass("container") || parent.hasClass("wrapper"); } /** * Check if footer is page-level */ isPageLevelFooter($el) { const parent = $el.parent(); const parentTag = parent.prop("tagName")?.toLowerCase(); return ["body", "main", "html"].includes(parentTag || "") || parent.hasClass("page") || parent.hasClass("container") || parent.hasClass("wrapper"); } /** * Classify element based on CSS classes */ classifyByCSS($el) { const className = $el.attr("class") || ""; const id = $el.attr("id") || ""; const combinedClasses = `${className} ${id}`.toLowerCase(); for (const { pattern, elementType } of CSS_CLASS_PATTERNS) { if (pattern.test(combinedClasses)) { return elementType; } } return "UncategorizedText" /* UNCATEGORIZED_TEXT */; } /** * Classify element based on content analysis */ classifyByContent($el) { const text = $el.text().trim(); const tagName = $el.prop("tagName")?.toLowerCase(); if (text.length < 3) { return "UncategorizedText" /* UNCATEGORIZED_TEXT */; } if (this.looksLikeTitle(text, $el)) { return "Title" /* TITLE */; } if (this.looksLikeListItem(text, $el)) { return "ListItem" /* LIST_ITEM */; } switch (tagName) { case "div": case "section": case "article": return text.length > 50 ? "NarrativeText" /* NARRATIVE_TEXT */ : "Text" /* TEXT */; case "span": return "Text" /* TEXT */; case "p": return "NarrativeText" /* NARRATIVE_TEXT */; default: return "Text" /* TEXT */; } } /** * Heuristics to determine if text looks like a title/heading */ looksLikeTitle(text, $el) { if (text.length > 100) { return false; } const hasCapitalization = /^[A-Z]/.test(text) && text === text.charAt(0).toUpperCase() + text.slice(1); const hasColonOrDash = /[:\\-–—]/.test(text); const isAllCaps = text === text.toUpperCase() && text.length > 3; const hasNumbers = /^\\d+\\.?\\s/.test(text); const parent = $el.parent(); const parentTag = parent.prop("tagName")?.toLowerCase(); const isInHeader = parentTag === "header" || parent.closest("header").length > 0; const style = $el.attr("style") || ""; const hasBoldStyle = /font-weight\\s*:\\s*(bold|[6-9]00)/i.test(style); const hasLargeFont = /font-size\\s*:\\s*([2-9]\\d|\\d{3,})px/i.test(style); return hasCapitalization || hasColonOrDash || isAllCaps || hasNumbers || isInHeader || hasBoldStyle || hasLargeFont; } /** * Heuristics to determine if text looks like a list item */ looksLikeListItem(text, $el) { const bulletPatterns = [ /^[•·▪▫‣⁃]\\s/, // Unicode bullets /^[-*+]\\s/, // ASCII bullets /^\\d+\\.\\s/, // Numbered lists /^[a-zA-Z]\\.\\s/, // Lettered lists /^\\([a-zA-Z0-9]+\\)\\s/ // Parenthetical lists ]; const startsWithBullet = bulletPatterns.some((pattern) => pattern.test(text)); const parent = $el.parent(); const parentTag = parent.prop("tagName")?.toLowerCase(); const isInList = ["ul", "ol", "dl"].includes(parentTag || ""); const siblings = parent.children().not($el); const siblingTexts = siblings.map((_, el) => cheerio__namespace.load(el).text().trim()).get(); const siblingsWithBullets = siblingTexts.filter( (siblingText) => bulletPatterns.some((pattern) => pattern.test(siblingText)) ).length; const mostSiblingsAreBullets = siblingsWithBullets > siblings.length * 0.5; return startsWithBullet || isInList || mostSiblingsAreBullets; } /** * Check if element should be treated as inline (part of parent's text) */ isInlineElement($el) { const tagName = $el.prop("tagName")?.toLowerCase(); return INLINE_TAGS.has(tagName || ""); } /** * Extract clean text from element, handling inline elements appropriately */ extractCleanText($el) { const $clone = $el.clone(); $clone.find("*").each((_, child) => { const $child = cheerio__namespace.load(child); const childEl = $child.root().children().first(); if (this.isInlineElement(childEl)) { childEl.replaceWith(childEl.text()); } }); return $clone.text().replace(/\\s+/g, " ").trim(); } }; // src/content-handlers.ts var ContentHandlers = class { /** * Extract form fields and their values */ extractFormFields($, $form) { const fields = []; $form.find("input, textarea, select").each((_, element) => { const $el = $(element); const fieldType = $el.attr("type") || $el.prop("tagName")?.toLowerCase(); const fieldName = $el.attr("name") || $el.attr("id") || $el.prev("label").text().trim() || "field"; let fieldValue = ""; switch (fieldType) { case "checkbox": case "radio": fieldValue = $el.prop("checked") ? $el.attr("value") || "checked" : "unchecked"; break; case "select": fieldValue = $el.find("option:selected").text() || $el.find("option").first().text(); break; default: fieldValue = $el.attr("value") || $el.text().trim(); } { fields.push({ fieldName, fieldValue, fieldType }); } }); $form.find("label").each((_, element) => { const $label = $(element); const forAttr = $label.attr("for"); const labelText = $label.text().trim(); if (forAttr && labelText) { const $input = $form.find(`#${forAttr}`); if ($input.length > 0) { const existingField = fields.find((f) => f.fieldName === labelText); if (!existingField) { const fieldType = $input.attr("type") || $input.prop("tagName")?.toLowerCase(); let fieldValue = ""; switch (fieldType) { case "checkbox": case "radio": fieldValue = $input.prop("checked") ? $input.attr("value") || "checked" : "unchecked"; break; default: fieldValue = $input.attr("value") || $input.text().trim(); } fields.push({ fieldName: labelText, fieldValue, fieldType }); } } } }); return fields; } /** * Extract links with metadata */ extractLinks($, $el) { const links = []; $el.find("a[href]").each((_, element) => { const $link = $(element); const href = $link.attr("href"); const text = $link.text().trim(); if (href && text) { links.push({ text, url: href, startIndex: 0 // Would need more complex text analysis to determine actual position }); } }); return links; } /** * Parse address components */ parseAddress(text) { const components = {}; const streetMatch = text.match(/(\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl))/i); if (streetMatch) { components.street = streetMatch[1].trim(); } const zipMatch = text.match(/\b(\d{5}(?:-\d{4})?)\b/); if (zipMatch) { components.zipCode = zipMatch[1]; } const stateZipMatch = text.match(/\b([A-Z]{2})\s+(\d{5}(?:-\d{4})?)\b/); if (stateZipMatch) { components.state = stateZipMatch[1]; components.zipCode = stateZipMatch[2]; } const cityMatch = text.match(/,\s*([A-Za-z\s]+),?\s*[A-Z]{2}\s*\d{5}/); if (cityMatch) { components.city = cityMatch[1].trim(); } return Object.keys(components).length > 0 ? components : void 0; } /** * Extract email addresses from text */ extractEmailAddresses(text) { const emailPattern = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g; return text.match(emailPattern) || []; } /** * Detect programming language in code blocks */ detectCodeLanguage($el) { const className = $el.attr("class") || ""; const languagePatterns = [ { pattern: /language-(\w+)/i, group: 1 }, { pattern: /lang-(\w+)/i, group: 1 }, { pattern: /highlight-(\w+)/i, group: 1 }, { pattern: /\b(javascript|js|typescript|ts|python|java|cpp|c\+\+|csharp|c#|php|ruby|go|rust|swift|kotlin|scala|html|css|sql|bash|shell|powershell|yaml|json|xml)\b/i, group: 1 } ]; for (const { pattern, group } of languagePatterns) { const match = className.match(pattern); if (match && match[group]) { return match[group].toLowerCase(); } } const dataLang = $el.attr("data-language") || $el.attr("data-lang"); if (dataLang) { return dataLang.toLowerCase(); } const text = $el.text(); return this.detectLanguageFromContent(text); } /** * Detect programming language from code content */ detectLanguageFromContent(code) { const languagePatterns = [ { language: "javascript", patterns: [/\b(function|const|let|var|=>|console\.log)\b/, /\$\(.*\)/, /require\(.*\)/] }, { language: "typescript", patterns: [/\b(interface|type|enum)\b/, /:\s*(string|number|boolean)/, /\bas\s+\w+/] }, { language: "python", patterns: [/\b(def|import|from|class|if __name__)\b/, /print\(/, /\bself\b/] }, { language: "java", patterns: [/\b(public|private|class|static|void)\b/, /System\.out\.println/, /\bString\[\]/] }, { language: "cpp", patterns: [/\b(#include|using namespace|std::)\b/, /cout\s*<</, /\bint main\b/] }, { language: "csharp", patterns: [/\b(using|namespace|public class)\b/, /Console\.WriteLine/, /\bstring\[\]/] }, { language: "php", patterns: [/<\?php/, /\$\w+/, /echo\s+/] }, { language: "ruby", patterns: [/\b(def|end|class|require)\b/, /puts\s+/, /\@\w+/] }, { language: "go", patterns: [/\b(package|import|func|var)\b/, /fmt\.Print/, /\bgo\s+\w+/] }, { language: "rust", patterns: [/\b(fn|let|mut|use|struct)\b/, /println!/, /\bSome\(|\bNone\b/] }, { language: "html", patterns: [/<\/?[a-z][\s\S]*>/i, /<!DOCTYPE/, /&\w+;/] }, { language: "css", patterns: [/\{[^}]*\}/, /\.[a-zA-Z][\w-]*\s*\{/, /@media\s+/] }, { language: "sql", patterns: [/\b(SELECT|FROM|WHERE|INSERT|UPDATE|DELETE)\b/i, /\bJOIN\b/i, /\bGROUP BY\b/i] }, { language: "bash", patterns: [/^#!/, /\$\w+/, /\becho\s+/, /\|\s*\w+/] }, { language: "json", patterns: [/^\s*\{[\s\S]*\}\s*$/, /"\w+":\s*/, /\[\s*\{/] }, { language: "xml", patterns: [/<\?xml/, /<\/\w+>/, /xmlns:/] }, { language: "yaml", patterns: [/^\s*\w+:\s*/, /^---/, /^\s*-\s+/] } ]; for (const { language, patterns } of languagePatterns) { if (patterns.some((pattern) => pattern.test(code))) { return language; } } return void 0; } /** * Extract mathematical formula type */ detectFormulaType($el) { const tagName = $el.prop("tagName")?.toLowerCase(); const text = $el.text(); if (tagName && ["math", "mrow", "mi", "mn", "mo"].includes(tagName)) { return "mathml"; } if (/\$.*\$|\\[a-zA-Z]+\{.*\}|\\begin\{.*\}/.test(text)) { return "latex"; } return "text"; } /** * Extract coordinates from element positioning */ extractCoordinates($el) { const style = $el.attr("style") || ""; const position = {}; const topMatch = style.match(/top:\s*(\d+)px/); const leftMatch = style.match(/left:\s*(\d+)px/); const widthMatch = style.match(/width:\s*(\d+)px/); const heightMatch = style.match(/height:\s*(\d+)px/); if (topMatch) position.top = parseInt(topMatch[1]); if (leftMatch) position.left = parseInt(leftMatch[1]); if (widthMatch) position.width = parseInt(widthMatch[1]); if (heightMatch) position.height = parseInt(heightMatch[1]); if (position.top !== void 0 && position.left !== void 0) { const points = [ { x: position.left, y: position.top }, { x: position.left + (position.width || 0), y: position.top }, { x: position.left + (position.width || 0), y: position.top + (position.height || 0) }, { x: position.left, y: position.top + (position.height || 0) } ]; return { points, system: { width: 1920, // Default viewport width height: 1080, // Default viewport height coordinateUnit: "pixels" } }; } return void 0; } /** * Extract emphasized text and their tags */ extractEmphasis($, $el) { const contents = []; const tags = []; const emphasisTags = ["strong", "b", "em", "i", "u", "mark", "ins", "del"]; emphasisTags.forEach((tag) => { $el.find(tag).each((_, element) => { const $emphEl = $(element); const text = $emphEl.text().trim(); if (text) { contents.push(text); tags.push(tag); } }); }); return { contents, tags }; } /** * Detect if text contains page break indicators */ isPageBreak($el) { const tagName = $el.prop("tagName")?.toLowerCase(); const className = $el.attr("class") || ""; const style = $el.attr("style") || ""; if (tagName === "hr") { return true; } const pageBreakClasses = /\b(page-break|pagebreak|new-page)\b/i; const pageBreakStyles = /page-break-(before|after):\s*(always|page)/i; return pageBreakClasses.test(className) || pageBreakStyles.test(style); } /** * Extract table structure with headers and data */ extractTableStructure($, $table) { const headers = []; const rows = []; const $thead = $table.find("thead"); if ($thead.length > 0) { $thead.find("tr").first().find("th, td").each((_, cell) => { headers.push($(cell).text().trim()); }); } else { const $firstRow = $table.find("tr").first(); const $thCells = $firstRow.find("th"); if ($thCells.length > 0) { $thCells.each((_, cell) => { headers.push($(cell).text().trim()); }); } } const $dataRows = headers.length > 0 ? $table.find("tbody tr, tr").not($table.find("thead tr")) : $table.find("tr"); $dataRows.each((_, row) => { const $row = $(row); const rowData = []; $row.find("td, th").each((_2, cell) => { rowData.push($(cell).text().trim()); }); if (rowData.length > 0) { rows.push(rowData); } }); if (headers.length === 0 && rows.length > 0) { headers.push(...rows.shift()); } return { headers: headers.length > 0 ? headers : void 0, rows }; } }; // src/partitioner.ts var DOMPartitioner = class { // For parent-child relationships constructor(options = {}) { this.elementIdMap = /* @__PURE__ */ new Map(); this.options = { skipNavigation: true, skipHeaders: false, skipFooters: false, skipForms: false, skipHeadersAndFooters: false, minTextLength: 3, maxTextLength: void 0, preserveWhitespace: false, extractTables: true, inferTableStructure: true, skipInferTableTypes: [], extractImages: true, includeImageAlt: true, extractImageBlockTypes: [], extractImageBlockToPayload: false, extractImageBlockOutputDir: void 0, extractForms: true, extractFormFields: true, extractLinks: true, languages: void 0, detectLanguagePerElement: false, includeCoordinates: false, coordinateSystem: void 0, includePageBreaks: true, maintainHierarchy: true, strategy: "auto" /* AUTO */, chunkingStrategy: "none" /* NONE */, maxCharacters: void 0, newAfterNChars: void 0, combineTextUnderNChars: void 0, includeOriginalHtml: false, includeMetadata: true, metadataFilename: void 0, uniqueElementIds: false, processAttachments: false, attachmentPartitioningStrategy: "auto" /* AUTO */, elementTypeFilters: void 0, contentFilters: void 0, includeDebugMetadata: false, detectionOrigin: void 0, ...options }; this.cleaner = new DOMCleaner(this.options); this.classifier = new ElementClassifier(); this.contentHandlers = new ContentHandlers(); } /** * Partition HTML content into structured elements */ partition(html) { const startTime = performance.now(); const warnings = []; try { const $ = cheerio__namespace.load(html, { xmlMode: false }); this.cleaner.clean($); const elements = this.extractElements($); const processingTime = Math.max(1, Math.round(performance.now() - startTime)); const elementTypeCounts = {}; let totalTextLength = 0; let tablesExtracted = 0; let imagesExtracted = 0; let formsExtracted = 0; let linksExtracted = 0; elements.forEach((element) => { elementTypeCounts[element.type] = (elementTypeCounts[element.type] || 0) + 1; totalTextLength += element.text.length; if (element.type === "Table" /* TABLE */) tablesExtracted++; if (["Image" /* IMAGE */, "Picture" /* PICTURE */, "Figure" /* FIGURE */].includes(element.type)) imagesExtracted++; if (element.type === "Form" /* FORM */) formsExtracted++; if (element.type === "Link" /* LINK */) linksExtracted++; }); return { elements, metadata: { totalElements: elements.length, processingTime, warnings: warnings.length > 0 ? warnings : void 0, errors: void 0, elementTypeCounts, averageElementLength: elements.length > 0 ? Math.round(totalTextLength / elements.length) : 0, tablesExtracted, imagesExtracted, formsExtracted, linksExtracted, detectedLanguages: this.options.languages } }; } catch (error) { warnings.push(`Partitioning error: ${error instanceof Error ? error.message : "Unknown error"}`); return { elements: [], metadata: { totalElements: 0, processingTime: Math.max(1, Math.round(performance.now() - startTime)), warnings } }; } } /** * Extract structured elements from cleaned DOM */ extractElements($) { const elements = []; const processedElements = /* @__PURE__ */ new Set(); if ($("body").length > 0) { $("body").find("*").each((_, element) => { this.processElement($, element, processedElements, elements); }); } else { $("*").each((_, element) => { this.processElement($, element, processedElements, elements); }); } return elements; } processElement($, element, processedElements, elements) { if (processedElements.has(element)) { return; } const $el = $(element); const elementType = this.classifier.classifyElement($el); if (elementType === "UncategorizedText" /* UNCATEGORIZED_TEXT */) { processedElements.add(element); return; } let extractedElement = null; switch (elementType) { case "Table" /* TABLE */: extractedElement = this.extractTable($, $el); if (extractedElement) { $el.find("*").each((_, child) => { processedElements.add(child); }); } break; case "Image" /* IMAGE */: case "Picture" /* PICTURE */: case "Figure" /* FIGURE */: extractedElement = this.extractImage($, $el); break; case "Form" /* FORM */: if (this.options.extractForms) { extractedElement = this.extractForm($, $el); } break; case "CheckBoxChecked" /* CHECK_BOX_CHECKED */: case "CheckBoxUnchecked" /* CHECK_BOX_UNCHECKED */: extractedElement = this.extractCheckBox($, $el); break; case "RadioButtonChecked" /* RADIO_BUTTON_CHECKED */: case "RadioButtonUnchecked" /* RADIO_BUTTON_UNCHECKED */: extractedElement = this.extractRadioButton($, $el); break; case "Value" /* VALUE */: extractedElement = this.extractValue($, $el); break; case "Link" /* LINK */: if (this.options.extractLinks) { extractedElement = this.extractLink($, $el); } break; // ADDRESS and EMAIL_ADDRESS cases removed - caused false positives case "CodeSnippet" /* CODE_SNIPPET */: extractedElement = this.extractCode($, $el); break; case "Formula" /* FORMULA */: extractedElement = this.extractFormula($, $el); break; case "PageBreak" /* PAGE_BREAK */: if (this.options.includePageBreaks) { extractedElement = this.extractPageBreak($, $el); } break; default: extractedElement = this.extractTextElement($, $el, elementType); break; } processedElements.add(element); if (extractedElement) { elements.push(extractedElement); } } /** * Extract a text-based element */ extractTextElement($, $el, elementType) { const text = this.classifier.extractCleanText($el); if (text.length < this.options.minTextLength) { return null; } if (this.options.maxTextLength && text.length > this.options.maxTextLength) { return null; } const metadata = this.extractMetadata($, $el); return { id: uuid.v4(), type: elementType, text, metadata }; } /** * Extract table element with structure */ extractTable($, $el) { if (!this.options.extractTables) { return null; } if (this.isLayoutTable($, $el)) { return null; } let rows = []; let headers; const $thead = $el.find("thead tr").first(); if ($thead.length > 0) { headers = $thead.find("th, td").map((_, cell) => { return $(cell).text().trim(); }).get(); } else { const $firstRow = $el.find("tr").first(); if ($firstRow.length > 0 && $firstRow.find("th").length > 0) { headers = $firstRow.find("th, td").map((_, cell) => { return $(cell).text().trim(); }).get(); } } const $rows = $el.find("tbody tr, tr").filter((_, row) => { const isInThead = $(row).closest("thead").length > 0; const hasThElements = $(row).find("th").length > 0; return !headers || !isInThead && !hasThElements; }); $rows.each((_, row) => { const $row = $(row); const cells = $row.find("td, th").map((_2, cell) => { return $(cell).text().trim(); }).get(); if (cells.length > 0) { rows.push(cells); } }); if (!headers && rows.length > 1) { const firstRow = rows[0]; const secondRow = rows[1]; const firstRowHasAlpha = firstRow.some((cell) => /[a-zA-Z]/.test(cell)); const secondRowHasNumbers = secondRow.some((cell) => /\d/.test(cell)); if (firstRowHasAlpha && secondRowHasNumbers) { headers = firstRow; rows.shift(); } } if (headers && headers.length > 0) { const normalizedRows = rows.map((row) => { const normalizedRow = [...row]; while (normalizedRow.length < headers.length) { normalizedRow.push(""); } return normalizedRow.slice(0, headers.length); }); rows = normalizedRows; } const text = this.generateTableText(rows, headers); const metadata = this.extractMetadata($, $el); return { id: uuid.v4(), type: "Table" /* TABLE */, text, metadata, rows, headers: headers || [] }; } /** * Detect if a table is used for layout rather than data * * Note: False positives (data tables classified as layout) lose tabular structure * but preserve all content as individual elements. False negatives (layout tables * treated as data) cause massive duplication and unusable output. */ isLayoutTable($, $el) { const hasHeaders = $el.find("th").length > 0 || $el.find("thead").length > 0; const hasComplexLayout = $el.html()?.includes("colspan") || $el.html()?.includes("rowspan"); const hasLayoutAttributes = $el.attr("cellpadding") || $el.attr("cellspacing") || $el.attr("border"); const rows = $el.find("tr"); if (rows.length === 0) return true; const cellCounts = []; rows.each((_, row) => { const cellCount = $(row).find("td, th").length; cellCounts.push(cellCount); }); const uniqueCellCounts = [...new Set(cellCounts)]; const hasInconsistentColumns = uniqueCellCounts.length > 3; const totalCells = $el.find("td, th").length; const cellsWithLinks = $el.find("td a, th a").length; const cellsWithImages = $el.find("td img, th img").length; const cellsWithForms = $el.find("td form, th form, td input, th input").length; const nonTabularContent = cellsWithLinks + cellsWithImages + cellsWithForms; const hasHighNonTabularRatio = totalCells > 0 && nonTabularContent / totalCells > 0.3; if (hasLayoutAttributes && !hasHeaders && hasInconsistentColumns) { return true; } if (!hasHeaders && hasComplexLayout && hasHighNonTabularRatio) { return true; } if (!hasHeaders && hasInconsistentColumns && hasHighNonTabularRatio && rows.length > 20) { return true; } return false; } /** * Extract image element */ extractImage($, $el) { if (!this.options.extractImages) { return null; } const src = $el.attr("src"); const alt = $el.attr("alt") || ""; const width = parseInt($el.attr("width") || "0") || void 0; const height = parseInt($el.attr("height") || "0") || void 0; const text = this.options.includeImageAlt ? alt : ""; const metadata = this.extractMetadata($, $el); return { id: uuid.v4(), type: "Image" /* IMAGE */, text, metadata, src, alt, width, height }; } /** * Extract metadata from DOM element */ extractMetadata($, $el) { const tagName = $el.prop("tagName")?.toLowerCase(); const classAttr = $el.attr("class"); const cssClasses = classAttr ? classAttr.split(/\s+/).filter(Boolean) : void 0; const elementId = $el.attr("id"); const text = $el.text(); const metadata = { tagName, cssClasses: cssClasses && cssClasses.length > 0 ? cssClasses : void 0, elementId: elementId && elementId.trim() !== "" ? elementId : void 0, textLength: text.length }; if (this.options.extractLinks) { const links = this.contentHandlers.extractLinks($, $el); if (links.length > 0) { metadata.links = links; metadata.linkTexts = links.map((link) => link.text); metadata.linkUrls = links.map((link) => link.url); } } const emphasis = this.contentHandlers.extractEmphasis($, $el); if (emphasis.contents.length > 0) { metadata.emphasizedTextContents = emphasis.contents; metadata.emphasizedTextTags = emphasis.tags; } if (this.options.includeCoordinates) { const coordinates = this.contentHandlers.extractCoordinates($el); if (coordinates) { metadata.coordinates = coordinates; } } if (this.options.includeOriginalHtml) { metadata.originalHtml = $.html($el) || void 0; } return metadata; } /** * Extract form element with fields */ extractForm($, $el) { const fields = this.options.extractFormFields ? this.contentHandlers.extractFormFields($, $el) : []; const text = fields.length > 0 ? fields.map((f) => `${f.fieldName}: ${f.fieldValue}`).join("; ") : $el.text().trim() || "Form"; const metadata = this.extractMetadata($, $el); return { id: uuid.v4(), type: "Form" /* FORM */, text, metadata, fields: this.options.extractFormFields ? fields : void 0 }; } /** * Extract checkbox element */ extractCheckBox($, $el) { const checked = Boolean($el.prop("checked")); const value = $el.attr("value") || ""; const label = $el.prev("label").text().trim() || $el.next("label").text().trim() || ""; const text = label || (checked ? "checked" : "unchecked"); const metadata = this.extractMetadata($, $el); return { id: uuid.v4(), type: checked ? "CheckBoxChecked" /* CHECK_BOX_CHECKED */ : "CheckBoxUnchecked" /* CHECK_BOX_UNCHECKED */, text, metadata, checked, value: value || void 0 }; } /** * Extract radio button element */ extractRadioButton($, $el) { const checked = Boolean($el.prop("checked")); const value