magnitude-extract
Version:
TypeScript DOM cleaning and structuring library
1,460 lines (1,453 loc) • 70.8 kB
JavaScript
#!/usr/bin/env node
'use strict';
var cheerio = require('cheerio');
var uuid = require('uuid');
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
/**
 * Wrap a CommonJS export in a frozen, prototype-less namespace object.
 *
 * Values that already carry `__esModule` are returned untouched. Otherwise
 * every own enumerable key except `default` is re-exposed on the namespace
 * (existing getters are reused, plain values are forwarded through a live
 * getter), and the original value itself is stored under `default`.
 *
 * @param {*} e The value returned by `require`.
 * @returns {*} `e` itself, or a frozen namespace wrapping it.
 */
function _interopNamespace(e) {
  if (e && e.__esModule) {
    return e;
  }
  var namespace = Object.create(null);
  if (e) {
    for (const key of Object.keys(e)) {
      if (key === 'default') {
        continue;
      }
      const descriptor = Object.getOwnPropertyDescriptor(e, key);
      // Reuse accessor descriptors as-is; wrap data properties in a live getter.
      const forwarded = descriptor.get
        ? descriptor
        : { enumerable: true, get: () => e[key] };
      Object.defineProperty(namespace, key, forwarded);
    }
  }
  namespace.default = e;
  return Object.freeze(namespace);
}
var cheerio__namespace = /*#__PURE__*/_interopNamespace(cheerio);
// src/mappings.ts
/**
 * Lookup table from lowercase HTML/MathML tag name to the element-type string
 * assigned to that tag. The block comment after each value is the name of the
 * enum member the string was compiled from.
 *
 * Note: several explanatory comments sit on the line AFTER the entry they
 * describe (e.g. "Document title" refers to "title", "Will be refined by
 * content analysis" refers to "div").
 */
var TAG_TO_ELEMENT_TYPE = {
// Headings and titles
"h1": "Title" /* TITLE */,
"h2": "Title" /* TITLE */,
"h3": "Title" /* TITLE */,
"h4": "Title" /* TITLE */,
"h5": "Title" /* TITLE */,
"h6": "Title" /* TITLE */,
"title": "Title" /* TITLE */,
// Document title
// Text content
"p": "NarrativeText" /* NARRATIVE_TEXT */,
"div": "Text" /* TEXT */,
// Will be refined by content analysis
"span": "Text" /* TEXT */,
"article": "NarrativeText" /* NARRATIVE_TEXT */,
"section": "NarrativeText" /* NARRATIVE_TEXT */,
"main": "NarrativeText" /* NARRATIVE_TEXT */,
"aside": "Text" /* TEXT */,
// Specialized text elements
"blockquote": "NarrativeText" /* NARRATIVE_TEXT */,
"q": "NarrativeText" /* NARRATIVE_TEXT */,
"cite": "NarrativeText" /* NARRATIVE_TEXT */,
"abbr": "Text" /* TEXT */,
"acronym": "Text" /* TEXT */,
"dfn": "Text" /* TEXT */,
"time": "Text" /* TEXT */,
// Lists
"li": "ListItem" /* LIST_ITEM */,
"ul": "List" /* LIST */,
"ol": "List" /* LIST */,
"dl": "List" /* LIST */,
"dt": "ListItem" /* LIST_ITEM */,
"dd": "ListItem" /* LIST_ITEM */,
// Tables
"table": "Table" /* TABLE */,
"thead": "Table" /* TABLE */,
"tbody": "Table" /* TABLE */,
"tfoot": "Table" /* TABLE */,
"tr": "Table" /* TABLE */,
"td": "Table" /* TABLE */,
"th": "Table" /* TABLE */,
"caption": "Caption" /* CAPTION */,
"colgroup": "Table" /* TABLE */,
"col": "Table" /* TABLE */,
// Media elements
"img": "Image" /* IMAGE */,
"figure": "Figure" /* FIGURE */,
"picture": "Picture" /* PICTURE */,
"figcaption": "FigureCaption" /* FIGURE_CAPTION */,
"video": "Image" /* IMAGE */,
// Treat as media element
"audio": "Image" /* IMAGE */,
// Treat as media element
"canvas": "Image" /* IMAGE */,
"svg": "Image" /* IMAGE */,
// Code elements
"code": "CodeSnippet" /* CODE_SNIPPET */,
"pre": "CodeSnippet" /* CODE_SNIPPET */,
"kbd": "CodeSnippet" /* CODE_SNIPPET */,
"samp": "CodeSnippet" /* CODE_SNIPPET */,
"var": "CodeSnippet" /* CODE_SNIPPET */,
// Navigation (usually filtered)
"nav": "Navigation" /* NAVIGATION */,
"menu": "Navigation" /* NAVIGATION */,
"menuitem": "Navigation" /* NAVIGATION */,
// Headers/Footers
"header": "Header" /* HEADER */,
"footer": "Footer" /* FOOTER */,
// Forms - only form itself should be mapped directly, others handled by specialized logic
"form": "Form" /* FORM */,
"label": "FieldName" /* FIELD_NAME */,
"legend": "FieldName" /* FIELD_NAME */,
"option": "Value" /* VALUE */,
"output": "Value" /* VALUE */,
"progress": "Value" /* VALUE */,
"meter": "Value" /* VALUE */,
// Contact information (removed - caused false positives)
// Links
"a": "Link" /* LINK */,
// Document structure
"hr": "PageBreak" /* PAGE_BREAK */,
"br": "Text" /* TEXT */,
// Line break, usually part of text
// Mathematical content
"math": "Formula" /* FORMULA */,
"mrow": "Formula" /* FORMULA */,
"mi": "Formula" /* FORMULA */,
"mn": "Formula" /* FORMULA */,
"mo": "Formula" /* FORMULA */,
"mfrac": "Formula" /* FORMULA */,
"msup": "Formula" /* FORMULA */,
"msub": "Formula" /* FORMULA */,
"msubsup": "Formula" /* FORMULA */,
"munder": "Formula" /* FORMULA */,
"mover": "Formula" /* FORMULA */,
"munderover": "Formula" /* FORMULA */,
"msqrt": "Formula" /* FORMULA */,
"mroot": "Formula" /* FORMULA */,
"mtext": "Formula" /* FORMULA */,
"mspace": "Formula" /* FORMULA */,
"mstyle": "Formula" /* FORMULA */,
"merror": "Formula" /* FORMULA */,
"mpadded": "Formula" /* FORMULA */,
"mphantom": "Formula" /* FORMULA */,
"mfenced": "Formula" /* FORMULA */,
"menclose": "Formula" /* FORMULA */,
"mtable": "Formula" /* FORMULA */,
"mtr": "Formula" /* FORMULA */,
"mtd": "Formula" /* FORMULA */,
"maligngroup": "Formula" /* FORMULA */,
"malignmark": "Formula" /* FORMULA */,
"mlabeledtr": "Formula" /* FORMULA */,
"maction": "Formula" /* FORMULA */,
"semantics": "Formula" /* FORMULA */,
"annotation": "Formula" /* FORMULA */,
"annotation-xml": "Formula" /* FORMULA */
};
/**
 * Ordered list of `{ pattern, elementType }` rules matched against an
 * element's class/id text. Consumers take the FIRST rule whose pattern
 * matches, so order is significant: a hyphen counts as a word boundary, which
 * means a generic keyword such as `header` also matches inside
 * `section-header`. Compound (more specific) names are therefore listed
 * before the generic keyword they contain; otherwise they could never win.
 */
var CSS_CLASS_PATTERNS = [
  // Navigation patterns
  { pattern: /\b(nav|menu|breadcrumb|sidebar|navigation|navbar|menubar)\b/i, elementType: "Navigation" /* NAVIGATION */ },
  // Compound header names - must precede the generic "header" rule below
  { pattern: /\b(section-header|content-header)\b/i, elementType: "SectionHeader" /* SECTION_HEADER */ },
  { pattern: /\bheader-text\b/i, elementType: "Title" /* TITLE */ },
  // Header/Footer patterns
  { pattern: /\b(header|masthead|banner|site-header|page-header|main-header)\b/i, elementType: "PageHeader" /* PAGE_HEADER */ },
  { pattern: /\b(footer|copyright|legal|site-footer|page-footer|main-footer)\b/i, elementType: "PageFooter" /* PAGE_FOOTER */ },
  // Sub-heading patterns - must precede "title"/"heading" ("sub-title" contains "title")
  { pattern: /\b(subtitle|subheading|sub-title|sub-heading)\b/i, elementType: "Subheadline" /* SUB_HEADLINE */ },
  // Title and heading patterns
  { pattern: /\b(title|heading|headline|h[1-6]|header-text)\b/i, elementType: "Title" /* TITLE */ },
  // Paragraph patterns - must precede the content rule ("text-block" contains "text")
  { pattern: /\b(paragraph|para|text-block)\b/i, elementType: "Paragraph" /* PARAGRAPH */ },
  // Content patterns
  { pattern: /\b(content|article|post|story|narrative|text|body|main-content)\b/i, elementType: "NarrativeText" /* NARRATIVE_TEXT */ },
  { pattern: /\b(abstract|summary|synopsis)\b/i, elementType: "Abstract" /* ABSTRACT */ },
  // List patterns
  { pattern: /\b(list|item|bullet|numbered|ordered|unordered)\b/i, elementType: "ListItem" /* LIST_ITEM */ },
  // Form patterns - only match actual form containers, not styling divs
  { pattern: /\b(form-container|form-wrapper|contact-form|login-form|signup-form)\b/i, elementType: "Form" /* FORM */ },
  { pattern: /\b(label|field-name|form-label)\b/i, elementType: "FieldName" /* FIELD_NAME */ },
  { pattern: /\b(value|field-value|input-value)\b/i, elementType: "Value" /* VALUE */ },
  // Table patterns
  { pattern: /\b(table|grid|data|tabular|spreadsheet)\b/i, elementType: "Table" /* TABLE */ },
  // Caption patterns - must precede the media rule ("img-caption" contains "img")
  { pattern: /\b(caption|img-caption|figure-caption|photo-caption)\b/i, elementType: "FigureCaption" /* FIGURE_CAPTION */ },
  // Media patterns
  { pattern: /\b(image|img|picture|photo|figure|media)\b/i, elementType: "Image" /* IMAGE */ },
  // Code patterns
  { pattern: /\b(code|highlight|syntax|language-|hljs|prettyprint|source|snippet)\b/i, elementType: "CodeSnippet" /* CODE_SNIPPET */ },
  // Address and email patterns removed - caused false positives
  // Mathematical patterns
  { pattern: /\b(math|latex|katex|mathjax|formula|equation)\b/i, elementType: "Formula" /* FORMULA */ },
  // Footnote patterns
  { pattern: /\b(footnote|endnote|note|reference)\b/i, elementType: "Footnote" /* FOOTNOTE */ },
  // Page number patterns
  { pattern: /\b(page-number|pagination|page-info)\b/i, elementType: "PageNumber" /* PAGE_NUMBER */ },
  // Link patterns
  { pattern: /\b(link|hyperlink|url|href)\b/i, elementType: "Link" /* LINK */ }
];
// Tag names that DOMCleaner.removeIgnoredTags deletes outright.
// The original note attached to "iframe" reads: "we expand these
// automatically anyway before processing".
var IGNORED_TAGS = /* @__PURE__ */ new Set(
  ["script", "style", "meta", "link", "noscript", "iframe", "object", "embed", "applet"]
);
// Tag names removed wholesale when navigation is skipped.
// "aside" is included because it is often used for sidebars.
var NAVIGATION_TAGS = /* @__PURE__ */ new Set(["nav", "menu", "menuitem", "aside"]);
// Tag names treated as inline, i.e. as part of the surrounding text
// (see ElementClassifier.isInlineElement).
var INLINE_TAGS = /* @__PURE__ */ new Set([
  // Links and generic wrappers
  "a", "span",
  // Emphasis
  "strong", "b", "em", "i", "u", "mark",
  // Code-like phrases
  "code", "kbd", "samp", "var",
  // Size / position tweaks
  "small", "sub", "sup"
]);
// src/cleaner.ts
/**
 * Strips non-content markup from a loaded cheerio document in place.
 *
 * Options read by this class: `skipNavigation`, `skipHeaders`, `skipFooters`,
 * `skipHeadersAndFooters`, `skipForms`, `preserveWhitespace`,
 * `minTextLength`, `extractImages`, `extractTables`.
 */
var DOMCleaner = class {
  /**
   * @param {object} [options] Cleaning flags (see class description).
   */
  constructor(options = {}) {
    this.options = options;
  }
  /**
   * Clean and normalize HTML content.
   *
   * @param {Function} $ Cheerio root function for the document; mutated in place.
   * @returns {Function} The same `$`, for chaining.
   */
  clean($) {
    const options = this.options;
    this.removeIgnoredTags($);
    if (options.skipNavigation) {
      this.removeNavigationElements($);
    }
    // `skipHeadersAndFooters` is shorthand for both individual flags.
    if (options.skipHeaders || options.skipHeadersAndFooters) {
      this.removeHeaders($);
    }
    if (options.skipFooters || options.skipHeadersAndFooters) {
      this.removeFooters($);
    }
    if (options.skipForms) {
      this.removeForms($);
    }
    this.normalizeWhitespace($);
    this.removeEmptyElements($);
    return $;
  }
  /**
   * Remove script, style, and other ignored tags, plus all HTML comments.
   */
  removeIgnoredTags($) {
    IGNORED_TAGS.forEach((tag) => {
      $(tag).remove();
    });
    $("*").contents().filter((_, node) => node.type === "comment").remove();
  }
  /**
   * Remove navigation elements: navigation tags plus anything whose
   * class/id/role suggests navigation (substring match on class and id).
   */
  removeNavigationElements($) {
    NAVIGATION_TAGS.forEach((tag) => {
      $(tag).remove();
    });
    const navSelectors = [
      '[class*="nav"]',
      '[class*="menu"]',
      '[class*="breadcrumb"]',
      '[class*="sidebar"]',
      '[id*="nav"]',
      '[id*="menu"]',
      '[role="navigation"]',
      '[role="menu"]',
      '[role="menubar"]'
    ];
    navSelectors.forEach((selector) => {
      $(selector).remove();
    });
  }
  /**
   * Remove header elements.
   */
  removeHeaders($) {
    const headerSelectors = [
      "header",
      '[class*="header"]',
      '[class*="masthead"]',
      '[class*="banner"]',
      '[id*="header"]',
      '[role="banner"]'
    ];
    headerSelectors.forEach((selector) => {
      $(selector).remove();
    });
  }
  /**
   * Remove footer elements.
   */
  removeFooters($) {
    const footerSelectors = [
      "footer",
      '[class*="footer"]',
      '[class*="copyright"]',
      '[class*="legal"]',
      '[id*="footer"]',
      '[role="contentinfo"]'
    ];
    footerSelectors.forEach((selector) => {
      $(selector).remove();
    });
  }
  /**
   * Remove form elements.
   */
  removeForms($) {
    $("form, input, button, select, textarea").remove();
  }
  /**
   * Collapse every run of whitespace in text nodes to a single space.
   * No-op when `preserveWhitespace` is set.
   */
  normalizeWhitespace($) {
    if (this.options.preserveWhitespace) {
      return;
    }
    $("*").contents().filter((_, node) => node.type === "text").each((_, node) => {
      if ("data" in node && node.data) {
        // The previous pattern was /\\s+/g, which matches a literal backslash
        // followed by "s" and therefore never collapsed any whitespace.
        node.data = node.data.replace(/\s+/g, " ");
      }
    });
  }
  /**
   * Remove empty elements that don't contribute content.
   *
   * An element is kept when it has enough text, is a heading or table cell,
   * carries images/tables/form controls that the options ask to keep, or
   * contains any significant descendant (see hasSignificantChildren).
   */
  removeEmptyElements($) {
    const { extractImages, extractTables, skipForms } = this.options;
    // `||` (not `??`) is kept on purpose: a minTextLength of 0 falls back to 3.
    const minTextLength = this.options.minTextLength || 3;
    $("*").each((_, element) => {
      const $el = $(element);
      const tagName = element.tagName;
      const hasEnoughText = $el.text().trim().length >= minTextLength;
      const isHeadingOrCell = /^h[1-6]$/i.test(tagName) || /^(td|th)$/i.test(tagName);
      const keepsImage = extractImages && (tagName === "img" || $el.find("img").length > 0);
      const keepsTable = extractTables && (tagName === "table" || $el.find("table").length > 0);
      const keepsForm = !skipForms && (
        /^(input|button|select|textarea|form|label|fieldset|legend)$/i.test(tagName) ||
        $el.find("input, button, select, textarea").length > 0
      );
      const shouldKeep = hasEnoughText || isHeadingOrCell || keepsImage || keepsTable || keepsForm;
      if (!shouldKeep && !this.hasSignificantChildren($el)) {
        $el.remove();
      }
    });
  }
  /**
   * Check if element has descendants that should be preserved.
   */
  hasSignificantChildren($el) {
    const significantTags = ["img", "table", "h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "form", "input", "textarea", "select", "button", "td", "th", "tr"];
    return $el.find(significantTags.join(", ")).length > 0;
  }
};
/**
 * Assigns an element-type string to a cheerio-wrapped element using, in
 * order: its tag, specialised checks (forms, code, page numbers, abstracts,
 * headers/footers), class/id patterns, and finally text heuristics.
 */
var ElementClassifier = class {
  /**
   * Classify a DOM element based on its tag, attributes, and content.
   *
   * @param {object} $el Cheerio selection wrapping one element.
   * @returns {string} Element-type string; "UncategorizedText" when unknown.
   */
  classifyElement($el) {
    const tagName = $el.prop("tagName")?.toLowerCase();
    if (!tagName) {
      return "UncategorizedText" /* UNCATEGORIZED_TEXT */;
    }
    const tagType = TAG_TO_ELEMENT_TYPE[tagName];
    // Tables and forms are decided by tag alone, before any heuristic runs.
    if (tagType && ["table", "form"].includes(tagName)) {
      return tagType;
    }
    const specializedType = this.classifySpecializedElement($el);
    if (specializedType !== "UncategorizedText" /* UNCATEGORIZED_TEXT */) {
      return specializedType;
    }
    // Generic "Text" tags (div, span, ...) fall through to class/content analysis.
    if (tagType && tagType !== "Text" /* TEXT */) {
      return tagType;
    }
    const classType = this.classifyByCSS($el);
    if (classType !== "UncategorizedText" /* UNCATEGORIZED_TEXT */) {
      return classType;
    }
    return this.classifyByContent($el);
  }
  /**
   * Classify specialized elements: form controls, code, page numbers,
   * abstracts, then headers/footers. Returns "UncategorizedText" when none
   * of the checks apply.
   */
  classifySpecializedElement($el) {
    if (this.isFormElement($el)) {
      return this.classifyFormElement($el);
    }
    if (this.isCodeElement($el)) {
      return "CodeSnippet" /* CODE_SNIPPET */;
    }
    if (this.isPageNumberElement($el)) {
      return "PageNumber" /* PAGE_NUMBER */;
    }
    if (this.isAbstractElement($el)) {
      return "Abstract" /* ABSTRACT */;
    }
    return this.classifyHeaderFooter($el);
  }
  /**
   * Check if element is a form-related element.
   */
  isFormElement($el) {
    const tagName = $el.prop("tagName")?.toLowerCase();
    const formTags = ["form", "input", "textarea", "select", "button", "fieldset", "legend", "label"];
    // Every <input> is already covered by the tag list, whatever its type.
    return formTags.includes(tagName || "");
  }
  /**
   * Classify form elements into specific types.
   */
  classifyFormElement($el) {
    const tagName = $el.prop("tagName")?.toLowerCase();
    const type = $el.attr("type")?.toLowerCase();
    const checked = $el.prop("checked");
    switch (tagName) {
      case "form":
        return "Form" /* FORM */;
      case "input":
        switch (type) {
          case "checkbox":
            return checked ? "CheckBoxChecked" /* CHECK_BOX_CHECKED */ : "CheckBoxUnchecked" /* CHECK_BOX_UNCHECKED */;
          case "radio":
            return checked ? "RadioButtonChecked" /* RADIO_BUTTON_CHECKED */ : "RadioButtonUnchecked" /* RADIO_BUTTON_UNCHECKED */;
          default:
            return "Value" /* VALUE */;
        }
      case "label":
      case "legend":
        return "FieldName" /* FIELD_NAME */;
      case "textarea":
      case "select":
      case "button":
        return "Value" /* VALUE */;
      case "fieldset":
      default:
        return "Text" /* TEXT */;
    }
  }
  /**
   * Check if element contains code.
   */
  isCodeElement($el) {
    const tagName = $el.prop("tagName")?.toLowerCase();
    const className = $el.attr("class") || "";
    const codeTags = ["code", "pre", "kbd", "samp", "var"];
    const codeClasses = /\b(code|highlight|syntax|language-|hljs|prettyprint)\b/i;
    return codeTags.includes(tagName || "") || codeClasses.test(className);
  }
  /**
   * Check if element contains an address.
   */
  isAddressElement($el) {
    const tagName = $el.prop("tagName")?.toLowerCase();
    const text = $el.text().trim();
    const className = $el.attr("class") || "";
    if (tagName === "address") {
      return true;
    }
    const addressClasses = /\b(address|location|postal)\b/i;
    const formClasses = /\b(form|contact-form|login-form|signup-form)\b/i;
    if (addressClasses.test(className) && !formClasses.test(className)) {
      return true;
    }
    if (/\bcontact\b/i.test(className) && !/form/i.test(className)) {
      return true;
    }
    const addressPatterns = [
      /\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl)/i,
      // ZIP codes
      /\b\d{5}(?:-\d{4})?\b/,
      // State + ZIP
      /\b[A-Z]{2}\s+\d{5}\b/
    ];
    return addressPatterns.some((pattern) => pattern.test(text));
  }
  /**
   * Check if element contains an email address.
   */
  isEmailAddressElement($el) {
    const text = $el.text().trim();
    // Previously written with doubled backslashes (which required literal
    // "\b" text) and with "[A-Z|a-z]", which accepted "|" in the TLD.
    const emailPattern = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/;
    return emailPattern.test(text);
  }
  /**
   * Check if element contains a mathematical formula.
   */
  isFormulaElement($el) {
    const tagName = $el.prop("tagName")?.toLowerCase();
    const className = $el.attr("class") || "";
    const text = $el.text().trim();
    const mathTags = ["math", "mrow", "mi", "mn", "mo", "mfrac", "msup", "msub"];
    if (mathTags.includes(tagName || "")) {
      return true;
    }
    const mathClasses = /\b(math|latex|katex|mathjax|formula|equation)\b/i;
    if (mathClasses.test(className)) {
      return true;
    }
    const mathPatterns = [
      /[∑∏∫∂∇∆√∞±≤≥≠≈∈∉⊂⊃∪∩]/,
      // LaTeX delimiters
      /\$.*\$/,
      // LaTeX commands (the leading "\\" is an intentional literal backslash)
      /\\[a-zA-Z]+\{.*\}/
    ];
    return mathPatterns.some((pattern) => pattern.test(text));
  }
  /**
   * Check if element is a caption.
   */
  isCaptionElement($el) {
    const tagName = $el.prop("tagName")?.toLowerCase();
    const className = $el.attr("class") || "";
    const captionTags = ["caption", "figcaption"];
    const captionClasses = /\b(caption|figure-caption|img-caption)\b/i;
    return captionTags.includes(tagName || "") || captionClasses.test(className);
  }
  /**
   * Classify caption elements: figure captions when the element is a
   * <figcaption>, sits directly in a <figure>, or its parent contains an image.
   */
  classifyCaptionElement($el) {
    const tagName = $el.prop("tagName")?.toLowerCase();
    if (tagName === "figcaption") {
      return "FigureCaption" /* FIGURE_CAPTION */;
    }
    const parent = $el.parent();
    const parentTag = parent.prop("tagName")?.toLowerCase();
    if (parentTag === "figure" || parent.find("img").length > 0) {
      return "FigureCaption" /* FIGURE_CAPTION */;
    }
    return "Caption" /* CAPTION */;
  }
  /**
   * Check if element is a footnote.
   */
  isFootnoteElement($el) {
    const className = $el.attr("class") || "";
    const id = $el.attr("id") || "";
    const text = $el.text().trim();
    const footnoteClasses = /\b(footnote|endnote|note)\b/i;
    const footnoteIds = /\b(fn|footnote|note)-?\d+\b/i;
    // NOTE(review): the middle alternative "\(\d+\)" is not anchored, so "(2)"
    // anywhere in the text matches - confirm whether that is intended.
    const footnotePatterns = /^\[\d+\]|\(\d+\)|^\d+\./;
    return footnoteClasses.test(className) || footnoteIds.test(id) || footnotePatterns.test(text);
  }
  /**
   * Check if element contains a page number.
   */
  isPageNumberElement($el) {
    const text = $el.text().trim();
    const className = $el.attr("class") || "";
    const pageClasses = /\b(page-?number|pagination)\b/i;
    const pagePatterns = [
      /^Page\s+\d+$/i,
      /^\d+\s+of\s+\d+$/i,
      /^\d+\s*\/\s*\d+$/,
      // Bare number, optionally wrapped in dashes (also matches any numeric-only text)
      /^-?\s*\d+\s*-?$/
    ];
    return pageClasses.test(className) || pagePatterns.some((pattern) => pattern.test(text));
  }
  /**
   * Check if element is an abstract.
   */
  isAbstractElement($el) {
    const className = $el.attr("class") || "";
    const id = $el.attr("id") || "";
    const text = $el.text().trim();
    const abstractClasses = /\b(abstract|summary)\b/i;
    const abstractIds = /\babstract\b/i;
    const abstractStart = /^abstract\b/i;
    return abstractClasses.test(className) || abstractIds.test(id) || abstractStart.test(text);
  }
  /**
   * Classify header and footer elements by tag first, then by class/id.
   * Page-level class names are tested before the generic ones.
   */
  classifyHeaderFooter($el) {
    const tagName = $el.prop("tagName")?.toLowerCase();
    const className = $el.attr("class") || "";
    const id = $el.attr("id") || "";
    if (tagName === "header") {
      return this.isPageLevelHeader($el) ? "PageHeader" /* PAGE_HEADER */ : "Header" /* HEADER */;
    }
    if (tagName === "footer") {
      return this.isPageLevelFooter($el) ? "PageFooter" /* PAGE_FOOTER */ : "Footer" /* FOOTER */;
    }
    const headerClasses = /\b(header|masthead|banner)\b/i;
    const footerClasses = /\b(footer|contentinfo)\b/i;
    const pageHeaderClasses = /\b(page-header|site-header|main-header)\b/i;
    const pageFooterClasses = /\b(page-footer|site-footer|main-footer)\b/i;
    const combinedClasses = `${className} ${id}`;
    if (pageHeaderClasses.test(combinedClasses)) {
      return "PageHeader" /* PAGE_HEADER */;
    }
    if (pageFooterClasses.test(combinedClasses)) {
      return "PageFooter" /* PAGE_FOOTER */;
    }
    if (headerClasses.test(combinedClasses)) {
      return "Header" /* HEADER */;
    }
    if (footerClasses.test(combinedClasses)) {
      return "Footer" /* FOOTER */;
    }
    return "UncategorizedText" /* UNCATEGORIZED_TEXT */;
  }
  /**
   * Check if header is page-level (direct child of body/main/html or of a
   * "page"/"container"/"wrapper" element).
   */
  isPageLevelHeader($el) {
    const parent = $el.parent();
    const parentTag = parent.prop("tagName")?.toLowerCase();
    return ["body", "main", "html"].includes(parentTag || "") || parent.hasClass("page") || parent.hasClass("container") || parent.hasClass("wrapper");
  }
  /**
   * Check if footer is page-level (same rule as isPageLevelHeader).
   */
  isPageLevelFooter($el) {
    const parent = $el.parent();
    const parentTag = parent.prop("tagName")?.toLowerCase();
    return ["body", "main", "html"].includes(parentTag || "") || parent.hasClass("page") || parent.hasClass("container") || parent.hasClass("wrapper");
  }
  /**
   * Classify element based on CSS classes and id; first matching rule wins.
   */
  classifyByCSS($el) {
    const className = $el.attr("class") || "";
    const id = $el.attr("id") || "";
    const combinedClasses = `${className} ${id}`.toLowerCase();
    for (const { pattern, elementType } of CSS_CLASS_PATTERNS) {
      if (pattern.test(combinedClasses)) {
        return elementType;
      }
    }
    return "UncategorizedText" /* UNCATEGORIZED_TEXT */;
  }
  /**
   * Classify element based on content analysis.
   */
  classifyByContent($el) {
    const text = $el.text().trim();
    const tagName = $el.prop("tagName")?.toLowerCase();
    if (text.length < 3) {
      return "UncategorizedText" /* UNCATEGORIZED_TEXT */;
    }
    if (this.looksLikeTitle(text, $el)) {
      return "Title" /* TITLE */;
    }
    if (this.looksLikeListItem(text, $el)) {
      return "ListItem" /* LIST_ITEM */;
    }
    switch (tagName) {
      case "div":
      case "section":
      case "article":
        return text.length > 50 ? "NarrativeText" /* NARRATIVE_TEXT */ : "Text" /* TEXT */;
      case "p":
        return "NarrativeText" /* NARRATIVE_TEXT */;
      case "span":
      default:
        return "Text" /* TEXT */;
    }
  }
  /**
   * Heuristics to determine if text looks like a title/heading.
   *
   * Text longer than 100 characters is never a title. Otherwise any one of
   * these signals is enough: leading capital, a colon or dash, all caps,
   * a leading number, being inside a <header>, or bold / large inline style.
   */
  looksLikeTitle(text, $el) {
    if (text.length > 100) {
      return false;
    }
    const hasCapitalization = /^[A-Z]/.test(text);
    // The dash is escaped so it is a literal; the earlier "[:\\-–—]" formed a
    // range from "\" to "–" that matched almost every lowercase letter.
    const hasColonOrDash = /[:\-–—]/.test(text);
    const isAllCaps = text === text.toUpperCase() && text.length > 3;
    const hasNumbers = /^\d+\.?\s/.test(text);
    const parent = $el.parent();
    const parentTag = parent.prop("tagName")?.toLowerCase();
    const isInHeader = parentTag === "header" || parent.closest("header").length > 0;
    const style = $el.attr("style") || "";
    const hasBoldStyle = /font-weight\s*:\s*(bold|[6-9]00)/i.test(style);
    const hasLargeFont = /font-size\s*:\s*([2-9]\d|\d{3,})px/i.test(style);
    return hasCapitalization || hasColonOrDash || isAllCaps || hasNumbers || isInHeader || hasBoldStyle || hasLargeFont;
  }
  /**
   * Heuristics to determine if text looks like a list item: it starts with a
   * bullet/number marker, its parent is a list, or more than half of its
   * siblings start with such a marker.
   */
  looksLikeListItem(text, $el) {
    const bulletPatterns = [
      // Unicode bullets
      /^[•·▪▫‣⁃]\s/,
      // ASCII bullets
      /^[-*+]\s/,
      // Numbered lists
      /^\d+\.\s/,
      // Lettered lists
      /^[a-zA-Z]\.\s/,
      // Parenthetical lists
      /^\([a-zA-Z0-9]+\)\s/
    ];
    const startsWithBullet = (value) => bulletPatterns.some((pattern) => pattern.test(value));
    if (startsWithBullet(text)) {
      return true;
    }
    const parent = $el.parent();
    const parentTag = parent.prop("tagName")?.toLowerCase();
    if (["ul", "ol", "dl"].includes(parentTag || "")) {
      return true;
    }
    const siblings = parent.children().not($el);
    let siblingsWithBullets = 0;
    // Read sibling text through the existing selection. Passing a live node to
    // cheerio's load() re-parents it and so detaches it from this document.
    siblings.each((index) => {
      if (startsWithBullet(siblings.eq(index).text().trim())) {
        siblingsWithBullets += 1;
      }
    });
    return siblingsWithBullets > siblings.length * 0.5;
  }
  /**
   * Check if element should be treated as inline (part of parent's text).
   */
  isInlineElement($el) {
    const tagName = $el.prop("tagName")?.toLowerCase();
    return INLINE_TAGS.has(tagName || "");
  }
  /**
   * Extract clean text from element: the concatenated text of the element
   * and all descendants (inline ones included), with whitespace runs
   * collapsed to single spaces and the ends trimmed.
   */
  extractCleanText($el) {
    // .text() already merges inline descendants into the parent's text, so no
    // per-child rewriting is needed (the earlier per-child load() call
    // detached children from the clone and dropped their text).
    return $el.text().replace(/\s+/g, " ").trim();
  }
};
// src/content-handlers.ts
/**
 * Stateless helpers that pull structured data (form fields, links,
 * addresses, emails, code language, formula type, coordinates, emphasis,
 * tables) out of cheerio selections or plain text.
 */
var ContentHandlers = class {
  /**
   * Extract form fields and their values.
   *
   * First pass: one entry per input/textarea/select, named by its `name`,
   * `id`, preceding label text, or "field". Second pass: for every
   * `<label for>` whose target exists inside the form, add an entry named by
   * the label text unless a field with that name is already present.
   *
   * @param {Function} $ Cheerio root function.
   * @param {object} $form Cheerio selection of the form.
   * @returns {Array<{fieldName: string, fieldValue: string, fieldType: string}>}
   */
  extractFormFields($, $form) {
    const fields = [];
    // Shared by both passes so that <select> targets of a label report the
    // selected option rather than the text of every option.
    const describeField = ($field) => {
      const fieldType = $field.attr("type") || $field.prop("tagName")?.toLowerCase();
      let fieldValue;
      switch (fieldType) {
        case "checkbox":
        case "radio":
          fieldValue = $field.prop("checked") ? $field.attr("value") || "checked" : "unchecked";
          break;
        case "select":
          fieldValue = $field.find("option:selected").text() || $field.find("option").first().text();
          break;
        default:
          fieldValue = $field.attr("value") || $field.text().trim();
      }
      return { fieldValue, fieldType };
    };
    $form.find("input, textarea, select").each((_, element) => {
      const $el = $(element);
      const fieldName = $el.attr("name") || $el.attr("id") || $el.prev("label").text().trim() || "field";
      fields.push({ fieldName, ...describeField($el) });
    });
    $form.find("label").each((_, element) => {
      const $label = $(element);
      const forAttr = $label.attr("for");
      const labelText = $label.text().trim();
      if (!forAttr || !labelText) {
        return;
      }
      // Compare ids as plain strings: interpolating the raw value into a
      // "#id" selector breaks on ids containing ".", ":", spaces, etc.
      const $input = $form.find("[id]").filter((_2, candidate) => $(candidate).attr("id") === forAttr);
      if ($input.length === 0) {
        return;
      }
      if (fields.some((field) => field.fieldName === labelText)) {
        return;
      }
      fields.push({ fieldName: labelText, ...describeField($input) });
    });
    return fields;
  }
  /**
   * Extract links with metadata. Anchors without an href or without text
   * are skipped.
   */
  extractLinks($, $el) {
    const links = [];
    $el.find("a[href]").each((_, element) => {
      const $link = $(element);
      const href = $link.attr("href");
      const text = $link.text().trim();
      if (href && text) {
        links.push({
          text,
          url: href,
          // Placeholder: would need more complex text analysis to determine actual position
          startIndex: 0
        });
      }
    });
    return links;
  }
  /**
   * Parse address components (street, zipCode, state, city) from free text.
   *
   * @returns {object|undefined} Components found, or undefined when none matched.
   */
  parseAddress(text) {
    const components = {};
    const streetMatch = text.match(/(\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl))/i);
    if (streetMatch) {
      components.street = streetMatch[1].trim();
    }
    const zipMatch = text.match(/\b(\d{5}(?:-\d{4})?)\b/);
    if (zipMatch) {
      components.zipCode = zipMatch[1];
    }
    // A state-qualified ZIP overrides the bare ZIP found above.
    const stateZipMatch = text.match(/\b([A-Z]{2})\s+(\d{5}(?:-\d{4})?)\b/);
    if (stateZipMatch) {
      components.state = stateZipMatch[1];
      components.zipCode = stateZipMatch[2];
    }
    const cityMatch = text.match(/,\s*([A-Za-z\s]+),?\s*[A-Z]{2}\s*\d{5}/);
    if (cityMatch) {
      components.city = cityMatch[1].trim();
    }
    return Object.keys(components).length > 0 ? components : void 0;
  }
  /**
   * Extract email addresses from text.
   *
   * @returns {string[]} All matches in order of appearance; empty when none.
   */
  extractEmailAddresses(text) {
    // "[A-Za-z]" rather than "[A-Z|a-z]": the latter also accepted "|".
    const emailPattern = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
    return text.match(emailPattern) || [];
  }
  /**
   * Detect programming language in code blocks: class name first, then
   * `data-language` / `data-lang`, then content heuristics.
   */
  detectCodeLanguage($el) {
    const className = $el.attr("class") || "";
    const languagePatterns = [
      { pattern: /language-(\w+)/i, group: 1 },
      { pattern: /lang-(\w+)/i, group: 1 },
      { pattern: /highlight-(\w+)/i, group: 1 },
      { pattern: /\b(javascript|js|typescript|ts|python|java|cpp|c\+\+|csharp|c#|php|ruby|go|rust|swift|kotlin|scala|html|css|sql|bash|shell|powershell|yaml|json|xml)\b/i, group: 1 }
    ];
    for (const { pattern, group } of languagePatterns) {
      const match = className.match(pattern);
      if (match && match[group]) {
        return match[group].toLowerCase();
      }
    }
    const dataLang = $el.attr("data-language") || $el.attr("data-lang");
    if (dataLang) {
      return dataLang.toLowerCase();
    }
    return this.detectLanguageFromContent($el.text());
  }
  /**
   * Detect programming language from code content. Languages are tried in
   * the listed order and the first one with any matching pattern wins, so
   * earlier entries take priority on ambiguous snippets.
   *
   * @returns {string|undefined} Language name, or undefined when nothing matched.
   */
  detectLanguageFromContent(code) {
    const languagePatterns = [
      { language: "javascript", patterns: [/\b(function|const|let|var|=>|console\.log)\b/, /\$\(.*\)/, /require\(.*\)/] },
      { language: "typescript", patterns: [/\b(interface|type|enum)\b/, /:\s*(string|number|boolean)/, /\bas\s+\w+/] },
      { language: "python", patterns: [/\b(def|import|from|class|if __name__)\b/, /print\(/, /\bself\b/] },
      { language: "java", patterns: [/\b(public|private|class|static|void)\b/, /System\.out\.println/, /\bString\[\]/] },
      // "#" is not a word character, so "\b#include" could never match at the
      // start of a line; each alternative now carries its own boundaries.
      { language: "cpp", patterns: [/#include\b|\busing namespace\b|\bstd::/, /cout\s*<</, /\bint main\b/] },
      { language: "csharp", patterns: [/\b(using|namespace|public class)\b/, /Console\.WriteLine/, /\bstring\[\]/] },
      { language: "php", patterns: [/<\?php/, /\$\w+/, /echo\s+/] },
      { language: "ruby", patterns: [/\b(def|end|class|require)\b/, /puts\s+/, /\@\w+/] },
      { language: "go", patterns: [/\b(package|import|func|var)\b/, /fmt\.Print/, /\bgo\s+\w+/] },
      { language: "rust", patterns: [/\b(fn|let|mut|use|struct)\b/, /println!/, /\bSome\(|\bNone\b/] },
      { language: "html", patterns: [/<\/?[a-z][\s\S]*>/i, /<!DOCTYPE/, /&\w+;/] },
      { language: "css", patterns: [/\{[^}]*\}/, /\.[a-zA-Z][\w-]*\s*\{/, /@media\s+/] },
      { language: "sql", patterns: [/\b(SELECT|FROM|WHERE|INSERT|UPDATE|DELETE)\b/i, /\bJOIN\b/i, /\bGROUP BY\b/i] },
      { language: "bash", patterns: [/^#!/, /\$\w+/, /\becho\s+/, /\|\s*\w+/] },
      { language: "json", patterns: [/^\s*\{[\s\S]*\}\s*$/, /"\w+":\s*/, /\[\s*\{/] },
      { language: "xml", patterns: [/<\?xml/, /<\/\w+>/, /xmlns:/] },
      { language: "yaml", patterns: [/^\s*\w+:\s*/, /^---/, /^\s*-\s+/] }
    ];
    const hit = languagePatterns.find(({ patterns }) => patterns.some((pattern) => pattern.test(code)));
    return hit ? hit.language : void 0;
  }
  /**
   * Extract mathematical formula type: "mathml", "latex", or "text".
   */
  detectFormulaType($el) {
    const tagName = $el.prop("tagName")?.toLowerCase();
    const text = $el.text();
    if (tagName && ["math", "mrow", "mi", "mn", "mo"].includes(tagName)) {
      return "mathml";
    }
    if (/\$.*\$|\\[a-zA-Z]+\{.*\}|\\begin\{.*\}/.test(text)) {
      return "latex";
    }
    return "text";
  }
  /**
   * Extract coordinates from element positioning.
   *
   * Reads integer pixel values of the `top`, `left`, `width` and `height`
   * declarations in the inline style. Both `top` and `left` must be present;
   * missing width/height count as 0.
   *
   * @returns {object|undefined} `{ points, system }` with four corner points
   *   (clockwise from top-left) and a fixed 1920x1080 pixel system, or
   *   undefined when top/left are absent.
   */
  extractCoordinates($el) {
    const style = $el.attr("style") || "";
    // Anchor each property at the start of a declaration so that "top" does
    // not match inside "margin-top", nor "width" inside "max-width".
    const readPx = (property) => {
      const match = style.match(new RegExp(`(?:^|;)\\s*${property}\\s*:\\s*(\\d+)px`, "i"));
      return match ? Number.parseInt(match[1], 10) : void 0;
    };
    const top = readPx("top");
    const left = readPx("left");
    if (top === void 0 || left === void 0) {
      return void 0;
    }
    const right = left + (readPx("width") || 0);
    const bottom = top + (readPx("height") || 0);
    return {
      points: [
        { x: left, y: top },
        { x: right, y: top },
        { x: right, y: bottom },
        { x: left, y: bottom }
      ],
      system: {
        // Default viewport width
        width: 1920,
        // Default viewport height
        height: 1080,
        coordinateUnit: "pixels"
      }
    };
  }
  /**
   * Extract emphasized text and their tags. Results are grouped by tag in
   * the order of the tag list, not in document order.
   */
  extractEmphasis($, $el) {
    const contents = [];
    const tags = [];
    const emphasisTags = ["strong", "b", "em", "i", "u", "mark", "ins", "del"];
    for (const tag of emphasisTags) {
      $el.find(tag).each((_, element) => {
        const text = $(element).text().trim();
        if (text) {
          contents.push(text);
          tags.push(tag);
        }
      });
    }
    return { contents, tags };
  }
  /**
   * Detect if the element is a page break (an <hr>, a page-break class, or
   * a `page-break-before/after` inline style).
   */
  isPageBreak($el) {
    const tagName = $el.prop("tagName")?.toLowerCase();
    const className = $el.attr("class") || "";
    const style = $el.attr("style") || "";
    if (tagName === "hr") {
      return true;
    }
    const pageBreakClasses = /\b(page-break|pagebreak|new-page)\b/i;
    const pageBreakStyles = /page-break-(before|after):\s*(always|page)/i;
    return pageBreakClasses.test(className) || pageBreakStyles.test(style);
  }
  /**
   * Extract table structure with headers and data.
   *
   * Headers come from the first <thead> row, else from the <th> cells of
   * the first row, else from the first data row. A first row made up solely
   * of <th> cells is treated as a header row and is not repeated in `rows`;
   * a first row mixing <th> and <td> stays in `rows` so no data is lost.
   *
   * @returns {{headers: string[]|undefined, rows: string[][]}}
   */
  extractTableStructure($, $table) {
    const headers = [];
    const rows = [];
    const cellText = (_, cell) => $(cell).text().trim();
    let $headerOnlyRow = null;
    const $thead = $table.find("thead");
    if ($thead.length > 0) {
      headers.push(...$thead.find("tr").first().find("th, td").map(cellText).get());
    } else {
      const $firstRow = $table.find("tr").first();
      const $thCells = $firstRow.find("th");
      if ($thCells.length > 0) {
        headers.push(...$thCells.map(cellText).get());
        if ($firstRow.find("td").length === 0) {
          $headerOnlyRow = $firstRow;
        }
      }
    }
    let $dataRows = $table.find("tr");
    if (headers.length > 0) {
      $dataRows = $dataRows.not($table.find("thead tr"));
      if ($headerOnlyRow) {
        $dataRows = $dataRows.not($headerOnlyRow);
      }
    }
    $dataRows.each((_, row) => {
      const rowData = $(row).find("td, th").map(cellText).get();
      if (rowData.length > 0) {
        rows.push(rowData);
      }
    });
    // No explicit headers anywhere: promote the first data row.
    if (headers.length === 0 && rows.length > 0) {
      headers.push(...rows.shift());
    }
    return {
      headers: headers.length > 0 ? headers : void 0,
      rows
    };
  }
};
// src/partitioner.ts
var DOMPartitioner = class {
// For parent-child relationships
constructor(options = {}) {
this.elementIdMap = /* @__PURE__ */ new Map();
this.options = {
skipNavigation: true,
skipHeaders: false,
skipFooters: false,
skipForms: false,
skipHeadersAndFooters: false,
minTextLength: 3,
maxTextLength: void 0,
preserveWhitespace: false,
extractTables: true,
inferTableStructure: true,
skipInferTableTypes: [],
extractImages: true,
includeImageAlt: true,
extractImageBlockTypes: [],
extractImageBlockToPayload: false,
extractImageBlockOutputDir: void 0,
extractForms: true,
extractFormFields: true,
extractLinks: true,
languages: void 0,
detectLanguagePerElement: false,
includeCoordinates: false,
coordinateSystem: void 0,
includePageBreaks: true,
maintainHierarchy: true,
strategy: "auto" /* AUTO */,
chunkingStrategy: "none" /* NONE */,
maxCharacters: void 0,
newAfterNChars: void 0,
combineTextUnderNChars: void 0,
includeOriginalHtml: false,
includeMetadata: true,
metadataFilename: void 0,
uniqueElementIds: false,
processAttachments: false,
attachmentPartitioningStrategy: "auto" /* AUTO */,
elementTypeFilters: void 0,
contentFilters: void 0,
includeDebugMetadata: false,
detectionOrigin: void 0,
...options
};
this.cleaner = new DOMCleaner(this.options);
this.classifier = new ElementClassifier();
this.contentHandlers = new ContentHandlers();
}
/**
* Partition HTML content into structured elements
*/
partition(html) {
const startTime = performance.now();
const warnings = [];
try {
const $ = cheerio__namespace.load(html, {
xmlMode: false
});
this.cleaner.clean($);
const elements = this.extractElements($);
const processingTime = Math.max(1, Math.round(performance.now() - startTime));
const elementTypeCounts = {};
let totalTextLength = 0;
let tablesExtracted = 0;
let imagesExtracted = 0;
let formsExtracted = 0;
let linksExtracted = 0;
elements.forEach((element) => {
elementTypeCounts[element.type] = (elementTypeCounts[element.type] || 0) + 1;
totalTextLength += element.text.length;
if (element.type === "Table" /* TABLE */) tablesExtracted++;
if (["Image" /* IMAGE */, "Picture" /* PICTURE */, "Figure" /* FIGURE */].includes(element.type)) imagesExtracted++;
if (element.type === "Form" /* FORM */) formsExtracted++;
if (element.type === "Link" /* LINK */) linksExtracted++;
});
return {
elements,
metadata: {
totalElements: elements.length,
processingTime,
warnings: warnings.length > 0 ? warnings : void 0,
errors: void 0,
elementTypeCounts,
averageElementLength: elements.length > 0 ? Math.round(totalTextLength / elements.length) : 0,
tablesExtracted,
imagesExtracted,
formsExtracted,
linksExtracted,
detectedLanguages: this.options.languages
}
};
} catch (error) {
warnings.push(`Partitioning error: ${error instanceof Error ? error.message : "Unknown error"}`);
return {
elements: [],
metadata: {
totalElements: 0,
processingTime: Math.max(1, Math.round(performance.now() - startTime)),
warnings
}
};
}
}
/**
* Extract structured elements from cleaned DOM
*/
extractElements($) {
const elements = [];
const processedElements = /* @__PURE__ */ new Set();
if ($("body").length > 0) {
$("body").find("*").each((_, element) => {
this.processElement($, element, processedElements, elements);
});
} else {
$("*").each((_, element) => {
this.processElement($, element, processedElements, elements);
});
}
return elements;
}
processElement($, element, processedElements, elements) {
if (processedElements.has(element)) {
return;
}
const $el = $(element);
const elementType = this.classifier.classifyElement($el);
if (elementType === "UncategorizedText" /* UNCATEGORIZED_TEXT */) {
processedElements.add(element);
return;
}
let extractedElement = null;
switch (elementType) {
case "Table" /* TABLE */:
extractedElement = this.extractTable($, $el);
if (extractedElement) {
$el.find("*").each((_, child) => {
processedElements.add(child);
});
}
break;
case "Image" /* IMAGE */:
case "Picture" /* PICTURE */:
case "Figure" /* FIGURE */:
extractedElement = this.extractImage($, $el);
break;
case "Form" /* FORM */:
if (this.options.extractForms) {
extractedElement = this.extractForm($, $el);
}
break;
case "CheckBoxChecked" /* CHECK_BOX_CHECKED */:
case "CheckBoxUnchecked" /* CHECK_BOX_UNCHECKED */:
extractedElement = this.extractCheckBox($, $el);
break;
case "RadioButtonChecked" /* RADIO_BUTTON_CHECKED */:
case "RadioButtonUnchecked" /* RADIO_BUTTON_UNCHECKED */:
extractedElement = this.extractRadioButton($, $el);
break;
case "Value" /* VALUE */:
extractedElement = this.extractValue($, $el);
break;
case "Link" /* LINK */:
if (this.options.extractLinks) {
extractedElement = this.extractLink($, $el);
}
break;
// ADDRESS and EMAIL_ADDRESS cases removed - caused false positives
case "CodeSnippet" /* CODE_SNIPPET */:
extractedElement = this.extractCode($, $el);
break;
case "Formula" /* FORMULA */:
extractedElement = this.extractFormula($, $el);
break;
case "PageBreak" /* PAGE_BREAK */:
if (this.options.includePageBreaks) {
extractedElement = this.extractPageBreak($, $el);
}
break;
default:
extractedElement = this.extractTextElement($, $el, elementType);
break;
}
processedElements.add(element);
if (extractedElement) {
elements.push(extractedElement);
}
}
/**
* Extract a text-based element
*/
extractTextElement($, $el, elementType) {
const text = this.classifier.extractCleanText($el);
if (text.length < this.options.minTextLength) {
return null;
}
if (this.options.maxTextLength && text.length > this.options.maxTextLength) {
return null;
}
const metadata = this.extractMetadata($, $el);
return {
id: uuid.v4(),
type: elementType,
text,
metadata
};
}
/**
* Extract table element with structure
*/
extractTable($, $el) {
if (!this.options.extractTables) {
return null;
}
if (this.isLayoutTable($, $el)) {
return null;
}
let rows = [];
let headers;
const $thead = $el.find("thead tr").first();
if ($thead.length > 0) {
headers = $thead.find("th, td").map((_, cell) => {
return $(cell).text().trim();
}).get();
} else {
const $firstRow = $el.find("tr").first();
if ($firstRow.length > 0 && $firstRow.find("th").length > 0) {
headers = $firstRow.find("th, td").map((_, cell) => {
return $(cell).text().trim();
}).get();
}
}
const $rows = $el.find("tbody tr, tr").filter((_, row) => {
const isInThead = $(row).closest("thead").length > 0;
const hasThElements = $(row).find("th").length > 0;
return !headers || !isInThead && !hasThElements;
});
$rows.each((_, row) => {
const $row = $(row);
const cells = $row.find("td, th").map((_2, cell) => {
return $(cell).text().trim();
}).get();
if (cells.length > 0) {
rows.push(cells);
}
});
if (!headers && rows.length > 1) {
const firstRow = rows[0];
const secondRow = rows[1];
const firstRowHasAlpha = firstRow.some((cell) => /[a-zA-Z]/.test(cell));
const secondRowHasNumbers = secondRow.some((cell) => /\d/.test(cell));
if (firstRowHasAlpha && secondRowHasNumbers) {
headers = firstRow;
rows.shift();
}
}
if (headers && headers.length > 0) {
const normalizedRows = rows.map((row) => {
const normalizedRow = [...row];
while (normalizedRow.length < headers.length) {
normalizedRow.push("");
}
return normalizedRow.slice(0, headers.length);
});
rows = normalizedRows;
}
const text = this.generateTableText(rows, headers);
const metadata = this.extractMetadata($, $el);
return {
id: uuid.v4(),
type: "Table" /* TABLE */,
text,
metadata,
rows,
headers: headers || []
};
}
/**
* Detect if a table is used for layout rather than data
*
* Note: False positives (data tables classified as layout) lose tabular structure
* but preserve all content as individual elements. False negatives (layout tables
* treated as data) cause massive duplication and unusable output.
*/
isLayoutTable($, $el) {
const hasHeaders = $el.find("th").length > 0 || $el.find("thead").length > 0;
const hasComplexLayout = $el.html()?.includes("colspan") || $el.html()?.includes("rowspan");
const hasLayoutAttributes = $el.attr("cellpadding") || $el.attr("cellspacing") || $el.attr("border");
const rows = $el.find("tr");
if (rows.length === 0) return true;
const cellCounts = [];
rows.each((_, row) => {
const cellCount = $(row).find("td, th").length;
cellCounts.push(cellCount);
});
const uniqueCellCounts = [...new Set(cellCounts)];
const hasInconsistentColumns = uniqueCellCounts.length > 3;
const totalCells = $el.find("td, th").length;
const cellsWithLinks = $el.find("td a, th a").length;
const cellsWithImages = $el.find("td img, th img").length;
const cellsWithForms = $el.find("td form, th form, td input, th input").length;
const nonTabularContent = cellsWithLinks + cellsWithImages + cellsWithForms;
const hasHighNonTabularRatio = totalCells > 0 && nonTabularContent / totalCells > 0.3;
if (hasLayoutAttributes && !hasHeaders && hasInconsistentColumns) {
return true;
}
if (!hasHeaders && hasComplexLayout && hasHighNonTabularRatio) {
return true;
}
if (!hasHeaders && hasInconsistentColumns && hasHighNonTabularRatio && rows.length > 20) {
return true;
}
return false;
}
/**
* Extract image element
*/
extractImage($, $el) {
if (!this.options.extractImages) {
return null;
}
const src = $el.attr("src");
const alt = $el.attr("alt") || "";
const width = parseInt($el.attr("width") || "0") || void 0;
const height = parseInt($el.attr("height") || "0") || void 0;
const text = this.options.includeImageAlt ? alt : "";
const metadata = this.extractMetadata($, $el);
return {
id: uuid.v4(),
type: "Image" /* IMAGE */,
text,
metadata,
src,
alt,
width,
height
};
}
/**
* Extract metadata from DOM element
*/
extractMetadata($, $el) {
const tagName = $el.prop("tagName")?.toLowerCase();
const classAttr = $el.attr("class");
const cssClasses = classAttr ? classAttr.split(/\s+/).filter(Boolean) : void 0;
const elementId = $el.attr("id");
const text = $el.text();
const metadata = {
tagName,
cssClasses: cssClasses && cssClasses.length > 0 ? cssClasses : void 0,
elementId: elementId && elementId.trim() !== "" ? elementId : void 0,
textLength: text.length
};
if (this.options.extractLinks) {
const links = this.contentHandlers.extractLinks($, $el);
if (links.length > 0) {
metadata.links = links;
metadata.linkTexts = links.map((link) => link.text);
metadata.linkUrls = links.map((link) => link.url);
}
}
const emphasis = this.contentHandlers.extractEmphasis($, $el);
if (emphasis.contents.length > 0) {
metadata.emphasizedTextContents = emphasis.contents;
metadata.emphasizedTextTags = emphasis.tags;
}
if (this.options.includeCoordinates) {
const coordinates = this.contentHandlers.extractCoordinates($el);
if (coordinates) {
metadata.coordinates = coordinates;
}
}
if (this.options.includeOriginalHtml) {
metadata.originalHtml = $.html($el) || void 0;
}
return metadata;
}
/**
* Extract form element with fields
*/
extractForm($, $el) {
const fields = this.options.extractFormFields ? this.contentHandlers.extractFormFields($, $el) : [];
const text = fields.length > 0 ? fields.map((f) => `${f.fieldName}: ${f.fieldValue}`).join("; ") : $el.text().trim() || "Form";
const metadata = this.extractMetadata($, $el);
return {
id: uuid.v4(),
type: "Form" /* FORM */,
text,
metadata,
fields: this.options.extractFormFields ? fields : void 0
};
}
/**
* Extract checkbox element
*/
extractCheckBox($, $el) {
const checked = Boolean($el.prop("checked"));
const value = $el.attr("value") || "";
const label = $el.prev("label").text().trim() || $el.next("label").text().trim() || "";
const text = label || (checked ? "checked" : "unchecked");
const metadata = this.extractMetadata($, $el);
return {
id: uuid.v4(),
type: checked ? "CheckBoxChecked" /* CHECK_BOX_CHECKED */ : "CheckBoxUnchecked" /* CHECK_BOX_UNCHECKED */,
text,
metadata,
checked,
value: value || void 0
};
}
/**
* Extract radio button element
*/
extractRadioButton($, $el) {
const checked = Boolean($el.prop("checked"));
const value