/*
 * magnitude-extract
 * Version: (not captured)
 * TypeScript DOM cleaning and structuring library
 * 612 lines (596 loc) • 16.7 kB
 * TypeScript
 */
import * as cheerio from 'cheerio';
/**
* Core types for the unstructured-ts library
* Comprehensive element system matching Unstructured Python library capabilities
*/
/**
 * Category tag for an extracted element; this is the type of `Element.type`.
 *
 * Ambient string enum: each member is bound to a string literal. The values
 * are spelled in PascalCase, but they are not always a mechanical conversion
 * of the member name (`SUB_HEADLINE` is "Subheadline", not "SubHeadline").
 */
declare enum ElementType {
  TITLE = "Title",
  NARRATIVE_TEXT = "NarrativeText",
  TEXT = "Text",
  UNCATEGORIZED_TEXT = "UncategorizedText",
  BULLETED_TEXT = "BulletedText",
  PARAGRAPH = "Paragraph",
  ABSTRACT = "Abstract",
  CAPTION = "Caption",
  FIGURE_CAPTION = "FigureCaption",
  FOOTNOTE = "Footnote",
  PAGE_NUMBER = "PageNumber",
  LIST = "List",
  LIST_ITEM = "ListItem",
  HEADER = "Header",
  FOOTER = "Footer",
  PAGE_HEADER = "PageHeader",
  PAGE_FOOTER = "PageFooter",
  SECTION_HEADER = "SectionHeader",
  HEADLINE = "Headline",
  SUB_HEADLINE = "Subheadline",
  PAGE_BREAK = "PageBreak",
  IMAGE = "Image",
  PICTURE = "Picture",
  FIGURE = "Figure",
  TABLE = "Table",
  FORM = "Form",
  FIELD_NAME = "FieldName",
  VALUE = "Value",
  FORM_KEYS_VALUES = "FormKeysValues",
  CHECK_BOX_CHECKED = "CheckBoxChecked",
  CHECK_BOX_UNCHECKED = "CheckBoxUnchecked",
  RADIO_BUTTON_CHECKED = "RadioButtonChecked",
  RADIO_BUTTON_UNCHECKED = "RadioButtonUnchecked",
  CODE_SNIPPET = "CodeSnippet",
  FORMULA = "Formula",
  LINK = "Link",
  NAVIGATION = "Navigation",
  COMPOSITE_ELEMENT = "CompositeElement",
  DOCUMENT_DATA = "DocumentData"
}
/** A pair of numeric coordinates, `x` and `y`. */
interface Point {
  x: number;
  y: number;
}
/**
 * A numeric `width` and `height` together with a `coordinateUnit`, which is
 * restricted to one of 'pixels', 'points' or 'inches'.
 */
interface CoordinateSystem {
  width: number;
  height: number;
  coordinateUnit:
    | 'pixels'
    | 'points'
    | 'inches';
}
/**
 * Positional data for an element: an optional list of points, an optional
 * coordinate system, and optional layout width/height. Every property may be
 * omitted.
 *
 * Exported (type-only, nothing is emitted at runtime) because it appears in
 * the public surface of this module: it is the return type of
 * `ContentHandlers.extractCoordinates` and the type of
 * `ElementMetadata.coordinates`. Without the export, consumers cannot name
 * the type in their own annotations.
 */
export interface CoordinatesMetadata {
  points?: Point[];
  system?: CoordinateSystem;
  layoutWidth?: number;
  layoutHeight?: number;
}
/**
 * Optional description of where a document came from: a `url`, a `version`,
 * a free-form `recordLocator`, three date strings, and a list of free-form
 * `permissionsData` records. Every property may be omitted.
 */
interface DataSourceMetadata {
  url?: string;
  version?: string;
  recordLocator?: Record<string, any>;

  // The date fields are plain strings; no format is enforced by the type.
  dateCreated?: string;
  dateModified?: string;
  dateProcessed?: string;

  permissionsData?: Record<string, any>[];
}
/**
 * A hyperlink record: the link `text`, its `url`, and an optional numeric
 * `startIndex`.
 *
 * Exported (type-only, nothing is emitted at runtime) because it is the
 * element type returned by the public `ContentHandlers.extractLinks` method
 * and the type of `ElementMetadata.links`. Without the export, consumers
 * cannot name the type in their own annotations.
 */
export interface Link {
  text: string;
  url: string;
  // NOTE(review): presumably the offset of the link text within the owning
  // element's text; confirm against the implementation.
  startIndex?: number;
}
/**
 * One form field as a name/value pair of strings, with an optional
 * `fieldType` string.
 *
 * Exported (type-only, nothing is emitted at runtime) because it is the
 * element type returned by the public `ContentHandlers.extractFormFields`
 * method and is used by `ElementMetadata.keyValuePairs` and
 * `FormElement.fields`. Without the export, consumers cannot name the type
 * in their own annotations.
 */
export interface FormField {
  fieldName: string;
  fieldValue: string;
  fieldType?: string;
}
/**
 * Metadata record attached to every element through `Element.metadata`.
 *
 * Every property is optional, so an empty object is a valid value. The
 * blank lines below only group neighbouring properties visually; the
 * declaration order is unchanged.
 */
interface ElementMetadata {
  filename?: string;
  filetype?: string;
  fileDirectory?: string;
  lastModified?: string;

  pageNumber?: number;
  pageName?: string;
  coordinates?: CoordinatesMetadata;

  parentId?: string;
  categoryDepth?: number;

  tagName?: string;
  cssClasses?: string[];
  elementId?: string;
  textLength?: number;
  detectionClassProb?: number;
  isContinuation?: boolean;

  linkTexts?: string[];
  linkUrls?: string[];
  linkStartIndexes?: number[];
  links?: Link[];

  emphasizedTextContents?: string[];
  emphasizedTextTags?: string[];

  emailMessageId?: string;
  sentFrom?: string[];
  sentTo?: string[];
  ccRecipient?: string[];
  bccRecipient?: string[];
  subject?: string;
  signature?: string;

  keyValuePairs?: FormField[];

  imageBase64?: string;
  imageMimeType?: string;
  imageUrl?: string;
  imagePath?: string;

  textAsHtml?: string;
  tableAsCells?: Record<string, string | number>;

  dataSource?: DataSourceMetadata;
  languages?: string[];
  headerFooterType?: string;

  // Self-referential: metadata may carry a list of whole elements.
  origElements?: Element[];

  detectionOrigin?: string;
  url?: string;
  attachedToFilename?: string;
  originalHtml?: string;
}
/**
 * Base shape of an extracted element: a string `id`, an `ElementType` tag,
 * the element's `text`, and its `metadata`. The specialised element
 * interfaces declared in this file (`TableElement`, `ImageElement`, ...)
 * extend it.
 */
interface Element {
  id: string;
  type: ElementType;
  text: string;
  metadata: ElementMetadata;
}
/**
 * An `Element` whose `type` is narrowed to `ElementType.TABLE`. It adds
 * `rows` (an array of string arrays) and an optional `headers` string array.
 */
interface TableElement extends Element {
  type: ElementType.TABLE;
  rows: Array<string[]>;
  headers?: string[];
}
/**
 * An `Element` whose `type` is narrowed to `ElementType.IMAGE`. All added
 * properties are optional: `src` and `alt` strings plus numeric `width` and
 * `height`.
 */
interface ImageElement extends Element {
  type: ElementType.IMAGE;

  src?: string;
  alt?: string;

  width?: number;
  height?: number;
}
/**
 * An `Element` whose `type` is either `ElementType.FORM` or
 * `ElementType.FORM_KEYS_VALUES`, optionally carrying a list of `FormField`
 * entries.
 */
interface FormElement extends Element {
  type:
    | ElementType.FORM
    | ElementType.FORM_KEYS_VALUES;
  fields?: FormField[];
}
/**
 * An `Element` for a checkbox. `type` is one of the two `CHECK_BOX_*`
 * members of `ElementType`; the state is also carried as the boolean
 * `checked`, and `value` is an optional string.
 */
interface CheckBoxElement extends Element {
  type:
    | ElementType.CHECK_BOX_CHECKED
    | ElementType.CHECK_BOX_UNCHECKED;
  checked: boolean;
  value?: string;
}
/**
 * An `Element` for a radio button. `type` is one of the two
 * `RADIO_BUTTON_*` members of `ElementType`; the state is also carried as
 * the boolean `checked`. `value` and `groupName` are optional strings.
 */
interface RadioButtonElement extends Element {
  type:
    | ElementType.RADIO_BUTTON_CHECKED
    | ElementType.RADIO_BUTTON_UNCHECKED;
  checked: boolean;
  value?: string;
  groupName?: string;
}
/**
 * An `Element` whose `type` is narrowed to `ElementType.LINK`, with a
 * required `url` and `linkText`.
 */
interface LinkElement extends Element {
  type: ElementType.LINK;

  url: string;
  linkText: string;
}
/**
 * An `Element` whose `type` is narrowed to `ElementType.CODE_SNIPPET`. It
 * has a required `codeBlock` string and an optional `language` string.
 */
interface CodeElement extends Element {
  type: ElementType.CODE_SNIPPET;

  language?: string;
  codeBlock: string;
}
/**
 * An `Element` whose `type` is narrowed to `ElementType.FORMULA`. It has a
 * required `formula` string and an optional `formulaType`, which is limited
 * to 'latex', 'mathml' or 'text'.
 */
interface FormulaElement extends Element {
  type: ElementType.FORMULA;

  formula: string;
  formulaType?:
    | 'latex'
    | 'mathml'
    | 'text';
}
/**
 * An `Element` whose `type` is narrowed to `ElementType.COMPOSITE_ELEMENT`
 * and which holds a list of child `elements`.
 */
interface CompositeElement extends Element {
  type: ElementType.COMPOSITE_ELEMENT;

  elements: Element[];
}
/**
 * Ambient string enum used as the type of `PartitionOptions.strategy` and
 * `PartitionOptions.attachmentPartitioningStrategy`.
 *
 * NOTE(review): this enum is not part of the file's export list, so
 * consumers cannot reference its members when filling in those options.
 * Confirm whether that omission is intended.
 */
declare enum ProcessingStrategy {
  AUTO = "auto",
  FAST = "fast",
  ACCURATE = "accurate",
  OCR_ONLY = "ocr_only"
}
/**
 * Ambient string enum used as the type of
 * `PartitionOptions.chunkingStrategy`.
 *
 * NOTE(review): this enum is not part of the file's export list, so
 * consumers cannot reference its members when filling in that option.
 * Confirm whether that omission is intended.
 */
declare enum ChunkingStrategy {
  NONE = "none",
  BASIC = "basic",
  BY_TITLE = "by_title",
  BY_PAGE = "by_page",
  BY_SIMILARITY = "by_similarity"
}
/**
 * Options bag accepted by the `DOMPartitioner` and `DOMCleaner`
 * constructors and by `partitionHtml`.
 *
 * Every property is optional. The defaults applied when a property is
 * omitted are decided by the implementation and cannot be read from this
 * declaration. The blank lines below only group neighbouring properties
 * visually; the declaration order is unchanged.
 */
interface PartitionOptions {
  skipNavigation?: boolean;
  skipHeaders?: boolean;
  skipFooters?: boolean;
  skipForms?: boolean;
  skipHeadersAndFooters?: boolean;

  minTextLength?: number;
  maxTextLength?: number;
  preserveWhitespace?: boolean;

  extractTables?: boolean;
  inferTableStructure?: boolean;
  skipInferTableTypes?: string[];

  extractImages?: boolean;
  includeImageAlt?: boolean;
  extractImageBlockTypes?: string[];
  extractImageBlockToPayload?: boolean;
  extractImageBlockOutputDir?: string;

  extractForms?: boolean;
  extractFormFields?: boolean;
  extractLinks?: boolean;

  languages?: string[];
  detectLanguagePerElement?: boolean;

  includeCoordinates?: boolean;
  coordinateSystem?: CoordinateSystem;

  includePageBreaks?: boolean;
  maintainHierarchy?: boolean;

  strategy?: ProcessingStrategy;
  chunkingStrategy?: ChunkingStrategy;
  maxCharacters?: number;
  newAfterNChars?: number;
  combineTextUnderNChars?: number;

  includeOriginalHtml?: boolean;
  includeMetadata?: boolean;
  metadataFilename?: string;
  uniqueElementIds?: boolean;

  processAttachments?: boolean;
  attachmentPartitioningStrategy?: ProcessingStrategy;

  elementTypeFilters?: ElementType[];
  contentFilters?: {
    minWords?: number;
    maxWords?: number;
    excludePatterns?: RegExp[];
    includePatterns?: RegExp[];
  };

  includeDebugMetadata?: boolean;
  detectionOrigin?: string;
}
/**
 * Value returned by `DOMPartitioner.partition` and `partitionHtml`: the
 * extracted `elements` plus a `metadata` summary. Within `metadata` only
 * `totalElements` is required; every other property is optional.
 */
interface PartitionResult {
  elements: Element[];

  metadata: {
    totalElements: number;
    processingTime?: number;
    warnings?: string[];
    errors?: string[];

    filename?: string;
    filetype?: string;
    pageCount?: number;

    // NOTE(review): `Record<ElementType, number>` requires a count for every
    // enum member. If the implementation only records the types that
    // actually occurred, `Partial<Record<ElementType, number>>` would
    // describe the value more accurately. Confirm before changing.
    elementTypeCounts?: Record<ElementType, number>;
    averageElementLength?: number;
    detectedLanguages?: string[];

    tablesExtracted?: number;
    imagesExtracted?: number;
    formsExtracted?: number;
    linksExtracted?: number;

    memoryUsage?: number;
    dataSource?: DataSourceMetadata;
  };
}
/**
 * Union of the base `Element` and every specialised element interface
 * declared in this file.
 *
 * Exported (type-only, nothing is emitted at runtime) because the exported
 * `MarkdownSerializerOptions.customElementHandlers` callbacks receive an
 * `AnyElement`. Without the export, consumers cannot annotate a standalone
 * handler function. It also lets consumers derive the specialised
 * interfaces that are not exported by name, e.g.
 * `Extract<AnyElement, { type: ElementType.LINK }>`.
 */
export type AnyElement =
  | Element
  | TableElement
  | ImageElement
  | FormElement
  | CheckBoxElement
  | RadioButtonElement
  | LinkElement
  | CodeElement
  | FormulaElement
  | CompositeElement;
/**
 * DOM partitioner offering comprehensive element extraction, modelled on
 * the capabilities of the Unstructured Python library.
 *
 * Only the constructor and `partition` are public. The remaining members
 * are private, so the declaration lists their names without types.
 */
declare class DOMPartitioner {
  private cleaner;
  private classifier;
  private contentHandlers;
  private options;
  private elementIdMap;

  constructor(options?: PartitionOptions);

  /** Splits an HTML string into structured elements. */
  partition(html: string): PartitionResult;

  /** Pulls structured elements out of the cleaned DOM. */
  private extractElements;
  private processElement;

  /** Builds a text-based element. */
  private extractTextElement;

  /** Builds a table element, including its structure. */
  private extractTable;

  /**
   * Decides whether a table exists for layout rather than for data.
   *
   * The two kinds of mistake are not equally costly. A data table
   * misjudged as layout (false positive) loses its tabular structure, but
   * all of its content survives as individual elements. A layout table
   * misjudged as data (false negative) causes massive duplication and
   * unusable output.
   */
  private isLayoutTable;

  /** Builds an image element. */
  private extractImage;

  /** Collects metadata from a DOM element. */
  private extractMetadata;

  /** Builds a form element together with its fields. */
  private extractForm;

  /** Builds a checkbox element. */
  private extractCheckBox;

  /** Builds a radio button element. */
  private extractRadioButton;

  /** Builds a value element (form inputs, buttons, etc.). */
  private extractValue;

  /** Builds a link element. */
  private extractLink;

  /** Builds a code element. */
  private extractCode;

  /** Builds a formula element. */
  private extractFormula;

  /** Builds a page break element. */
  private extractPageBreak;

  /** Produces a readable text rendering of a table. */
  private generateTableText;
}
/**
 * DOM cleaning utilities: strips unwanted elements and normalizes the HTML
 * structure.
 *
 * Only the constructor and `clean` are public. The remaining members are
 * private, so the declaration lists their names without types.
 */
declare class DOMCleaner {
  private options;

  constructor(options?: PartitionOptions);

  /**
   * Cleans and normalizes HTML content.
   *
   * Takes a cheerio API handle and returns one. Whether the returned handle
   * is the same instance that was passed in is not visible from this
   * declaration.
   */
  clean($: cheerio.CheerioAPI): cheerio.CheerioAPI;

  /** Drops script, style, and other ignored tags. */
  private removeIgnoredTags;

  /** Drops navigation elements. */
  private removeNavigationElements;

  /** Drops header elements. */
  private removeHeaders;

  /** Drops footer elements. */
  private removeFooters;

  /** Drops form elements. */
  private removeForms;

  /** Normalizes whitespace inside text content. */
  private normalizeWhitespace;

  /** Drops empty elements that contribute no content. */
  private removeEmptyElements;

  /** Reports whether an element has children that must be kept. */
  private hasSignificantChildren;
}
/**
 * Element classification logic: semantic analysis modelled on the
 * capabilities of the Unstructured Python library.
 *
 * Public surface: `classifyElement`, `isInlineElement` and
 * `extractCleanText`. Each takes an `$el` typed as `any`.
 * NOTE(review): `$el` is presumably a cheerio selection; confirm against
 * the implementation before tightening the type.
 */
declare class ElementClassifier {
  /** Classifies a DOM element from its tag, attributes, and content. */
  classifyElement($el: any): ElementType;

  /** Classifies specialised elements (forms, addresses, emails, code, etc.). */
  private classifySpecializedElement;

  /** Reports whether the element is form-related. */
  private isFormElement;

  /** Sorts form elements into specific types. */
  private classifyFormElement;

  /** Reports whether the element contains code. */
  private isCodeElement;

  /** Reports whether the element contains an address. */
  private isAddressElement;

  /** Reports whether the element contains an email address. */
  private isEmailAddressElement;

  /** Reports whether the element contains a mathematical formula. */
  private isFormulaElement;

  /** Reports whether the element is a caption. */
  private isCaptionElement;

  /** Classifies caption elements. */
  private classifyCaptionElement;

  /** Reports whether the element is a footnote. */
  private isFootnoteElement;

  /** Reports whether the element contains a page number. */
  private isPageNumberElement;

  /** Reports whether the element is an abstract. */
  private isAbstractElement;

  /** Classifies header and footer elements. */
  private classifyHeaderFooter;

  /** Reports whether a header is page-level. */
  private isPageLevelHeader;

  /** Reports whether a footer is page-level. */
  private isPageLevelFooter;

  /** Classifies an element from its CSS classes. */
  private classifyByCSS;

  /** Classifies an element by analysing its content. */
  private classifyByContent;

  /** Heuristics for deciding whether text looks like a title or heading. */
  private looksLikeTitle;

  /** Heuristics for deciding whether text looks like a list item. */
  private looksLikeListItem;

  /**
   * Reports whether the element should be treated as inline, i.e. as part
   * of its parent's text.
   */
  isInlineElement($el: any): boolean;

  /**
   * Returns clean text for the element, handling inline elements
   * appropriately.
   */
  extractCleanText($el: any): string;
}
/**
 * Content-specific handlers for advanced processing of specialised content
 * such as forms, addresses, and emails.
 *
 * The `$` and `$el` / `$form` / `$table` parameters are typed `any`.
 * NOTE(review): these are presumably a cheerio API handle and cheerio
 * selections; confirm against the implementation before tightening.
 */
declare class ContentHandlers {
  /** Collects the fields of a form together with their values. */
  extractFormFields($: any, $form: any): FormField[];

  /** Collects links along with their metadata. */
  extractLinks($: any, $el: any): Link[];

  /**
   * Parses address components out of `text`.
   *
   * The result is typed `any`, so its shape is not described by this
   * declaration.
   */
  parseAddress(text: string): any;

  /** Pulls email addresses out of `text`. */
  extractEmailAddresses(text: string): string[];

  /**
   * Detects the programming language of a code block. The declared return
   * type allows `undefined`.
   */
  detectCodeLanguage($el: any): string | undefined;

  /** Detects the programming language from the code content itself. */
  private detectLanguageFromContent;

  /** Determines the kind of a mathematical formula. */
  detectFormulaType($el: any): 'latex' | 'mathml' | 'text';

  /**
   * Derives coordinates from an element's positioning. The declared return
   * type allows `undefined`.
   */
  extractCoordinates($el: any): CoordinatesMetadata | undefined;

  /**
   * Collects emphasized text fragments and the tags they came from, as two
   * string arrays.
   */
  extractEmphasis($: any, $el: any): {
    contents: string[];
    tags: string[];
  };

  /** Reports whether the element's text contains page break indicators. */
  isPageBreak($el: any): boolean;

  /**
   * Extracts a table's structure: its data `rows` and, optionally, its
   * `headers`.
   */
  extractTableStructure($: any, $table: any): {
    headers?: string[];
    rows: string[][];
  };
}
/**
 * Options for the Markdown serializer, which converts extracted structured
 * elements into LLM-friendly Markdown.
 *
 * Every property is optional. The defaults applied when a property is
 * omitted are decided by the implementation and cannot be read from this
 * declaration.
 */
interface MarkdownSerializerOptions {
  includeMetadata?: boolean;
  includeElementIds?: boolean;
  includeCoordinates?: boolean;
  includePageNumbers?: boolean;
  preserveHierarchy?: boolean;

  maxTableWidth?: number;
  escapeSpecialChars?: boolean;

  includeFormFields?: boolean;
  includeImageMetadata?: boolean;

  // Optional per-type overrides: maps an `ElementType` to a function that
  // takes the element and returns a string. `Partial` means any subset of
  // element types may be supplied.
  customElementHandlers?: Partial<
    Record<ElementType, (element: AnyElement) => string>
  >;
}
/**
 * Serializer configured through `MarkdownSerializerOptions`.
 *
 * Public surface: `serialize`, which takes a whole `PartitionResult`, and
 * `serializeElements`, which takes a bare list of elements. Both return a
 * string. The remaining members are private, so the declaration lists
 * their names without types.
 */
declare class MarkdownSerializer {
  private options;

  constructor(options?: MarkdownSerializerOptions);

  serialize(result: PartitionResult): string;
  serializeElements(elements: Element[]): string;

  private serializeElement;

  private serializeTitle;
  private serializeHeader;
  private serializeHeadline;
  private serializeSubHeadline;
  private serializeText;
  private serializeListItem;
  private serializeTable;
  private serializeImage;
  private serializeForm;
  private serializeCheckBox;
  private serializeRadioButton;
  private serializeLink;
  private serializeCode;
  private serializeFormula;
  private serializeComposite;
  private serializeCaption;
  private serializeFootnote;
  private serializeAbstract;
  private serializeGeneric;
  private serializeDocumentMetadata;

  private getTitleLevel;
  private getElementSuffix;
  private escapeText;
}
/**
 * Function form of serialization: takes a `PartitionResult` and optional
 * `MarkdownSerializerOptions` and returns a string.
 *
 * NOTE(review): presumably a convenience wrapper around
 * `MarkdownSerializer.serialize`; confirm against the implementation.
 *
 * @param result - The partition result to serialize.
 * @param options - Optional serializer settings.
 * @returns The serialized output as a string.
 */
declare function serializeToMarkdown(
  result: PartitionResult,
  options?: MarkdownSerializerOptions,
): string;
/**
 * HTML tag and CSS class mappings used for element classification,
 * modelled on the capabilities of the Unstructured Python library.
 *
 * These are ambient constants: only their types are declared here; the
 * contents live in the compiled module.
 */

/** Lookup from a string key to an `ElementType`. */
declare const TAG_TO_ELEMENT_TYPE: Record<string, ElementType>;

/** List of regular expressions, each paired with an `ElementType`. */
declare const CSS_CLASS_PATTERNS: {
  pattern: RegExp;
  elementType: ElementType;
}[];

// The four constants below are sets of strings.
declare const IGNORED_TAGS: Set<string>;
declare const NAVIGATION_TAGS: Set<string>;
declare const METADATA_TAGS: Set<string>;
declare const INLINE_TAGS: Set<string>;
/**
 * Function form of partitioning: takes an HTML string and optional
 * `PartitionOptions` and returns a `PartitionResult`.
 *
 * NOTE(review): presumably a convenience wrapper around
 * `DOMPartitioner.partition`; confirm against the implementation.
 *
 * @param html - The HTML source to partition.
 * @param options - Optional partitioning settings.
 * @returns The extracted elements and summary metadata.
 */
declare function partitionHtml(
  html: string,
  options?: PartitionOptions,
): PartitionResult;
// Public surface of the module. Names without a modifier are value exports
// (classes, the ElementType enum, constants, functions); names marked
// `type` are exported at the type level only.
export {
  CSS_CLASS_PATTERNS,
  ContentHandlers,
  DOMCleaner,
  DOMPartitioner,
  type Element,
  ElementClassifier,
  type ElementMetadata,
  ElementType,
  IGNORED_TAGS,
  INLINE_TAGS,
  type ImageElement,
  METADATA_TAGS,
  MarkdownSerializer,
  type MarkdownSerializerOptions,
  NAVIGATION_TAGS,
  type PartitionOptions,
  type PartitionResult,
  TAG_TO_ELEMENT_TYPE,
  type TableElement,
  partitionHtml,
  serializeToMarkdown,
};