UNPKG

magnitude-extract

Version:

TypeScript DOM cleaning and structuring library

612 lines (596 loc) 16.7 kB
import * as cheerio from 'cheerio';

/**
 * Core types for the unstructured-ts library.
 *
 * Enumerates every element category the partitioner can emit; the element
 * system is intended to match the capabilities of the Unstructured Python
 * library.
 */
declare enum ElementType {
  TITLE = "Title",
  NARRATIVE_TEXT = "NarrativeText",
  TEXT = "Text",
  UNCATEGORIZED_TEXT = "UncategorizedText",
  BULLETED_TEXT = "BulletedText",
  PARAGRAPH = "Paragraph",
  ABSTRACT = "Abstract",
  CAPTION = "Caption",
  FIGURE_CAPTION = "FigureCaption",
  FOOTNOTE = "Footnote",
  PAGE_NUMBER = "PageNumber",
  LIST = "List",
  LIST_ITEM = "ListItem",
  HEADER = "Header",
  FOOTER = "Footer",
  PAGE_HEADER = "PageHeader",
  PAGE_FOOTER = "PageFooter",
  SECTION_HEADER = "SectionHeader",
  HEADLINE = "Headline",
  SUB_HEADLINE = "Subheadline",
  PAGE_BREAK = "PageBreak",
  IMAGE = "Image",
  PICTURE = "Picture",
  FIGURE = "Figure",
  TABLE = "Table",
  FORM = "Form",
  FIELD_NAME = "FieldName",
  VALUE = "Value",
  FORM_KEYS_VALUES = "FormKeysValues",
  CHECK_BOX_CHECKED = "CheckBoxChecked",
  CHECK_BOX_UNCHECKED = "CheckBoxUnchecked",
  RADIO_BUTTON_CHECKED = "RadioButtonChecked",
  RADIO_BUTTON_UNCHECKED = "RadioButtonUnchecked",
  CODE_SNIPPET = "CodeSnippet",
  FORMULA = "Formula",
  LINK = "Link",
  NAVIGATION = "Navigation",
  COMPOSITE_ELEMENT = "CompositeElement",
  DOCUMENT_DATA = "DocumentData"
}

/** A single 2D point. */
interface Point {
  x: number;
  y: number;
}

/** Dimensions of a coordinate space together with the unit they are expressed in. */
interface CoordinateSystem {
  width: number;
  height: number;
  coordinateUnit: 'pixels' | 'points' | 'inches';
}

/** Optional positional information attached to an element's metadata. */
interface CoordinatesMetadata {
  points?: Point[];
  system?: CoordinateSystem;
  layoutWidth?: number;
  layoutHeight?: number;
}

/**
 * Provenance information about the data source.
 * NOTE(review): the date fields are plain strings; their format is not
 * visible in this declaration file.
 */
interface DataSourceMetadata {
  url?: string;
  version?: string;
  recordLocator?: Record<string, any>;
  dateCreated?: string;
  dateModified?: string;
  dateProcessed?: string;
  permissionsData?: Array<Record<string, any>>;
}

/** A hyperlink: its text, its target, and optionally a start index. */
interface Link {
  text: string;
  url: string;
  startIndex?: number;
}

/** One form field as a name/value pair, with an optional field type. */
interface FormField {
  fieldName: string;
  fieldValue: string;
  fieldType?: string;
}

/**
 * Metadata bag carried by every {@link Element}.
 * All properties are optional.
 */
interface ElementMetadata {
  filename?: string;
  filetype?: string;
  fileDirectory?: string;
  lastModified?: string;
  pageNumber?: number;
  pageName?: string;
  coordinates?: CoordinatesMetadata;
  parentId?: string;
  categoryDepth?: number;
  tagName?: string;
  cssClasses?: string[];
  elementId?: string;
  textLength?: number;
  detectionClassProb?: number;
  isContinuation?: boolean;
  linkTexts?: string[];
  linkUrls?: string[];
  linkStartIndexes?: number[];
  links?: Link[];
  emphasizedTextContents?: string[];
  emphasizedTextTags?: string[];
  emailMessageId?: string;
  sentFrom?: string[];
  sentTo?: string[];
  ccRecipient?: string[];
  bccRecipient?: string[];
  subject?: string;
  signature?: string;
  keyValuePairs?: FormField[];
  imageBase64?: string;
  imageMimeType?: string;
  imageUrl?: string;
  imagePath?: string;
  textAsHtml?: string;
  tableAsCells?: Record<string, string | number>;
  dataSource?: DataSourceMetadata;
  languages?: string[];
  headerFooterType?: string;
  origElements?: Element[];
  detectionOrigin?: string;
  url?: string;
  attachedToFilename?: string;
  originalHtml?: string;
}

/** Base shape shared by every extracted element. */
interface Element {
  id: string;
  type: ElementType;
  text: string;
  metadata: ElementMetadata;
}

/** Table element: row data plus optional header cells. */
interface TableElement extends Element {
  type: ElementType.TABLE;
  rows: string[][];
  headers?: string[];
}

/** Image element with optional source, alt text, and dimensions. */
interface ImageElement extends Element {
  type: ElementType.IMAGE;
  src?: string;
  alt?: string;
  width?: number;
  height?: number;
}

/** Form element, optionally carrying its fields. */
interface FormElement extends Element {
  type: ElementType.FORM | ElementType.FORM_KEYS_VALUES;
  fields?: FormField[];
}

/** Checkbox element; the checked state is also encoded in `type`. */
interface CheckBoxElement extends Element {
  type: ElementType.CHECK_BOX_CHECKED | ElementType.CHECK_BOX_UNCHECKED;
  checked: boolean;
  value?: string;
}

/** Radio button element; the checked state is also encoded in `type`. */
interface RadioButtonElement extends Element {
  type: ElementType.RADIO_BUTTON_CHECKED | ElementType.RADIO_BUTTON_UNCHECKED;
  checked: boolean;
  value?: string;
  groupName?: string;
}

/** Link element with its target URL and link text. */
interface LinkElement extends Element {
  type: ElementType.LINK;
  url: string;
  linkText: string;
}

/** Code snippet element with an optional language tag. */
interface CodeElement extends Element {
  type: ElementType.CODE_SNIPPET;
  language?: string;
  codeBlock: string;
}

/** Formula element with an optional notation kind. */
interface FormulaElement extends Element {
  type: ElementType.FORMULA;
  formula: string;
  formulaType?: 'latex' | 'mathml' | 'text';
}

/** Element that groups other elements. */
interface CompositeElement extends Element {
  type: ElementType.COMPOSITE_ELEMENT;
  elements: Element[];
}

/** Processing strategies selectable through {@link PartitionOptions}. */
declare enum ProcessingStrategy {
  AUTO = "auto",
  FAST = "fast",
  ACCURATE = "accurate",
  OCR_ONLY = "ocr_only"
}

/** Chunking strategies selectable through {@link PartitionOptions}. */
declare enum ChunkingStrategy {
  NONE = "none",
  BASIC = "basic",
  BY_TITLE = "by_title",
  BY_PAGE = "by_page",
  BY_SIMILARITY = "by_similarity"
}

/**
 * Options accepted by the partitioner, the cleaner, and `partitionHtml`.
 * Every option is optional.
 * NOTE(review): default values are not visible in this declaration file.
 */
interface PartitionOptions {
  skipNavigation?: boolean;
  skipHeaders?: boolean;
  skipFooters?: boolean;
  skipForms?: boolean;
  skipHeadersAndFooters?: boolean;
  minTextLength?: number;
  maxTextLength?: number;
  preserveWhitespace?: boolean;
  extractTables?: boolean;
  inferTableStructure?: boolean;
  skipInferTableTypes?: string[];
  extractImages?: boolean;
  includeImageAlt?: boolean;
  extractImageBlockTypes?: string[];
  extractImageBlockToPayload?: boolean;
  extractImageBlockOutputDir?: string;
  extractForms?: boolean;
  extractFormFields?: boolean;
  extractLinks?: boolean;
  languages?: string[];
  detectLanguagePerElement?: boolean;
  includeCoordinates?: boolean;
  coordinateSystem?: CoordinateSystem;
  includePageBreaks?: boolean;
  maintainHierarchy?: boolean;
  strategy?: ProcessingStrategy;
  chunkingStrategy?: ChunkingStrategy;
  maxCharacters?: number;
  newAfterNChars?: number;
  combineTextUnderNChars?: number;
  includeOriginalHtml?: boolean;
  includeMetadata?: boolean;
  metadataFilename?: string;
  uniqueElementIds?: boolean;
  processAttachments?: boolean;
  attachmentPartitioningStrategy?: ProcessingStrategy;
  elementTypeFilters?: ElementType[];
  contentFilters?: {
    minWords?: number;
    maxWords?: number;
    excludePatterns?: RegExp[];
    includePatterns?: RegExp[];
  };
  includeDebugMetadata?: boolean;
  detectionOrigin?: string;
}

/** Output of a partition run: the elements plus run-level metadata. */
interface PartitionResult {
  elements: Element[];
  metadata: {
    totalElements: number;
    processingTime?: number;
    warnings?: string[];
    errors?: string[];
    filename?: string;
    filetype?: string;
    pageCount?: number;
    elementTypeCounts?: Record<ElementType, number>;
    averageElementLength?: number;
    detectedLanguages?: string[];
    tablesExtracted?: number;
    imagesExtracted?: number;
    formsExtracted?: number;
    linksExtracted?: number;
    memoryUsage?: number;
    dataSource?: DataSourceMetadata;
  };
}

/** Union of the base element and every specialised element interface. */
type AnyElement =
  | Element
  | TableElement
  | ImageElement
  | FormElement
  | CheckBoxElement
  | RadioButtonElement
  | LinkElement
  | CodeElement
  | FormulaElement
  | CompositeElement;

/**
 * Advanced DOM partitioner with comprehensive element extraction,
 * matching Unstructured Python library capabilities.
 */
declare class DOMPartitioner {
  private cleaner;
  private classifier;
  private contentHandlers;
  private options;
  private elementIdMap;

  constructor(options?: PartitionOptions);

  /** Partitions HTML content into structured elements. */
  partition(html: string): PartitionResult;

  /** Extracts structured elements from the cleaned DOM. */
  private extractElements;

  private processElement;

  /** Extracts a text-based element. */
  private extractTextElement;

  /** Extracts a table element together with its structure. */
  private extractTable;

  /**
   * Detects whether a table is used for layout rather than for data.
   *
   * Note: a false positive (data table classified as layout) loses the
   * tabular structure but still preserves all content as individual
   * elements. A false negative (layout table treated as data) causes
   * massive duplication and unusable output.
   */
  private isLayoutTable;

  /** Extracts an image element. */
  private extractImage;

  /** Extracts metadata from a DOM element. */
  private extractMetadata;

  /** Extracts a form element with its fields. */
  private extractForm;

  /** Extracts a checkbox element. */
  private extractCheckBox;

  /** Extracts a radio button element. */
  private extractRadioButton;

  /** Extracts a value element (form inputs, buttons, etc.). */
  private extractValue;

  /** Extracts a link element. */
  private extractLink;

  /** Extracts a code element. */
  private extractCode;

  /** Extracts a formula element. */
  private extractFormula;

  /** Extracts a page break element. */
  private extractPageBreak;

  /** Generates a readable text representation of a table. */
  private generateTableText;
}

/**
 * DOM cleaning utilities.
 * Removes unwanted elements and normalizes HTML structure.
 */
declare class DOMCleaner {
  private options;

  constructor(options?: PartitionOptions);

  /** Cleans and normalizes HTML content. */
  clean($: cheerio.CheerioAPI): cheerio.CheerioAPI;

  /** Removes script, style, and other ignored tags. */
  private removeIgnoredTags;

  /** Removes navigation elements. */
  private removeNavigationElements;

  /** Removes header elements. */
  private removeHeaders;

  /** Removes footer elements. */
  private removeFooters;

  /** Removes form elements. */
  private removeForms;

  /** Normalizes whitespace in text content. */
  private normalizeWhitespace;

  /** Removes empty elements that do not contribute content. */
  private removeEmptyElements;

  /** Checks whether an element has children that should be preserved. */
  private hasSignificantChildren;
}

/**
 * Advanced element classification logic.
 * Comprehensive semantic analysis matching Unstructured Python library
 * capabilities.
 */
declare class ElementClassifier {
  /** Classifies a DOM element based on its tag, attributes, and content. */
  classifyElement($el: any): ElementType;

  /** Classifies specialized elements (forms, addresses, emails, code, etc.). */
  private classifySpecializedElement;

  /** Checks whether the element is form-related. */
  private isFormElement;

  /** Classifies form elements into specific types. */
  private classifyFormElement;

  /** Checks whether the element contains code. */
  private isCodeElement;

  /** Checks whether the element contains an address. */
  private isAddressElement;

  /** Checks whether the element contains an email address. */
  private isEmailAddressElement;

  /** Checks whether the element contains a mathematical formula. */
  private isFormulaElement;

  /** Checks whether the element is a caption. */
  private isCaptionElement;

  /** Classifies caption elements. */
  private classifyCaptionElement;

  /** Checks whether the element is a footnote. */
  private isFootnoteElement;

  /** Checks whether the element contains a page number. */
  private isPageNumberElement;

  /** Checks whether the element is an abstract. */
  private isAbstractElement;

  /** Classifies header and footer elements. */
  private classifyHeaderFooter;

  /** Checks whether a header is page-level. */
  private isPageLevelHeader;

  /** Checks whether a footer is page-level. */
  private isPageLevelFooter;

  /** Classifies an element based on its CSS classes. */
  private classifyByCSS;

  /** Classifies an element based on content analysis. */
  private classifyByContent;

  /** Heuristics deciding whether text looks like a title/heading. */
  private looksLikeTitle;

  /** Heuristics deciding whether text looks like a list item. */
  private looksLikeListItem;

  /** Checks whether the element should be treated as inline (part of its parent's text). */
  isInlineElement($el: any): boolean;

  /** Extracts clean text from an element, handling inline elements appropriately. */
  extractCleanText($el: any): string;
}

/**
 * Content-specific handlers for advanced processing.
 * Handles specialized content types like forms, addresses, emails, etc.
 */
declare class ContentHandlers {
  /** Extracts form fields and their values. */
  extractFormFields($: any, $form: any): FormField[];

  /** Extracts links with metadata. */
  extractLinks($: any, $el: any): Link[];

  /**
   * Parses address components.
   * NOTE(review): the return type is `any`; the result shape is not visible here.
   */
  parseAddress(text: string): any;

  /** Extracts email addresses from text. */
  extractEmailAddresses(text: string): string[];

  /** Detects the programming language of a code block. */
  detectCodeLanguage($el: any): string | undefined;

  /** Detects the programming language from code content. */
  private detectLanguageFromContent;

  /** Determines the mathematical formula type. */
  detectFormulaType($el: any): 'latex' | 'mathml' | 'text';

  /** Extracts coordinates from element positioning. */
  extractCoordinates($el: any): CoordinatesMetadata | undefined;

  /** Extracts emphasized text and the corresponding tags. */
  extractEmphasis($: any, $el: any): {
    contents: string[];
    tags: string[];
  };

  /** Detects whether the element contains page break indicators. */
  isPageBreak($el: any): boolean;

  /** Extracts table structure with headers and data. */
  extractTableStructure($: any, $table: any): {
    headers?: string[];
    rows: string[][];
  };
}

/**
 * Options for the Markdown serializer, which converts structured elements
 * into an LLM-friendly markdown format.
 */
interface MarkdownSerializerOptions {
  includeMetadata?: boolean;
  includeElementIds?: boolean;
  includeCoordinates?: boolean;
  includePageNumbers?: boolean;
  preserveHierarchy?: boolean;
  maxTableWidth?: number;
  escapeSpecialChars?: boolean;
  includeFormFields?: boolean;
  includeImageMetadata?: boolean;
  customElementHandlers?: Partial<Record<ElementType, (element: AnyElement) => string>>;
}

/**
 * Markdown serializer for extracted data elements.
 * Public entry points are `serialize` and `serializeElements`; the remaining
 * members are private and their signatures are not part of this declaration.
 */
declare class MarkdownSerializer {
  private options;

  constructor(options?: MarkdownSerializerOptions);

  serialize(result: PartitionResult): string;
  serializeElements(elements: Element[]): string;

  private serializeElement;
  private serializeTitle;
  private serializeHeader;
  private serializeHeadline;
  private serializeSubHeadline;
  private serializeText;
  private serializeListItem;
  private serializeTable;
  private serializeImage;
  private serializeForm;
  private serializeCheckBox;
  private serializeRadioButton;
  private serializeLink;
  private serializeCode;
  private serializeFormula;
  private serializeComposite;
  private serializeCaption;
  private serializeFootnote;
  private serializeAbstract;
  private serializeGeneric;
  private serializeDocumentMetadata;
  private getTitleLevel;
  private getElementSuffix;
  private escapeText;
}

/** Function-style entry point taking a partition result and optional serializer options. */
declare function serializeToMarkdown(result: PartitionResult, options?: MarkdownSerializerOptions): string;

/**
 * Comprehensive HTML tag and CSS class mappings for element classification,
 * matching Unstructured Python library capabilities.
 */
declare const TAG_TO_ELEMENT_TYPE: Record<string, ElementType>;
declare const CSS_CLASS_PATTERNS: Array<{
  pattern: RegExp;
  elementType: ElementType;
}>;
declare const IGNORED_TAGS: Set<string>;
declare const NAVIGATION_TAGS: Set<string>;
declare const METADATA_TAGS: Set<string>;
declare const INLINE_TAGS: Set<string>;

/** Function-style entry point taking an HTML string and optional partition options. */
declare function partitionHtml(html: string, options?: PartitionOptions): PartitionResult;

export {
  CSS_CLASS_PATTERNS,
  ContentHandlers,
  DOMCleaner,
  DOMPartitioner,
  type Element,
  ElementClassifier,
  type ElementMetadata,
  ElementType,
  IGNORED_TAGS,
  INLINE_TAGS,
  type ImageElement,
  METADATA_TAGS,
  MarkdownSerializer,
  type MarkdownSerializerOptions,
  NAVIGATION_TAGS,
  type PartitionOptions,
  type PartitionResult,
  TAG_TO_ELEMENT_TYPE,
  type TableElement,
  partitionHtml,
  serializeToMarkdown
};