UNPKG

html-content-processor

Version:

A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.

413 lines (412 loc) 17.3 kB
"use strict"; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; Object.defineProperty(exports, "__esModule", { value: true }); exports.HtmlProcessor = void 0; const html_filter_1 = require("./html-filter"); const markdown_generator_1 = require("./markdown-generator"); const types_1 = require("./types"); const presets_1 = require("./presets"); const plugin_manager_1 = require("./plugin-manager"); const page_type_detector_1 = require("./page-type-detector"); /** * Main HTML processor class with fluent API */ class HtmlProcessor { /** * Create a new HtmlProcessor instance * @param options Processing options */ constructor(options = {}) { this.processed = false; this.dom = null; this.filteredDom = null; this.filterStats = null; this.pageTypeResult = null; this.autoDetectEnabled = false; this.options = this.resolveOptions(options); this.htmlFilter = this.createHtmlFilter(); this.markdownGenerator = this.createMarkdownGenerator(); this.currentHtml = ''; this.baseUrl = this.options.baseUrl || ''; } /** * Static factory method to create processor from HTML * @param html HTML content to process * @param options Processing options * @returns New HtmlProcessor instance */ static from(html, options = {}) { const processor = new HtmlProcessor(options); processor.currentHtml = html || ''; return processor; } /** * Set the base URL for resolving relative links * @param url Base URL * @returns This processor instance for chaining */ withBaseUrl(url) { this.baseUrl = url; this.options.baseUrl = url; return this; } /** * Update processor options * @param options New options to merge * @returns This processor instance for chaining */ withOptions(options) { this.options = Object.assign(Object.assign(Object.assign({}, this.options), options), { filter: Object.assign(Object.assign({}, this.options.filter), options.filter), converter: Object.assign(Object.assign({}, this.options.converter), options.converter) }); // Recreate filter and generator with new options this.htmlFilter = this.createHtmlFilter(); this.markdownGenerator = this.createMarkdownGenerator(); this.processed = false; return this; } /** * Apply HTML filtering * @param options Filter options (optional) * @returns This processor instance for chaining */ filter(options) { var _a; return __awaiter(this, void 0, void 0, function* () { try { const startTime = Date.now(); // Apply auto-detection if enabled if (this.autoDetectEnabled && !this.pageTypeResult) { yield this.detectPageType(); } // Merge filter options with auto-detected ones const mergedOptions = Object.assign(Object.assign(Object.assign({}, this.options.filter), (((_a = this.pageTypeResult) === null || _a === void 0 ? void 0 : _a.filterOptions) || {})), options); // Update options with merged options this.options.filter = mergedOptions; // Create or update HTML filter this.htmlFilter = this.createHtmlFilter(); // Apply filter plugins const pluginContext = { options: this.options, baseUrl: this.baseUrl, originalHtml: this.currentHtml, metadata: {} }; let htmlToFilter = plugin_manager_1.pluginRegistry.applyFilterPlugins(this.currentHtml, pluginContext); // Apply HTML filtering const filteredContent = yield this.htmlFilter.filterContentAsString(htmlToFilter); this.currentHtml = filteredContent || this.currentHtml; this.processed = true; const processingTime = Date.now() - startTime; if (this.options.debug) { console.log(`[HtmlProcessor] Filtering completed in ${processingTime}ms`); } return this; } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); throw new types_1.FilterError(`HTML filtering failed: ${errorMessage}`, error instanceof Error ? error : undefined); } }); } /** * Convert to Markdown with optional custom options * @param options Converter options (optional) * @returns Markdown result */ toMarkdown(options) { return __awaiter(this, void 0, void 0, function* () { try { const startTime = Date.now(); // Merge converter options if provided const converterOptions = Object.assign(Object.assign({}, this.options.converter), options); // Create markdown generator options const mdOptions = { ignoreLinks: converterOptions.ignoreLinks, ignoreImages: converterOptions.ignoreImages, escapeSnob: converterOptions.escapeSpecialChars }; // Generate markdown const result = yield this.markdownGenerator.generateMarkdown(this.currentHtml, this.baseUrl, mdOptions, null, converterOptions.citations !== false); // Apply plugins to markdown content const pluginContext = { options: this.options, baseUrl: this.baseUrl, originalHtml: this.currentHtml, metadata: {} }; let finalContent = plugin_manager_1.pluginRegistry.applyConvertPlugins(result.rawMarkdown, pluginContext); const processingTime = Date.now() - startTime; // Create metadata const metadata = { wordCount: this.countWords(finalContent), linkCount: this.countMatches(finalContent, /\[([^\]]+)\]\([^)]+\)/g), imageCount: this.countMatches(finalContent, /!\[([^\]]*)\]\([^)]+\)/g), headingCount: this.countMatches(finalContent, /^#+\s/gm), processingTime, sourceLength: this.currentHtml.length }; return { content: finalContent, contentWithCitations: result.markdownWithCitations, references: result.referencesMarkdown, metadata }; } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); throw new types_1.ConversionError(`Markdown conversion failed: ${errorMessage}`, error instanceof Error ? error : undefined); } }); } /** * Convert to plain text * @returns Plain text content */ toText() { return __awaiter(this, void 0, void 0, function* () { const markdown = yield this.toMarkdown({ ignoreLinks: true, ignoreImages: true }); return markdown.content .replace(/^#+\s*/gm, '') // Remove headers .replace(/\*\*(.*?)\*\*/g, '$1') // Remove bold .replace(/\*(.*?)\*/g, '$1') // Remove italic .replace(/`(.*?)`/g, '$1') // Remove code formatting .replace(/\n{2,}/g, '\n\n') // Normalize line breaks .trim(); }); } /** * Convert to array of HTML fragments * @returns Array of HTML fragments */ toArray() { return __awaiter(this, void 0, void 0, function* () { if (!this.currentHtml) { return []; } try { const fragments = yield this.htmlFilter.filterContent(this.currentHtml); return fragments && fragments.length > 0 ? fragments : [this.currentHtml]; } catch (error) { console.warn('[HtmlProcessor] Failed to convert to array:', error); return [this.currentHtml]; } }); } /** * Get filtered HTML as string * @returns Filtered HTML string */ toString() { return this.currentHtml; } /** * Get clean HTML (alias for toString) * @returns Clean HTML string */ toClean() { return this.toString(); } /** * Get detailed filter result with metadata * @returns Filter result with metadata */ getFilterResult() { return __awaiter(this, void 0, void 0, function* () { const originalHtml = this.currentHtml; // This is after filtering, need to track original const startTime = Date.now(); try { const fragments = yield this.toArray(); const processingTime = Date.now() - startTime; const metadata = { originalLength: originalHtml.length, filteredLength: this.currentHtml.length, reductionPercent: originalHtml.length > 0 ? Math.round((1 - this.currentHtml.length / originalHtml.length) * 100) : 0, elementsRemoved: this.countElements(originalHtml) - this.countElements(this.currentHtml), processingTime }; return { content: this.currentHtml, fragments, original: originalHtml, metadata }; } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); throw new types_1.FilterError(`Failed to generate filter result: ${errorMessage}`, error instanceof Error ? error : undefined); } }); } /** * Get current processing options * @returns Current options */ getOptions() { return JSON.parse(JSON.stringify(this.options)); // Deep clone } /** * Check if content has been processed * @returns True if content has been filtered */ isProcessed() { return this.processed; } /** * Get current HTML content * @returns Current HTML content */ getHtml() { return this.currentHtml; } /** * Get current base URL * @returns Current base URL */ getBaseUrl() { return this.baseUrl; } /** * Resolve processing options with presets * @param options Input options * @returns Resolved options */ resolveOptions(options) { if (options.preset) { return (0, presets_1.mergeWithPreset)(options.preset, options); } // Apply default values return { filter: Object.assign({ threshold: 2, strategy: 'dynamic', ratio: 0.48, minWords: 0, preserveStructure: false }, options.filter), converter: Object.assign({ citations: true, ignoreLinks: false, ignoreImages: false, format: 'github', linkStyle: 'inline', escapeSpecialChars: false }, options.converter), baseUrl: options.baseUrl || '', preset: options.preset }; } /** * Create HTML filter instance based on current options * @returns HtmlFilter instance */ createHtmlFilter() { const filterOpts = this.options.filter; return new html_filter_1.HtmlFilter(filterOpts.threshold || 2, filterOpts.strategy || 'dynamic', filterOpts.ratio || 0.48); } /** * Create markdown generator instance based on current options * @returns DefaultMarkdownGenerator instance */ createMarkdownGenerator() { const converterOpts = this.options.converter; const mdOptions = { ignoreLinks: converterOpts.ignoreLinks || false, ignoreImages: converterOpts.ignoreImages || false, escapeSnob: converterOpts.escapeSpecialChars || false }; return new markdown_generator_1.DefaultMarkdownGenerator(this.htmlFilter, mdOptions); } /** * Count words in text * @param text Text to count * @returns Word count */ countWords(text) { return text.trim().split(/\s+/).filter(word => word.length > 0).length; } /** * Count regex matches in text * @param text Text to search * @param regex Regular expression * @returns Match count */ countMatches(text, regex) { const matches = text.match(regex); return matches ? matches.length : 0; } /** * Count HTML elements in content * @param html HTML content * @returns Element count */ countElements(html) { return this.countMatches(html, /<[^>]+>/g); } /** * Enable page type auto-detection with optional URL hint * @param url Optional URL for better detection accuracy * @returns This processor instance for chaining */ withAutoDetection(url) { return __awaiter(this, void 0, void 0, function* () { this.autoDetectEnabled = true; // Detect page type immediately with current HTML and URL const pageTypeDetector = new page_type_detector_1.PageTypeDetector(); this.pageTypeResult = yield pageTypeDetector.detectPageType(this.currentHtml, url || this.baseUrl); if (this.options.debug && this.pageTypeResult) { console.log(`[HtmlProcessor] Auto-detected page type: ${this.pageTypeResult.type} (confidence: ${(this.pageTypeResult.confidence * 100).toFixed(1)}%)`); console.log(`[HtmlProcessor] Detection reasons:`, this.pageTypeResult.reasons); } // Update filter options with detected options this.options.filter = Object.assign(Object.assign({}, this.options.filter), this.pageTypeResult.filterOptions); return this; }); } /** * Get page type detection result * @returns Page type detection result or null if not detected */ getPageTypeResult() { return this.pageTypeResult; } /** * Check if auto-detection is enabled * @returns True if auto-detection is enabled */ isAutoDetectionEnabled() { return this.autoDetectEnabled; } /** * Manually set page type (disables auto-detection) * @param pageType Page type to set * @returns This processor instance for chaining */ withPageType(pageType) { return __awaiter(this, void 0, void 0, function* () { this.autoDetectEnabled = false; const pageTypeDetector = new page_type_detector_1.PageTypeDetector(); this.pageTypeResult = yield pageTypeDetector.detectPageType(''); this.pageTypeResult.type = pageType; this.pageTypeResult.confidence = 1.0; this.pageTypeResult.reasons = [`Manually set to ${pageType}`]; // Get filter options for this page type const filterOptions = pageTypeDetector.getFilterOptionsForType(pageType, this.pageTypeResult.characteristics); this.pageTypeResult.filterOptions = filterOptions; // Update processor options this.options.filter = Object.assign(Object.assign({}, this.options.filter), filterOptions); if (this.options.debug) { console.log(`[HtmlProcessor] Page type manually set to: ${pageType}`); } return this; }); } /** * Internal method to detect page type */ detectPageType() { return __awaiter(this, void 0, void 0, function* () { if (!this.pageTypeResult) { const pageTypeDetector = new page_type_detector_1.PageTypeDetector(); this.pageTypeResult = yield pageTypeDetector.detectPageType(this.currentHtml, this.baseUrl); } }); } } exports.HtmlProcessor = HtmlProcessor;