/*
 * html-content-processor
 * Version: (unspecified)
 * A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.
 * 413 lines (412 loc) • 17.3 kB
 * JavaScript
 */
"use strict";
/**
 * Drives a generator function as an asynchronous routine.
 *
 * Reuses an `__awaiter` already present on the module-level `this`, otherwise
 * installs the implementation below.
 *
 * @param thisArg    Value bound to `this` while the generator body runs
 * @param _arguments Arguments forwarded to the generator function (optional)
 * @param P          Promise constructor to use; defaults to the global Promise
 * @param generator  Generator function whose yielded values are awaited
 * @returns A promise settled with the generator's return value, or rejected
 *          with any error that escapes the generator
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Pass instances of P through untouched; wrap everything else in a new P.
    const toPromise = (value) => {
        if (value instanceof P) {
            return value;
        }
        return new P((settle) => { settle(value); });
    };
    if (!P) {
        P = Promise;
    }
    return new P((resolve, reject) => {
        // Either finish with the generator's return value or wait on the
        // yielded value and resume the generator once it settles.
        const advance = (result) => {
            if (result.done) {
                resolve(result.value);
                return;
            }
            toPromise(result.value).then(onFulfilled, onRejected);
        };
        // Resume the generator with the resolved value.
        const onFulfilled = (value) => {
            try {
                advance(generator.next(value));
            }
            catch (err) {
                reject(err);
            }
        };
        // Re-raise the rejection inside the generator so its try/catch can see it.
        const onRejected = (reason) => {
            try {
                advance(generator["throw"](reason));
            }
            catch (err) {
                reject(err);
            }
        };
        // Instantiate the generator (replacing the function reference) and start it.
        advance((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.HtmlProcessor = void 0;
const html_filter_1 = require("./html-filter");
const markdown_generator_1 = require("./markdown-generator");
const types_1 = require("./types");
const presets_1 = require("./presets");
const plugin_manager_1 = require("./plugin-manager");
const page_type_detector_1 = require("./page-type-detector");
/**
 * Main HTML processor class with fluent API.
 *
 * An instance holds the HTML being worked on (`currentHtml`), the HTML it
 * started from (`originalHtml`), the resolved processing options, and the
 * filter / Markdown generator built from those options. Synchronous
 * configuration methods return `this` for chaining; methods that run the
 * filter, the converter or the page type detector return a Promise.
 */
class HtmlProcessor {
    /**
     * Create a new HtmlProcessor instance
     * @param options Processing options
     */
    constructor(options = {}) {
        this.processed = false;
        this.dom = null;
        this.filteredDom = null;
        this.filterStats = null;
        this.pageTypeResult = null;
        this.autoDetectEnabled = false;
        this.options = this.resolveOptions(options);
        this.htmlFilter = this.createHtmlFilter();
        this.markdownGenerator = this.createMarkdownGenerator();
        this.currentHtml = '';
        // Unfiltered input; set by from() or captured on the first filter() call.
        this.originalHtml = null;
        this.baseUrl = this.options.baseUrl || '';
    }
    /**
     * Static factory method to create processor from HTML
     * @param html HTML content to process (falsy values become an empty string)
     * @param options Processing options
     * @returns New HtmlProcessor instance
     */
    static from(html, options = {}) {
        const processor = new HtmlProcessor(options);
        processor.currentHtml = html || '';
        // Remember the input so getFilterResult() can report real reduction figures.
        processor.originalHtml = processor.currentHtml;
        return processor;
    }
    /**
     * Set the base URL for resolving relative links
     * @param url Base URL
     * @returns This processor instance for chaining
     */
    withBaseUrl(url) {
        this.baseUrl = url;
        this.options.baseUrl = url;
        return this;
    }
    /**
     * Update processor options. Top-level keys are overwritten; `filter` and
     * `converter` are merged key by key with the current values. The filter
     * and the Markdown generator are rebuilt and the processed flag is reset.
     * @param options New options to merge (optional; omitted means no changes)
     * @returns This processor instance for chaining
     */
    withOptions(options) {
        const updates = options || {};
        this.options = Object.assign({}, this.options, updates, {
            filter: Object.assign({}, this.options.filter, updates.filter),
            converter: Object.assign({}, this.options.converter, updates.converter)
        });
        // Keep the base URL used by filter()/toMarkdown() in step with the
        // options, the same way withBaseUrl() does.
        if (updates.baseUrl != null) {
            this.baseUrl = updates.baseUrl;
        }
        // Recreate filter and generator with new options
        this.htmlFilter = this.createHtmlFilter();
        this.markdownGenerator = this.createMarkdownGenerator();
        this.processed = false;
        return this;
    }
    /**
     * Apply HTML filtering. Filter options are merged in this order (later
     * wins): current options, auto-detected page type options, `options`
     * argument. The merged result is stored back on the processor.
     * @param options Filter options (optional)
     * @returns Promise resolving to this processor instance for chaining
     * @throws FilterError wrapping any error raised while filtering
     */
    filter(options) {
        return __awaiter(this, void 0, void 0, function* () {
            try {
                const startTime = Date.now();
                // Apply auto-detection if enabled
                if (this.autoDetectEnabled && !this.pageTypeResult) {
                    yield this.detectPageType();
                }
                // Merge filter options with auto-detected ones
                const detectedOptions = (this.pageTypeResult && this.pageTypeResult.filterOptions) || {};
                const mergedOptions = Object.assign({}, this.options.filter, detectedOptions, options);
                // Update options with merged options
                this.options.filter = mergedOptions;
                // Create or update HTML filter
                this.htmlFilter = this.createHtmlFilter();
                // Capture the unfiltered input once, before it is overwritten below.
                if (this.originalHtml == null) {
                    this.originalHtml = this.currentHtml;
                }
                // Apply filter plugins
                const pluginContext = {
                    options: this.options,
                    baseUrl: this.baseUrl,
                    originalHtml: this.currentHtml,
                    metadata: {}
                };
                const htmlToFilter = plugin_manager_1.pluginRegistry.applyFilterPlugins(this.currentHtml, pluginContext);
                // Apply HTML filtering; an empty result keeps the previous HTML.
                const filteredContent = yield this.htmlFilter.filterContentAsString(htmlToFilter);
                this.currentHtml = filteredContent || this.currentHtml;
                this.processed = true;
                const processingTime = Date.now() - startTime;
                if (this.options.debug) {
                    console.log(`[HtmlProcessor] Filtering completed in ${processingTime}ms`);
                }
                return this;
            }
            catch (error) {
                const errorMessage = error instanceof Error ? error.message : String(error);
                throw new types_1.FilterError(`HTML filtering failed: ${errorMessage}`, error instanceof Error ? error : undefined);
            }
        });
    }
    /**
     * Convert to Markdown with optional custom options
     * @param options Converter options (optional); merged over the configured ones
     * @returns Promise resolving to `{ content, contentWithCitations, references, metadata }`
     * @throws ConversionError wrapping any error raised during conversion
     */
    toMarkdown(options) {
        return __awaiter(this, void 0, void 0, function* () {
            try {
                const startTime = Date.now();
                // Merge converter options if provided
                const converterOptions = Object.assign({}, this.options.converter, options);
                // Create markdown generator options
                const mdOptions = {
                    ignoreLinks: converterOptions.ignoreLinks,
                    ignoreImages: converterOptions.ignoreImages,
                    escapeSnob: converterOptions.escapeSpecialChars
                };
                // Generate markdown; citations stay on unless explicitly set to false.
                const result = yield this.markdownGenerator.generateMarkdown(this.currentHtml, this.baseUrl, mdOptions, null, converterOptions.citations !== false);
                // Apply plugins to markdown content
                const pluginContext = {
                    options: this.options,
                    baseUrl: this.baseUrl,
                    originalHtml: this.currentHtml,
                    metadata: {}
                };
                const finalContent = plugin_manager_1.pluginRegistry.applyConvertPlugins(result.rawMarkdown, pluginContext);
                const processingTime = Date.now() - startTime;
                // Create metadata
                const metadata = {
                    wordCount: this.countWords(finalContent),
                    // Images are excluded here; they are reported via imageCount.
                    linkCount: this.countLinks(finalContent),
                    imageCount: this.countMatches(finalContent, /!\[([^\]]*)\]\([^)]+\)/g),
                    headingCount: this.countMatches(finalContent, /^#+\s/gm),
                    processingTime,
                    sourceLength: this.currentHtml.length
                };
                return {
                    content: finalContent,
                    contentWithCitations: result.markdownWithCitations,
                    references: result.referencesMarkdown,
                    metadata
                };
            }
            catch (error) {
                const errorMessage = error instanceof Error ? error.message : String(error);
                throw new types_1.ConversionError(`Markdown conversion failed: ${errorMessage}`, error instanceof Error ? error : undefined);
            }
        });
    }
    /**
     * Convert to plain text by generating Markdown without links and images
     * and then stripping heading, bold, italic and inline-code markers.
     * @returns Promise resolving to the plain text content
     */
    toText() {
        return __awaiter(this, void 0, void 0, function* () {
            const markdown = yield this.toMarkdown({ ignoreLinks: true, ignoreImages: true });
            return markdown.content
                .replace(/^#+\s*/gm, '') // Remove headers
                .replace(/\*\*(.*?)\*\*/g, '$1') // Remove bold
                .replace(/\*(.*?)\*/g, '$1') // Remove italic
                .replace(/`(.*?)`/g, '$1') // Remove code formatting
                .replace(/\n{2,}/g, '\n\n') // Normalize line breaks
                .trim();
        });
    }
    /**
     * Convert to array of HTML fragments. Falls back to a single-element
     * array holding the current HTML when the filter yields nothing or fails.
     * @returns Promise resolving to an array of HTML fragments
     */
    toArray() {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.currentHtml) {
                return [];
            }
            try {
                const fragments = yield this.htmlFilter.filterContent(this.currentHtml);
                return fragments && fragments.length > 0 ? fragments : [this.currentHtml];
            }
            catch (error) {
                // Best effort: report the failure and hand back the unsplit HTML.
                console.warn('[HtmlProcessor] Failed to convert to array:', error);
                return [this.currentHtml];
            }
        });
    }
    /**
     * Get filtered HTML as string
     * @returns Filtered HTML string
     */
    toString() {
        return this.currentHtml;
    }
    /**
     * Get clean HTML (alias for toString)
     * @returns Clean HTML string
     */
    toClean() {
        return this.toString();
    }
    /**
     * Get detailed filter result with metadata. Sizes and element counts are
     * measured against the HTML the processor started from; when that was
     * never recorded the current HTML is used and the reduction is zero.
     * @returns Promise resolving to `{ content, fragments, original, metadata }`
     * @throws FilterError wrapping any error raised while building the result
     */
    getFilterResult() {
        return __awaiter(this, void 0, void 0, function* () {
            const originalHtml = this.originalHtml != null ? this.originalHtml : this.currentHtml;
            const startTime = Date.now();
            try {
                const fragments = yield this.toArray();
                const processingTime = Date.now() - startTime;
                const metadata = {
                    originalLength: originalHtml.length,
                    filteredLength: this.currentHtml.length,
                    reductionPercent: originalHtml.length > 0
                        ? Math.round((1 - this.currentHtml.length / originalHtml.length) * 100)
                        : 0,
                    elementsRemoved: this.countElements(originalHtml) - this.countElements(this.currentHtml),
                    processingTime
                };
                return {
                    content: this.currentHtml,
                    fragments,
                    original: originalHtml,
                    metadata
                };
            }
            catch (error) {
                const errorMessage = error instanceof Error ? error.message : String(error);
                throw new types_1.FilterError(`Failed to generate filter result: ${errorMessage}`, error instanceof Error ? error : undefined);
            }
        });
    }
    /**
     * Get current processing options
     * @returns Copy of the current options (made via a JSON round trip, so
     *          only JSON-serializable values survive)
     */
    getOptions() {
        return JSON.parse(JSON.stringify(this.options)); // Deep clone
    }
    /**
     * Check if content has been processed
     * @returns True if content has been filtered
     */
    isProcessed() {
        return this.processed;
    }
    /**
     * Get current HTML content
     * @returns Current HTML content
     */
    getHtml() {
        return this.currentHtml;
    }
    /**
     * Get current base URL
     * @returns Current base URL
     */
    getBaseUrl() {
        return this.baseUrl;
    }
    /**
     * Resolve processing options with presets. With a preset the merge is
     * delegated to `mergeWithPreset`; otherwise defaults are applied to
     * `filter` and `converter` and every other top-level option (for example
     * `debug`) is carried over unchanged.
     * @param options Input options
     * @returns Resolved options
     */
    resolveOptions(options) {
        const input = options || {};
        if (input.preset) {
            return (0, presets_1.mergeWithPreset)(input.preset, input);
        }
        // Apply default values
        return Object.assign({}, input, {
            filter: Object.assign({ threshold: 2, strategy: 'dynamic', ratio: 0.48, minWords: 0, preserveStructure: false }, input.filter),
            converter: Object.assign({ citations: true, ignoreLinks: false, ignoreImages: false, format: 'github', linkStyle: 'inline', escapeSpecialChars: false }, input.converter),
            baseUrl: input.baseUrl || '',
            preset: input.preset
        });
    }
    /**
     * Create HTML filter instance based on current options. Defaults are
     * used only for missing values, so an explicit 0 threshold or ratio is
     * passed through.
     * @returns HtmlFilter instance
     */
    createHtmlFilter() {
        const filterOpts = this.options.filter || {};
        const threshold = filterOpts.threshold != null ? filterOpts.threshold : 2;
        const ratio = filterOpts.ratio != null ? filterOpts.ratio : 0.48;
        return new html_filter_1.HtmlFilter(threshold, filterOpts.strategy || 'dynamic', ratio);
    }
    /**
     * Create markdown generator instance based on current options. The
     * `escapeSpecialChars` converter option is passed on as `escapeSnob`.
     * @returns DefaultMarkdownGenerator instance
     */
    createMarkdownGenerator() {
        const converterOpts = this.options.converter || {};
        const mdOptions = {
            ignoreLinks: converterOpts.ignoreLinks || false,
            ignoreImages: converterOpts.ignoreImages || false,
            escapeSnob: converterOpts.escapeSpecialChars || false
        };
        return new markdown_generator_1.DefaultMarkdownGenerator(this.htmlFilter, mdOptions);
    }
    /**
     * Count words in text
     * @param text Text to count
     * @returns Number of whitespace-separated, non-empty tokens
     */
    countWords(text) {
        return text.trim().split(/\s+/).filter(word => word.length > 0).length;
    }
    /**
     * Count regex matches in text
     * @param text Text to search
     * @param regex Regular expression (use the `g` flag to count every match)
     * @returns Match count
     */
    countMatches(text, regex) {
        const matches = text.match(regex);
        return matches ? matches.length : 0;
    }
    /**
     * Count Markdown links in text, skipping image syntax (`![alt](src)`).
     * A linked image such as `[![alt](img)](href)` counts as one link.
     * @param text Markdown text to search
     * @returns Link count
     */
    countLinks(text) {
        const linkPattern = /\[([^\]]+)\]\([^)]+\)/g;
        let total = 0;
        let match = linkPattern.exec(text);
        while (match !== null) {
            // A "!" directly before the opening bracket marks an image, not a link.
            if (match.index === 0 || text.charAt(match.index - 1) !== '!') {
                total += 1;
            }
            match = linkPattern.exec(text);
        }
        return total;
    }
    /**
     * Count HTML elements in content
     * @param html HTML content
     * @returns Number of `<...>` tags found (opening and closing tags each count)
     */
    countElements(html) {
        return this.countMatches(html, /<[^>]+>/g);
    }
    /**
     * Enable page type auto-detection with optional URL hint. Detection runs
     * immediately on the current HTML; when it yields a result, that result's
     * filter options are merged into the processor's filter options.
     * @param url Optional URL for better detection accuracy (defaults to the base URL)
     * @returns Promise resolving to this processor instance for chaining
     */
    withAutoDetection(url) {
        return __awaiter(this, void 0, void 0, function* () {
            this.autoDetectEnabled = true;
            // Detect page type immediately with current HTML and URL
            const pageTypeDetector = new page_type_detector_1.PageTypeDetector();
            this.pageTypeResult = yield pageTypeDetector.detectPageType(this.currentHtml, url || this.baseUrl);
            if (this.pageTypeResult) {
                if (this.options.debug) {
                    console.log(`[HtmlProcessor] Auto-detected page type: ${this.pageTypeResult.type} (confidence: ${(this.pageTypeResult.confidence * 100).toFixed(1)}%)`);
                    console.log(`[HtmlProcessor] Detection reasons:`, this.pageTypeResult.reasons);
                }
                // Update filter options with detected options
                this.options.filter = Object.assign({}, this.options.filter, this.pageTypeResult.filterOptions);
            }
            return this;
        });
    }
    /**
     * Get page type detection result
     * @returns Page type detection result or null if not detected
     */
    getPageTypeResult() {
        return this.pageTypeResult;
    }
    /**
     * Check if auto-detection is enabled
     * @returns True if auto-detection is enabled
     */
    isAutoDetectionEnabled() {
        return this.autoDetectEnabled;
    }
    /**
     * Manually set page type (disables auto-detection). The filter options
     * the detector associates with that type are merged into the processor's
     * filter options.
     * @param pageType Page type to set
     * @returns Promise resolving to this processor instance for chaining
     */
    withPageType(pageType) {
        return __awaiter(this, void 0, void 0, function* () {
            this.autoDetectEnabled = false;
            const pageTypeDetector = new page_type_detector_1.PageTypeDetector();
            // Run detection on empty input to obtain a result object, then
            // overwrite its fields with the manual choice.
            this.pageTypeResult = yield pageTypeDetector.detectPageType('');
            this.pageTypeResult.type = pageType;
            this.pageTypeResult.confidence = 1.0;
            this.pageTypeResult.reasons = [`Manually set to ${pageType}`];
            // Get filter options for this page type
            const filterOptions = pageTypeDetector.getFilterOptionsForType(pageType, this.pageTypeResult.characteristics);
            this.pageTypeResult.filterOptions = filterOptions;
            // Update processor options
            this.options.filter = Object.assign({}, this.options.filter, filterOptions);
            if (this.options.debug) {
                console.log(`[HtmlProcessor] Page type manually set to: ${pageType}`);
            }
            return this;
        });
    }
    /**
     * Internal method to detect page type. Does nothing when a detection
     * result is already stored.
     * @returns Promise that resolves once the result has been stored
     */
    detectPageType() {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.pageTypeResult) {
                const pageTypeDetector = new page_type_detector_1.PageTypeDetector();
                this.pageTypeResult = yield pageTypeDetector.detectPageType(this.currentHtml, this.baseUrl);
            }
        });
    }
}
exports.HtmlProcessor = HtmlProcessor;