UNPKG

html-content-processor

Version:

A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.

182 lines (181 loc) 5.53 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.mergeWithPreset = exports.hasPreset = exports.getPresetNames = exports.getPreset = exports.presets = void 0; /** * Predefined preset configurations for common use cases */ exports.presets = { /** * Default configuration - balanced filtering and conversion */ default: { filter: { threshold: 2, strategy: 'dynamic', ratio: 0.48, minWords: 0, preserveStructure: false }, converter: { citations: true, ignoreLinks: false, ignoreImages: false, format: 'github', linkStyle: 'inline', escapeSpecialChars: false } }, /** * Article configuration - optimized for long-form content */ article: { filter: { threshold: 3, strategy: 'dynamic', ratio: 0.55, minWords: 10, preserveStructure: true, removeElements: ['nav', 'aside', 'footer', '.ads', '.advertisement', '.sidebar'] }, converter: { citations: true, ignoreLinks: false, ignoreImages: false, format: 'github', linkStyle: 'reference', escapeSpecialChars: false } }, /** * Blog configuration - optimized for blog posts */ blog: { filter: { threshold: 2, strategy: 'dynamic', ratio: 0.50, minWords: 5, preserveStructure: true, removeElements: ['nav', 'aside', '.comments', '.social-share', '.ads'] }, converter: { citations: false, ignoreLinks: false, ignoreImages: false, format: 'github', linkStyle: 'inline', escapeSpecialChars: false } }, /** * News configuration - optimized for news articles */ news: { filter: { threshold: 2, strategy: 'fixed', ratio: 0.45, minWords: 15, preserveStructure: true, removeElements: [ 'nav', 'aside', 'footer', '.ads', '.advertisement', '.related-articles', '.social-share', '.comments' ] }, converter: { citations: true, ignoreLinks: false, ignoreImages: false, format: 'commonmark', linkStyle: 'reference', escapeSpecialChars: true } }, /** * Strict configuration - aggressive filtering and clean output */ strict: { filter: { threshold: 4, strategy: 'fixed', ratio: 0.60, minWords: 20, preserveStructure: false, removeElements: [ 'nav', 'aside', 'footer', 'header', '.ads', '.advertisement', '.sidebar', '.comments', '.social-share', '.related', 'script', 'style' ], keepElements: ['article', 'main', 'section', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] }, converter: { citations: false, ignoreLinks: true, ignoreImages: true, format: 'commonmark', linkStyle: 'reference', escapeSpecialChars: true } }, /** * Loose configuration - minimal filtering, preserve most content */ loose: { filter: { threshold: 1, strategy: 'dynamic', ratio: 0.30, minWords: 0, preserveStructure: true, removeElements: ['script', 'style', 'noscript'] }, converter: { citations: true, ignoreLinks: false, ignoreImages: false, format: 'github', linkStyle: 'inline', escapeSpecialChars: false } } }; /** * Get a preset configuration by name * @param name Preset name * @returns Preset configuration options */ function getPreset(name) { const preset = exports.presets[name]; if (!preset) { throw new Error(`Unknown preset: ${name}`); } return JSON.parse(JSON.stringify(preset)); // Deep clone to prevent mutations } exports.getPreset = getPreset; /** * Get all available preset names * @returns Array of preset names */ function getPresetNames() { return Object.keys(exports.presets); } exports.getPresetNames = getPresetNames; /** * Check if a preset exists * @param name Preset name to check * @returns True if preset exists */ function hasPreset(name) { return name in exports.presets; } exports.hasPreset = hasPreset; /** * Merge custom options with a preset * @param presetName Preset to use as base * @param customOptions Custom options to merge * @returns Merged configuration */ function mergeWithPreset(presetName, customOptions) { const preset = getPreset(presetName); return Object.assign(Object.assign(Object.assign({}, preset), customOptions), { filter: Object.assign(Object.assign({}, preset.filter), customOptions.filter), converter: Object.assign(Object.assign({}, preset.converter), customOptions.converter) }); } exports.mergeWithPreset = mergeWithPreset;