UNPKG

html-content-processor

Version:

A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.

387 lines (386 loc) 15.4 kB
"use strict"; /** * DOM Adapter - Provides unified DOM API for both browser and Node.js environments */ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); __setModuleDefault(result, mod); return result; }; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; Object.defineProperty(exports, "__esModule", { value: true }); exports.isBrowser = exports.isNode = exports.getNode = exports.getNodeFilter = exports.getDocument = exports.getDOMParser = exports.parseHTML = exports.domAdapter = exports.NODE_TYPES = void 0; // Node type constants for cross-environment compatibility exports.NODE_TYPES = { TEXT_NODE: 3, ELEMENT_NODE: 1, COMMENT_NODE: 8 }; /** * Environment detection utility */ class EnvironmentDetector { static detectEnvironment() { var _a, _b, _c; // Use globalThis for modern environment detection if (typeof globalThis !== 'undefined') { // Use proper type checking for globalThis properties const global = globalThis; if (typeof global.document !== 'undefined' && typeof global.window !== 'undefined') { return 'browser'; } if (typeof global.process !== 'undefined' && ((_b = (_a = global.process) === null || _a === void 0 ? void 0 : _a.versions) === null || _b === void 0 ? void 0 : _b.node)) { return 'node'; } if (typeof global.WorkerGlobalScope !== 'undefined') { return 'webworker'; } } // Fallback detection if (typeof window !== 'undefined') return 'browser'; if (typeof process !== 'undefined' && ((_c = process.versions) === null || _c === void 0 ? void 0 : _c.node)) return 'node'; // Check for web worker environment safely if (typeof self !== 'undefined' && typeof self.importScripts !== 'undefined') return 'webworker'; return 'unknown'; } static detectDOMParser() { try { return typeof DOMParser !== 'undefined' && new DOMParser() instanceof DOMParser; } catch (_a) { return false; } } static detectDocument() { return typeof document !== 'undefined' && (document === null || document === void 0 ? void 0 : document.createElement) && typeof document.createElement === 'function'; } } /** * Browser DOM Adapter */ class BrowserAdapter { static isSupported() { return EnvironmentDetector.detectEnvironment() === 'browser' && EnvironmentDetector.detectDOMParser() && EnvironmentDetector.detectDocument(); } static createWindow() { if (!this.isSupported()) { throw new Error('Browser environment does not support required DOM APIs'); } return { DOMParser: window.DOMParser, document: window.document, NodeFilter: window.NodeFilter, Node: window.Node }; } } /** * Node.js DOM Adapter */ class NodeAdapter { static isSupported() { return EnvironmentDetector.detectEnvironment() === 'node'; } static createWindow() { return __awaiter(this, void 0, void 0, function* () { if (!this.isSupported()) { throw new Error('Not in Node.js environment'); } // Try to load jsdom using dynamic import const jsdom = yield this.loadJSDOM(); if (jsdom) { const { JSDOM } = jsdom; const dom = new JSDOM('<!DOCTYPE html><html><body></body></html>'); return { DOMParser: dom.window.DOMParser, document: dom.window.document, NodeFilter: dom.window.NodeFilter, Node: dom.window.Node }; } // Fallback to basic implementation return this.createFallbackWindow(); }); } static loadJSDOM() { return __awaiter(this, void 0, void 0, function* () { if (this.jsdomAvailable === false) { return null; } if (this.jsdomCache) { return this.jsdomCache; } try { // Use dynamic import instead of eval('require') // This works in both CommonJS and ESM environments const jsdom = yield this.dynamicImport('jsdom'); this.jsdomCache = jsdom; this.jsdomAvailable = true; return jsdom; } catch (error) { console.warn('jsdom not found. Installing jsdom is recommended for better performance: npm install jsdom'); this.jsdomAvailable = false; return null; } }); } static dynamicImport(moduleName) { return __awaiter(this, void 0, void 0, function* () { var _a; try { // Modern dynamic import approach return yield (_a = moduleName, Promise.resolve().then(() => __importStar(require(_a)))); } catch (importError) { try { // Fallback for environments that might need require // Use Function constructor to avoid bundler analysis const requireFn = new Function('moduleName', 'return require(moduleName)'); return requireFn(moduleName); } catch (requireError) { const errorMessage = (importError === null || importError === void 0 ? void 0 : importError.message) || 'Unknown import error'; throw new Error(`Cannot load module ${moduleName}: ${errorMessage}`); } } }); } static createFallbackWindow() { // Minimal fallback implementation using basic HTML parsing const createBasicParser = () => { return { parseFromString: (str, type) => { // Very basic HTML parsing fallback // This is a simplified implementation - jsdom is strongly recommended const mockDoc = { documentElement: { outerHTML: str, innerHTML: str, textContent: str.replace(/<[^>]*>/g, ''), querySelectorAll: () => [], querySelector: () => null, remove: () => { }, children: [], childNodes: [], parentNode: null, ownerDocument: null }, body: { innerHTML: str, textContent: str.replace(/<[^>]*>/g, ''), querySelectorAll: () => [], querySelector: () => null, children: [], childNodes: [], getElementsByTagName: () => [], remove: () => { } }, createNodeIterator: () => ({ nextNode: () => null }), createElement: (tag) => ({ tagName: tag.toUpperCase(), innerHTML: '', textContent: '', remove: () => { }, children: [], childNodes: [], parentNode: null }), getElementsByTagName: () => [] }; return mockDoc; } }; }; return { DOMParser: createBasicParser, document: { createNodeIterator: () => ({ nextNode: () => null }), createElement: (tag) => ({ tagName: tag, remove: () => { } }), body: null }, NodeFilter: { SHOW_COMMENT: 128 }, Node: exports.NODE_TYPES }; } static hasJSDOM() { return __awaiter(this, void 0, void 0, function* () { if (!this.isSupported()) return false; try { yield this.loadJSDOM(); return this.jsdomAvailable === true; } catch (_a) { return false; } }); } } NodeAdapter.jsdomCache = null; NodeAdapter.jsdomAvailable = null; /** * Main DOM Adapter with automatic environment detection */ class DOMAdapter { constructor() { this._window = null; this._initialized = false; this._environment = EnvironmentDetector.detectEnvironment(); } static getInstance() { if (!DOMAdapter.instance) { DOMAdapter.instance = new DOMAdapter(); } return DOMAdapter.instance; } get isNode() { return this._environment === 'node'; } get isBrowser() { return this._environment === 'browser'; } get isWebWorker() { return this._environment === 'webworker'; } ensureInitialized() { return __awaiter(this, void 0, void 0, function* () { if (this._initialized) return; if (BrowserAdapter.isSupported()) { this._window = BrowserAdapter.createWindow(); } else if (NodeAdapter.isSupported()) { this._window = yield NodeAdapter.createWindow(); } else { throw new Error(`Unsupported environment: ${this._environment}`); } this._initialized = true; }); } getDOMParser() { return __awaiter(this, void 0, void 0, function* () { yield this.ensureInitialized(); if (!this._window) { throw new Error('DOM environment not properly initialized'); } return new this._window.DOMParser(); }); } getDocument() { return __awaiter(this, void 0, void 0, function* () { yield this.ensureInitialized(); if (!this._window) { throw new Error('DOM environment not properly initialized'); } return this._window.document; }); } getNodeFilter() { return __awaiter(this, void 0, void 0, function* () { yield this.ensureInitialized(); if (!this._window) { throw new Error('DOM environment not properly initialized'); } return this._window.NodeFilter; }); } getNode() { return __awaiter(this, void 0, void 0, function* () { yield this.ensureInitialized(); if (!this._window) { throw new Error('DOM environment not properly initialized'); } return this._window.Node || exports.NODE_TYPES; }); } /** * Parse HTML string to Document */ parseHTML(html) { return __awaiter(this, void 0, void 0, function* () { const parser = yield this.getDOMParser(); return parser.parseFromString(html, 'text/html'); }); } /** * Check if jsdom is available and properly loaded */ hasJSDOM() { return __awaiter(this, void 0, void 0, function* () { if (!this.isNode) return false; return yield NodeAdapter.hasJSDOM(); }); } /** * Get environment information */ getEnvironmentInfo() { return __awaiter(this, void 0, void 0, function* () { return { environment: this._environment, isNode: this.isNode, isBrowser: this.isBrowser, isWebWorker: this.isWebWorker, hasJSDOM: yield this.hasJSDOM(), hasNativeDOM: this.isBrowser && EnvironmentDetector.detectDocument() }; }); } } // Export singleton instance exports.domAdapter = DOMAdapter.getInstance(); // Export convenience functions - now async due to dynamic loading const parseHTML = (html) => __awaiter(void 0, void 0, void 0, function* () { return yield exports.domAdapter.parseHTML(html); }); exports.parseHTML = parseHTML; const getDOMParser = () => __awaiter(void 0, void 0, void 0, function* () { return yield exports.domAdapter.getDOMParser(); }); exports.getDOMParser = getDOMParser; const getDocument = () => __awaiter(void 0, void 0, void 0, function* () { return yield exports.domAdapter.getDocument(); }); exports.getDocument = getDocument; const getNodeFilter = () => __awaiter(void 0, void 0, void 0, function* () { return yield exports.domAdapter.getNodeFilter(); }); exports.getNodeFilter = getNodeFilter; const getNode = () => __awaiter(void 0, void 0, void 0, function* () { return yield exports.domAdapter.getNode(); }); exports.getNode = getNode; const isNode = () => exports.domAdapter.isNode; exports.isNode = isNode; const isBrowser = () => exports.domAdapter.isBrowser; exports.isBrowser = isBrowser;