html-content-processor
Version:
A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.
387 lines (386 loc) • 15.4 kB
JavaScript
"use strict";
/**
* DOM Adapter - Provides unified DOM API for both browser and Node.js environments
*/
// Module-interop helper (its shape matches the re-export helper emitted by the
// TypeScript compiler — NOTE(review): confirm against the build toolchain).
// Binds property `k` of source module `m` onto target `o` under the name `k2`.
// An already-installed `this.__createBinding` is reused when present.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
// The target name defaults to the source name.
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Replace the descriptor with a live enumerable getter when the source has no
// own descriptor, when it is an accessor on a non-__esModule object, or when
// it is a data property that is writable or configurable (so later changes to
// m[k] stay visible through o[k2]).
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback when Object.create is unavailable: a plain one-time value copy.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// Module-interop helper: stores `v` as the "default" member of `o`.
// A previously installed `this.__setModuleDefault` wins; otherwise the
// implementation is picked once, based on whether Object.create exists.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        // Define "default" as an enumerable data property via defineProperty.
        return function (o, v) {
            var defaultDescriptor = { enumerable: true, value: v };
            Object.defineProperty(o, "default", defaultDescriptor);
        };
    }
    // Fallback when Object.create is unavailable: plain assignment.
    return function (o, v) {
        o["default"] = v;
    };
})();
// Module-interop helper: returns a namespace-style object for `mod`.
// - Objects flagged with `__esModule` are returned untouched.
// - Otherwise every own enumerable key except "default" is bound onto a fresh
//   object via __createBinding, and `mod` itself is stored as its "default"
//   via __setModuleDefault.
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    var namespace = {};
    if (mod != null) {
        Object.keys(mod).forEach(function (key) {
            if (key !== "default") {
                __createBinding(namespace, mod, key);
            }
        });
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
// Async-function emulation helper: drives the generator function `generator`
// (invoked with `thisArg` and `_arguments`) and returns a promise for its
// final value. Each yielded value is awaited before the generator is resumed.
// `P` is the promise constructor to use; it defaults to the global Promise.
// An already-installed `this.__awaiter` is reused when present.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap a yielded value in a P promise unless it already is an instance of P.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the settled value; anything it throws rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Re-throw a rejection inside the generator so its own try/catch can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finish when the generator is done; otherwise wait on the yielded value.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator and run it synchronously up to its first yield.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
// Flag this CommonJS module as ES-module-compiled; helpers such as
// __importStar check `__esModule` to decide whether to wrap a module.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare every public export as undefined (`void 0`); the real values
// are assigned further down in this file.
exports.isBrowser = exports.isNode = exports.getNode = exports.getNodeFilter = exports.getDocument = exports.getDOMParser = exports.parseHTML = exports.domAdapter = exports.NODE_TYPES = void 0;
// Node type constants for cross-environment compatibility.
// The numeric values are the same as the DOM `Node.TEXT_NODE`,
// `Node.ELEMENT_NODE` and `Node.COMMENT_NODE` nodeType constants, so this
// object can stand in for `Node` where no native one is available.
exports.NODE_TYPES = {
TEXT_NODE: 3,
ELEMENT_NODE: 1,
COMMENT_NODE: 8
};
/**
 * Environment detection utility.
 *
 * Static helpers that inspect the global scope to classify the runtime and to
 * probe for the DOM globals (`DOMParser`, `document`) the adapters rely on.
 */
class EnvironmentDetector {
    /**
     * Classify the current runtime.
     *
     * `globalThis` is inspected first; bare-global checks are kept as a
     * fallback for runtimes that do not define `globalThis`.
     *
     * @returns {'browser'|'node'|'webworker'|'unknown'} detected environment
     */
    static detectEnvironment() {
        if (typeof globalThis !== 'undefined') {
            const root = globalThis;
            // A browser main thread exposes both `document` and `window`.
            if (typeof root.document !== 'undefined' && typeof root.window !== 'undefined') {
                return 'browser';
            }
            // Node.js advertises itself through `process.versions.node`.
            const proc = root.process;
            if (proc != null && proc.versions != null && proc.versions.node) {
                return 'node';
            }
            if (typeof root.WorkerGlobalScope !== 'undefined') {
                return 'webworker';
            }
        }
        // Fallback detection through bare globals.
        if (typeof window !== 'undefined') {
            return 'browser';
        }
        if (typeof process !== 'undefined' && process != null &&
            process.versions != null && process.versions.node) {
            return 'node';
        }
        // Only worker scopes are expected to expose `importScripts` on `self`.
        if (typeof self !== 'undefined' && typeof self.importScripts !== 'undefined') {
            return 'webworker';
        }
        return 'unknown';
    }
    /**
     * Check that a global `DOMParser` exists and can be instantiated.
     *
     * @returns {boolean} true when `new DOMParser()` succeeds and yields a
     *   DOMParser instance; false when the global is missing or construction throws
     */
    static detectDOMParser() {
        try {
            return typeof DOMParser !== 'undefined' && new DOMParser() instanceof DOMParser;
        }
        catch (_a) {
            // Construction failed: treat the parser as unavailable.
            return false;
        }
    }
    /**
     * Check that a global `document` with a callable `createElement` exists.
     *
     * Always returns a strict boolean. The previous `a && b && c` chain could
     * leak `undefined` (or another falsy value) when `document` existed but
     * had no `createElement`.
     *
     * @returns {boolean} true when `document.createElement` is a function
     */
    static detectDocument() {
        return typeof document !== 'undefined' &&
            document !== null &&
            typeof document.createElement === 'function';
    }
}
/**
 * Browser DOM Adapter
 *
 * Exposes the native browser DOM globals in the window-like shape
 * `{ DOMParser, document, NodeFilter, Node }`.
 */
class BrowserAdapter {
    /**
     * Report whether the native browser DOM can be used: the environment must
     * be detected as 'browser' and both the DOMParser and document probes of
     * EnvironmentDetector must pass.
     *
     * @returns the result of the first failing probe, or of the document probe
     */
    static isSupported() {
        if (EnvironmentDetector.detectEnvironment() !== 'browser') {
            return false;
        }
        const parserProbe = EnvironmentDetector.detectDOMParser();
        if (!parserProbe) {
            return parserProbe;
        }
        return EnvironmentDetector.detectDocument();
    }
    /**
     * Collect the DOM entry points from the global `window`.
     *
     * @returns {{DOMParser: *, document: *, NodeFilter: *, Node: *}}
     * @throws {Error} when isSupported() reports the browser DOM as unusable
     */
    static createWindow() {
        const usable = this.isSupported();
        if (!usable) {
            throw new Error('Browser environment does not support required DOM APIs');
        }
        const { DOMParser, document, NodeFilter, Node } = window;
        return { DOMParser, document, NodeFilter, Node };
    }
}
/**
 * Node.js DOM Adapter
 *
 * Builds a window-like object `{ DOMParser, document, NodeFilter, Node }` for
 * Node.js. jsdom is used when it can be loaded; otherwise a minimal mock is
 * returned whose query methods always come back empty.
 *
 * Static state (assigned right after the class body):
 * - `jsdomCache`: the loaded jsdom module, or null.
 * - `jsdomAvailable`: null = not probed yet, true/false = probe result.
 */
class NodeAdapter {
    /**
     * @returns {boolean} true when EnvironmentDetector reports 'node'
     */
    static isSupported() {
        return EnvironmentDetector.detectEnvironment() === 'node';
    }
    /**
     * Create the window-like object for Node.js.
     *
     * @returns {Promise<{DOMParser: *, document: *, NodeFilter: *, Node: *}>}
     *   jsdom-backed globals when jsdom loads, the fallback mock otherwise
     * @throws {Error} (as a rejection) when not running under Node.js
     */
    static createWindow() {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.isSupported()) {
                throw new Error('Not in Node.js environment');
            }
            // Try to load jsdom using dynamic import
            const jsdom = yield this.loadJSDOM();
            if (jsdom) {
                const { JSDOM } = jsdom;
                const dom = new JSDOM('<!DOCTYPE html><html><body></body></html>');
                return {
                    DOMParser: dom.window.DOMParser,
                    document: dom.window.document,
                    NodeFilter: dom.window.NodeFilter,
                    Node: dom.window.Node
                };
            }
            // Fallback to basic implementation
            return this.createFallbackWindow();
        });
    }
    /**
     * Load jsdom once and memoize the outcome.
     *
     * A failed load is remembered (`jsdomAvailable === false`) so the import
     * is not retried and the warning is only printed once.
     *
     * @returns {Promise<object|null>} the jsdom module, or null when unavailable
     */
    static loadJSDOM() {
        return __awaiter(this, void 0, void 0, function* () {
            if (this.jsdomAvailable === false) {
                return null;
            }
            if (this.jsdomCache) {
                return this.jsdomCache;
            }
            try {
                const jsdom = yield this.dynamicImport('jsdom');
                this.jsdomCache = jsdom;
                this.jsdomAvailable = true;
                return jsdom;
            }
            catch (error) {
                // Deliberate best effort: jsdom is optional, so warn and fall back.
                console.warn('jsdom not found. Installing jsdom is recommended for better performance: npm install jsdom');
                this.jsdomAvailable = false;
                return null;
            }
        });
    }
    /**
     * Load a module by name at runtime.
     *
     * @param {string} moduleName - module specifier to load
     * @returns {Promise<object>} the loaded module
     * @throws {Error} (as a rejection) when both loading strategies fail; the
     *   message carries the first failure's text
     */
    static dynamicImport(moduleName) {
        return __awaiter(this, void 0, void 0, function* () {
            try {
                // Down-levelled dynamic import: a deferred require wrapped for interop.
                return yield Promise.resolve().then(() => __importStar(require(moduleName)));
            }
            catch (importError) {
                try {
                    // Fallback for environments that might need require.
                    // The Function constructor hides the call from bundler analysis.
                    // NOTE(review): this is dynamic code evaluation, and `require`
                    // may not be resolvable from the constructed function's scope —
                    // confirm this fallback is reachable and still wanted.
                    const requireFn = new Function('moduleName', 'return require(moduleName)');
                    return requireFn(moduleName);
                }
                catch (requireError) {
                    const errorMessage = (importError === null || importError === void 0 ? void 0 : importError.message) || 'Unknown import error';
                    throw new Error(`Cannot load module ${moduleName}: ${errorMessage}`);
                }
            }
        });
    }
    /**
     * Build the minimal window used when jsdom is unavailable.
     *
     * The mock performs no real parsing: `parseFromString` echoes the input
     * as `innerHTML`/`outerHTML`, derives `textContent` by stripping
     * `<...>` tags with a regex, and every query method returns an empty
     * result. jsdom is strongly recommended for real work.
     *
     * @returns {{DOMParser: Function, document: object, NodeFilter: object, Node: object}}
     */
    static createFallbackWindow() {
        // This must be a regular function rather than an arrow function:
        // DOMAdapter.getDOMParser() instantiates it with `new`, and arrow
        // functions are not constructors. Because it returns an object, it
        // behaves the same with and without `new`.
        const FallbackDOMParser = function () {
            return {
                parseFromString: (str, type) => {
                    // Very basic HTML parsing fallback
                    const strippedText = str.replace(/<[^>]*>/g, '');
                    const mockDoc = {
                        documentElement: {
                            outerHTML: str,
                            innerHTML: str,
                            textContent: strippedText,
                            querySelectorAll: () => [],
                            querySelector: () => null,
                            remove: () => { },
                            children: [],
                            childNodes: [],
                            parentNode: null,
                            ownerDocument: null
                        },
                        body: {
                            innerHTML: str,
                            textContent: strippedText,
                            querySelectorAll: () => [],
                            querySelector: () => null,
                            children: [],
                            childNodes: [],
                            getElementsByTagName: () => [],
                            remove: () => { }
                        },
                        createNodeIterator: () => ({
                            nextNode: () => null
                        }),
                        createElement: (tag) => ({
                            tagName: tag.toUpperCase(),
                            innerHTML: '',
                            textContent: '',
                            remove: () => { },
                            children: [],
                            childNodes: [],
                            parentNode: null
                        }),
                        getElementsByTagName: () => []
                    };
                    return mockDoc;
                }
            };
        };
        return {
            DOMParser: FallbackDOMParser,
            document: {
                createNodeIterator: () => ({ nextNode: () => null }),
                createElement: (tag) => ({ tagName: tag, remove: () => { } }),
                body: null
            },
            NodeFilter: {
                SHOW_COMMENT: 128
            },
            Node: exports.NODE_TYPES
        };
    }
    /**
     * Report whether jsdom can be loaded in this process.
     *
     * @returns {Promise<boolean>} true only under Node.js with a loadable jsdom
     */
    static hasJSDOM() {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.isSupported())
                return false;
            try {
                yield this.loadJSDOM();
                return this.jsdomAvailable === true;
            }
            catch (_a) {
                return false;
            }
        });
    }
}
NodeAdapter.jsdomCache = null;
NodeAdapter.jsdomAvailable = null;
/**
 * Main DOM Adapter with automatic environment detection.
 *
 * Lazily builds a window-like object `{ DOMParser, document, NodeFilter, Node }`
 * through BrowserAdapter or NodeAdapter on first use and hands out its parts.
 * Obtain the shared instance through `DOMAdapter.getInstance()`.
 */
class DOMAdapter {
    constructor() {
        this._window = null;
        this._initialized = false;
        // In-flight initialization shared by concurrent callers; null when idle.
        this._initPromise = null;
        this._environment = EnvironmentDetector.detectEnvironment();
    }
    /**
     * @returns {DOMAdapter} the lazily created singleton
     */
    static getInstance() {
        if (!DOMAdapter.instance) {
            DOMAdapter.instance = new DOMAdapter();
        }
        return DOMAdapter.instance;
    }
    /** @returns {boolean} true when the environment detected at construction is 'node' */
    get isNode() {
        return this._environment === 'node';
    }
    /** @returns {boolean} true when the environment detected at construction is 'browser' */
    get isBrowser() {
        return this._environment === 'browser';
    }
    /** @returns {boolean} true when the environment detected at construction is 'webworker' */
    get isWebWorker() {
        return this._environment === 'webworker';
    }
    /**
     * Build the window object exactly once.
     *
     * Concurrent callers share one pending initialization. Without that, each
     * caller arriving before the first one finished would build its own
     * window, and callers could end up holding documents from different
     * windows. The pending promise is dropped once it settles, so a failed
     * initialization can be retried by a later call.
     *
     * @returns {Promise<void>}
     * @throws {Error} (as a rejection) when neither adapter supports the environment
     */
    ensureInitialized() {
        if (this._initialized) {
            return Promise.resolve();
        }
        if (this._initPromise) {
            return this._initPromise;
        }
        const pending = __awaiter(this, void 0, void 0, function* () {
            if (BrowserAdapter.isSupported()) {
                this._window = BrowserAdapter.createWindow();
            }
            else if (NodeAdapter.isSupported()) {
                this._window = yield NodeAdapter.createWindow();
            }
            else {
                throw new Error(`Unsupported environment: ${this._environment}`);
            }
            this._initialized = true;
        });
        this._initPromise = pending;
        const release = () => {
            if (this._initPromise === pending) {
                this._initPromise = null;
            }
        };
        // Handling both outcomes keeps this side chain from ever rejecting;
        // callers still observe the original result through `pending`.
        pending.then(release, release);
        return pending;
    }
    /**
     * @returns {Promise<object>} a new parser created with `new window.DOMParser()`
     * @throws {Error} (as a rejection) when initialization left no window
     */
    getDOMParser() {
        return __awaiter(this, void 0, void 0, function* () {
            yield this.ensureInitialized();
            if (!this._window) {
                throw new Error('DOM environment not properly initialized');
            }
            return new this._window.DOMParser();
        });
    }
    /**
     * @returns {Promise<object>} the window's `document`
     * @throws {Error} (as a rejection) when initialization left no window
     */
    getDocument() {
        return __awaiter(this, void 0, void 0, function* () {
            yield this.ensureInitialized();
            if (!this._window) {
                throw new Error('DOM environment not properly initialized');
            }
            return this._window.document;
        });
    }
    /**
     * @returns {Promise<object>} the window's `NodeFilter`
     * @throws {Error} (as a rejection) when initialization left no window
     */
    getNodeFilter() {
        return __awaiter(this, void 0, void 0, function* () {
            yield this.ensureInitialized();
            if (!this._window) {
                throw new Error('DOM environment not properly initialized');
            }
            return this._window.NodeFilter;
        });
    }
    /**
     * @returns {Promise<object>} the window's `Node`, or the exported
     *   NODE_TYPES constants when the window has none
     * @throws {Error} (as a rejection) when initialization left no window
     */
    getNode() {
        return __awaiter(this, void 0, void 0, function* () {
            yield this.ensureInitialized();
            if (!this._window) {
                throw new Error('DOM environment not properly initialized');
            }
            return this._window.Node || exports.NODE_TYPES;
        });
    }
    /**
     * Parse HTML string to Document
     *
     * @param {string} html - markup handed to `parseFromString(html, 'text/html')`
     * @returns {Promise<object>} the parsed document
     */
    parseHTML(html) {
        return __awaiter(this, void 0, void 0, function* () {
            const parser = yield this.getDOMParser();
            return parser.parseFromString(html, 'text/html');
        });
    }
    /**
     * Check if jsdom is available and properly loaded
     *
     * @returns {Promise<boolean>} always false outside Node.js
     */
    hasJSDOM() {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.isNode)
                return false;
            return yield NodeAdapter.hasJSDOM();
        });
    }
    /**
     * Get environment information
     *
     * @returns {Promise<{environment: string, isNode: boolean, isBrowser: boolean,
     *   isWebWorker: boolean, hasJSDOM: boolean, hasNativeDOM: boolean}>}
     */
    getEnvironmentInfo() {
        return __awaiter(this, void 0, void 0, function* () {
            return {
                environment: this._environment,
                isNode: this.isNode,
                isBrowser: this.isBrowser,
                isWebWorker: this.isWebWorker,
                hasJSDOM: yield this.hasJSDOM(),
                // Coerced so the flag is a real boolean even if the probe returns another falsy value.
                hasNativeDOM: Boolean(this.isBrowser && EnvironmentDetector.detectDocument())
            };
        });
    }
}
// Shared adapter instance used by every convenience export below.
exports.domAdapter = DOMAdapter.getInstance();

// Promise-returning shortcuts; each one forwards to the method of the same
// name on `exports.domAdapter`, looked up at call time.

/** Parse an HTML string through the shared adapter. */
const parseHTML = (html) => __awaiter(void 0, void 0, void 0, function* () {
    const adapter = exports.domAdapter;
    return yield adapter.parseHTML(html);
});
exports.parseHTML = parseHTML;

/** Resolve with a new parser instance from the shared adapter. */
const getDOMParser = () => __awaiter(void 0, void 0, void 0, function* () {
    const adapter = exports.domAdapter;
    return yield adapter.getDOMParser();
});
exports.getDOMParser = getDOMParser;

/** Resolve with the shared adapter's document. */
const getDocument = () => __awaiter(void 0, void 0, void 0, function* () {
    const adapter = exports.domAdapter;
    return yield adapter.getDocument();
});
exports.getDocument = getDocument;

/** Resolve with the shared adapter's NodeFilter. */
const getNodeFilter = () => __awaiter(void 0, void 0, void 0, function* () {
    const adapter = exports.domAdapter;
    return yield adapter.getNodeFilter();
});
exports.getNodeFilter = getNodeFilter;

/** Resolve with the shared adapter's Node. */
const getNode = () => __awaiter(void 0, void 0, void 0, function* () {
    const adapter = exports.domAdapter;
    return yield adapter.getNode();
});
exports.getNode = getNode;

// Synchronous environment flags read from the shared adapter.
const isNode = () => {
    const adapter = exports.domAdapter;
    return adapter.isNode;
};
exports.isNode = isNode;
const isBrowser = () => {
    const adapter = exports.domAdapter;
    return adapter.isBrowser;
};
exports.isBrowser = isBrowser;