@hanivanrizky/nestjs-html-parser
Version:
A powerful NestJS HTML parsing service with XPath and CSS selector support, proxy configuration, random user agents, and rich response metadata including headers and status codes
1,205 lines • 85.8 kB
JavaScript
"use strict";
// Compiler-emitted helper that applies a list of decorators to a class or class member.
// An existing `__decorate` on `this` is reused when present; otherwise it is defined here.
var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) {
// c: number of arguments actually passed.
// r: the value being decorated — `target` when fewer than three arguments were passed,
//    otherwise the property descriptor (looked up on `target` when `desc` is null).
// d: the decorator currently being applied.
var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
// Delegate to Reflect.decorate when the runtime provides it. Otherwise apply the decorators
// from last to first; a decorator that returns a falsy value leaves `r` unchanged.
if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
// With more than three arguments and a truthy result, define the result on target[key].
// The return value is always `r`.
return c > 3 && r && Object.defineProperty(target, key, r), r;
};
var __metadata = (this && this.__metadata) || function (k, v) {
if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v);
};
var __param = (this && this.__param) || function (paramIndex, decorator) {
return function (target, key) { decorator(target, key, paramIndex); }
};
var HtmlParserService_1;
Object.defineProperty(exports, "__esModule", { value: true });
exports.HtmlParserService = void 0;
const common_1 = require("@nestjs/common");
const axios_1 = require("axios");
const cheerio = require("cheerio");
const https_proxy_agent_1 = require("https-proxy-agent");
const jsdom_1 = require("jsdom");
const socks_proxy_agent_1 = require("socks-proxy-agent");
const html_parser_config_1 = require("./html-parser.config");
/**
* HTML Parser Service for NestJS
*
* A powerful service for parsing HTML content with support for:
* - XPath and CSS selector extraction
* - Proxy configuration with authentication
* - Random user agent rotation
* - Retry logic with configurable delays
* - Verbose logging for debugging
* - Rich response metadata including headers and status codes
*
* @example
* ```typescript
* const parser = new HtmlParserService();
*
* // Fetch HTML with options
* const response = await parser.fetchHtml('https://example.com', {
* timeout: 10000,
* useRandomUserAgent: true,
* verbose: true
* });
*
* // Extract data using XPath
* const title = parser.extractSingle(response.data, '//title/text()');
*
* // Extract structured data
* const articles = parser.extractStructuredList(response.data, '//article', {
* title: { selector: './/h2/text()', type: 'xpath' },
* link: { selector: './/a', type: 'xpath', attribute: 'href' }
* });
* ```
*/
let HtmlParserService = HtmlParserService_1 = class HtmlParserService {
logger;
loggerLevel;
/**
* Default configuration options for HTML parsing operations
*/
defaultOptions = {
// Request timeout in milliseconds; fetchHtml passes it to axios as `timeout`.
timeout: 10000,
// Sent as the User-Agent header unless useRandomUserAgent is set; also the fallback
// returned by getRandomUserAgent when the random generator cannot be used.
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
// When true, fetchHtml asks getRandomUserAgent() for the User-Agent on every attempt.
useRandomUserAgent: false,
// Additional attempts after the first one (fetchHtml loops from 0 to `retries` inclusive).
retries: 3,
// Pause between attempts, in milliseconds.
retryDelay: 1000,
// Enables the debug-level progress messages emitted by the service methods.
verbose: false,
// Passed to the HTTPS agent as `rejectUnauthorized` unless ignoreSSLErrors is set.
rejectUnauthorized: true,
// When true, fetchHtml forces rejectUnauthorized to false and widens the accepted
// TLS versions and cipher list.
ignoreSSLErrors: false,
// When true, fetchHtml installs a `checkServerIdentity` callback that always returns undefined.
disableServerIdentityCheck: false,
// Passed to axios as `maxRedirects`.
maxRedirects: 5,
// Per-error-category retry switches. fetchHtml hands the merged config to
// shouldRetryOnError (defined outside this view), which presumably reads these — TODO confirm.
retryOnErrors: {
ssl: false,
timeout: true,
dns: true,
connectionRefused: true,
},
};
/**
* Initialize the HTML Parser Service
*/
constructor(loggerLevel) {
this.logger = new common_1.Logger(HtmlParserService_1.name, { timestamp: true });
this.loggerLevel = loggerLevel || ['log', 'error', 'debug'];
}
/**
* Helper to check if a log level should be logged
* Fixed to properly handle both single and array logger configurations
*/
shouldLog(level) {
// Define the logging level hierarchy
const levelHierarchy = [
'error',
'warn',
'log',
'debug',
'verbose',
];
// Normalize the input level to LogLevel type
const targetLevel = level;
if (Array.isArray(this.loggerLevel)) {
// If loggerLevel is an array, check if the target level is included
return this.loggerLevel.includes(targetLevel);
}
else {
// If loggerLevel is a single level, use hierarchy
const currentIndex = levelHierarchy.indexOf(this.loggerLevel);
const targetIndex = levelHierarchy.indexOf(targetLevel);
// Return true if target level has higher or equal priority
return (currentIndex !== -1 && targetIndex !== -1 && targetIndex <= currentIndex);
}
}
/**
* Helper method to log with proper level checking
*/
logWithLevel(level, message, ...optionalParams) {
if (this.shouldLog(level)) {
switch (level) {
case 'error':
this.logger.error(message, ...optionalParams);
break;
case 'warn':
this.logger.warn(message, ...optionalParams);
break;
case 'log':
this.logger.log(message, ...optionalParams);
break;
case 'debug':
this.logger.debug(message, ...optionalParams);
break;
case 'verbose':
this.logger.verbose(message, ...optionalParams);
break;
default:
this.logger.log(message, ...optionalParams);
}
}
}
/**
* No-op: this method has an empty body and does not touch the console.
*
* Per the inline note, console suppression is no longer needed because
* logging goes through the NestJS Logger.
*/
suppressConsole() {
// No longer needed as we're using NestJS Logger
}
/**
* No-op: this method has an empty body and does not touch the console.
*
* Per the inline note, there is nothing to restore because logging goes
* through the NestJS Logger.
*/
restoreConsole() {
// No longer needed as we're using NestJS Logger
}
/**
* Fetch HTML content from a URL with comprehensive configuration options
*
* Supports proxy configuration, custom headers, user agent rotation,
* retry logic, SSL error handling, and rich response metadata. Automatically handles
* different proxy types (HTTP, HTTPS, SOCKS4, SOCKS5) and provides
* detailed error information on failures.
*
* @param url - The URL to fetch HTML content from
* @param options - Configuration options for the request
* @param options.timeout - Request timeout in milliseconds (default: 10000)
* @param options.headers - Custom headers to send with the request
* @param options.userAgent - Custom user agent string
* @param options.useRandomUserAgent - Use a random user agent instead of specified one
* @param options.proxy - Proxy configuration for the request
* @param options.retries - Number of retry attempts on failure (default: 3)
* @param options.retryDelay - Delay between retries in milliseconds (default: 1000)
* @param options.verbose - Enable verbose logging for debugging
* @param options.rejectUnauthorized - Reject unauthorized SSL certificates (default: true)
* @param options.ignoreSSLErrors - Skip SSL certificate verification entirely
* @param options.disableServerIdentityCheck - Skip the TLS server identity check, i.e. verification that the certificate matches the requested hostname (default: false)
* @param options.maxRedirects - Maximum number of redirects to follow (default: 5)
* @param options.retryOnErrors - Configure retry behavior for specific error types
*
* @returns Promise resolving to HtmlFetchResponse with HTML content, headers, and status
*
* @throws Error when all retry attempts fail
*
* @example
* ```typescript
* // Basic usage
* const response = await parser.fetchHtml('https://example.com');
*
* // Handle SSL errors for sites with invalid certificates
* const response = await parser.fetchHtml('https://self-signed-site.com', {
* rejectUnauthorized: false,
* retryOnErrors: { ssl: true }
* });
*
* // Ignore SSL completely (use with caution)
* const response = await parser.fetchHtml('https://expired-cert-site.com', {
* ignoreSSLErrors: true
* });
*
* // Disable only server identity validation (for hostname mismatches)
* const response = await parser.fetchHtml('https://hostname-mismatch-site.com', {
* disableServerIdentityCheck: true
* });
*
* // Robust configuration for unreliable sites
* const response = await parser.fetchHtml('https://unreliable-site.com', {
* retries: 5,
* retryDelay: 2000,
* timeout: 15000,
* retryOnErrors: {
* ssl: true,
* timeout: true,
* dns: true,
* connectionRefused: true
* }
* });
* ```
*/
async fetchHtml(url, options) {
const config = { ...this.defaultOptions, ...options };
let lastError;
const maxRetries = config.retries ?? this.defaultOptions.retries ?? 3;
const retryDelay = config.retryDelay ?? this.defaultOptions.retryDelay ?? 1000;
if (config.verbose) {
this.logWithLevel('debug', `🌐 Fetching URL: ${url}`);
this.logWithLevel('debug', `🔧 Configuration:`, {
timeout: config.timeout,
retries: maxRetries,
rejectUnauthorized: config.rejectUnauthorized,
ignoreSSLErrors: config.ignoreSSLErrors,
disableServerIdentityCheck: config.disableServerIdentityCheck,
maxRedirects: config.maxRedirects,
});
}
for (let attempt = 0; attempt <= maxRetries; attempt++) {
try {
if (config.verbose && attempt > 0) {
this.logWithLevel('debug', `🔄 Retry attempt ${attempt}/${maxRetries}`);
}
// Get user agent - either random or specified
const userAgent = config.useRandomUserAgent
? await this.getRandomUserAgent()
: config.userAgent;
// Create axios config with SSL handling
const axiosConfig = {
timeout: config.timeout,
maxRedirects: config.maxRedirects ?? 5,
headers: {
'User-Agent': userAgent,
...config.headers,
},
// SSL configuration - enhanced to handle modern SSL/TLS issues
httpsAgent: new (require('https').Agent)({
rejectUnauthorized: config.ignoreSSLErrors
? false
: (config.rejectUnauthorized ?? true),
// Use more modern and compatible TLS settings when ignoring SSL errors
secureProtocol: config.ignoreSSLErrors ? undefined : undefined,
minVersion: config.ignoreSSLErrors ? 'TLSv1' : undefined,
maxVersion: config.ignoreSSLErrors ? 'TLSv1.3' : undefined,
// Add cipher support for legacy servers
ciphers: config.ignoreSSLErrors
? 'ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-SHA256:ECDHE-RSA-AES256-SHA384:ECDHE-RSA-AES128-SHA:ECDHE-RSA-AES256-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA256:AES256-SHA256:AES128-SHA:AES256-SHA:DES-CBC3-SHA'
: undefined,
// Configure server name indication validation for problematic sites
...(config.disableServerIdentityCheck
? { checkServerIdentity: () => undefined }
: {}), // When false or undefined, Node.js uses default tls.checkServerIdentity
}),
};
// Add proxy configuration if provided
if (config.proxy) {
axiosConfig.httpAgent = this.createProxyAgent(config.proxy, false);
axiosConfig.httpsAgent = this.createProxyAgent(config.proxy, true);
}
const response = await axios_1.default.get(url, axiosConfig);
if (config.verbose) {
this.logWithLevel('debug', `✅ Successfully fetched ${url} (${response.status} ${response.statusText})`);
}
return {
data: response.data,
headers: this.normalizeHeaders(response.headers),
status: response.status,
statusText: response.statusText,
};
}
catch (error) {
lastError = error instanceof Error ? error : new Error(String(error));
const errorInfo = this.categorizeError(lastError);
if (this.shouldLog('error')) {
this.logWithLevel('error', `❌ Attempt ${attempt + 1} failed: ${errorInfo.type} - ${lastError.message}`);
}
// Check if we should retry based on error type
const shouldRetry = this.shouldRetryOnError(errorInfo, config);
if (config.verbose) {
this.logWithLevel('debug', `🤔 Should retry: ${shouldRetry}, Attempts left: ${maxRetries - attempt}`);
}
// If this is not the last attempt and we should retry this error type
if (attempt < maxRetries && shouldRetry) {
if (config.verbose) {
this.logWithLevel('debug', `⏳ Waiting ${retryDelay}ms before retry...`);
}
await this.delay(retryDelay);
continue;
}
// If we shouldn't retry this error type, break early
if (!shouldRetry) {
if (config.verbose) {
this.logWithLevel('debug', `🚫 Not retrying ${errorInfo.type} error`);
}
break;
}
}
}
// Enhanced error message with categorized error info
const errorInfo = lastError
? this.categorizeError(lastError)
: { type: 'unknown', description: 'Unknown error' };
throw new Error(`Failed to fetch HTML from ${url} after ${maxRetries + 1} attempts. ` +
`Error type: ${errorInfo.type}. ${errorInfo.description}. ` +
`Last error: ${lastError?.message || 'Unknown error'}`);
}
/**
* Create a proxy agent based on proxy configuration
*/
createProxyAgent(proxy, isHttps) {
if (!proxy.url || proxy.url.trim() === '') {
throw new Error('Proxy URL cannot be empty');
}
let proxyUrl = proxy.url;
try {
const url = new URL(proxy.url);
// If separate username/password are provided, they take precedence
if (proxy.username && proxy.password) {
url.username = proxy.username;
url.password = proxy.password;
}
// If URL already contains credentials and no separate creds provided, keep them
// (URL constructor automatically parses user:pass@host format)
proxyUrl = url.toString();
}
catch (error) {
// If URL parsing fails, try to construct a basic URL
// This handles cases where the URL might be in a non-standard format
if (proxy.username && proxy.password) {
// Try to add credentials to potentially malformed URL
const hasProtocol = proxy.url.includes('://');
if (hasProtocol) {
const [protocol, rest] = proxy.url.split('://');
proxyUrl = `${protocol}://${proxy.username}:${proxy.password}@${rest}`;
}
else {
// Assume http if no protocol specified
proxyUrl = `http://${proxy.username}:${proxy.password}@${proxy.url}`;
}
}
}
// Determine proxy type from URL if not specified
const proxyType = proxy.type || this.detectProxyType(proxy.url);
switch (proxyType) {
case 'socks4':
case 'socks5':
return new socks_proxy_agent_1.SocksProxyAgent(proxyUrl);
case 'http':
case 'https':
default:
return new https_proxy_agent_1.HttpsProxyAgent(proxyUrl);
}
}
/**
* Detect proxy type from URL
*/
detectProxyType(url) {
const protocol = url.split('://')[0].toLowerCase();
switch (protocol) {
case 'socks4':
case 'socks5':
return protocol;
case 'http':
case 'https':
return protocol;
default:
return 'http';
}
}
/**
* Delay function for retries
*/
delay(ms) {
return new Promise((resolve) => setTimeout(resolve, ms));
}
/**
* Generate a random user agent string
*
* Returns a realistic user agent string selected randomly from a pool
* of current browser user agents. Useful for avoiding detection when
* scraping websites that block requests with default user agents.
* Falls back to a default user agent if the random generation fails.
*
* @returns Promise resolving to a random user agent string
*
* @example
* ```typescript
* // Get a random user agent
* const userAgent = await parser.getRandomUserAgent();
* console.log(userAgent);
* // Result: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36..."
*
* // Use with fetchHtml for better stealth
* const response = await parser.fetchHtml('https://example.com', {
* useRandomUserAgent: true // This uses getRandomUserAgent() internally
* });
*
* // Or manually
* const customUserAgent = await parser.getRandomUserAgent();
* const response = await parser.fetchHtml('https://example.com', {
* userAgent: customUserAgent
* });
* ```
*/
async getRandomUserAgent() {
try {
const { randUA } = await Promise.resolve().then(() => require('@ahmedrangel/rand-user-agent'));
return randUA();
}
catch (error) {
// Fallback to default user agent if dynamic import fails
return (this.defaultOptions.userAgent ||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
}
}
/**
* Test proxy connection and authentication
*
* Validates that a proxy configuration is working by attempting
* to fetch a test URL through the proxy. Useful for verifying
* proxy credentials and connectivity before using it for actual
* HTML parsing operations.
*
* @param proxy - Proxy configuration to test
* @param testUrl - URL to use for testing proxy connection (default: 'https://httpbin.org/ip')
*
* @returns Promise resolving to true if proxy works, false otherwise
*
* @example
* ```typescript
* const proxyConfig = {
* url: 'http://proxy.example.com:8080',
* username: 'user',
* password: 'pass'
* };
*
* const isWorking = await parser.testProxy(proxyConfig);
* if (isWorking) {
* console.log('Proxy is working!');
* } else {
* console.log('Proxy failed or authentication invalid');
* }
*
* // Test with custom URL
* const isWorking = await parser.testProxy(proxyConfig, 'https://example.com');
* ```
*/
async testProxy(proxy, testUrl = 'https://httpbin.org/ip') {
try {
await this.fetchHtml(testUrl, {
proxy,
timeout: 5000,
retries: 0,
});
return true;
}
catch (error) {
return false;
}
}
/**
* Extract a single value from HTML using XPath or CSS selectors
*
* Extracts the first matching element's text content or attribute value.
* Supports both XPath expressions (recommended) and CSS selectors.
* Can extract specific attributes from elements or their text content.
*
* @param html - HTML content to parse
* @param selector - XPath expression or CSS selector to locate the element
* @param type - Type of selector: 'xpath' (default) or 'css'
* @param attribute - HTML attribute to extract (optional, extracts text content if not specified)
* @param options - Parsing options
* @param options.verbose - Enable verbose logging for debugging
*
* @returns The extracted text/attribute value, or null if no match found
*
* @example
* ```typescript
* const html = '<div><h1 id="title">Welcome</h1><a href="/home">Home</a></div>';
*
* // Extract text content using XPath
* const title = parser.extractSingle(html, '//h1[@id="title"]/text()');
* // Result: "Welcome"
*
* // Extract attribute using XPath
* const link = parser.extractSingle(html, '//a', 'xpath', 'href');
* // Result: "/home"
*
* // Extract using CSS selector
* const titleCSS = parser.extractSingle(html, 'h1#title', 'css');
* // Result: "Welcome"
*
* // With type safety and transformation
* const id = parser.extractSingle<number>(html, '//div/@data-id', 'xpath', undefined, {
* transform: (value: string) => parseInt(value)
* });
* // Result: number | null
*
* // With verbose logging
* const result = parser.extractSingle(html, '//h1/text()', 'xpath', undefined, { verbose: true });
* ```
*/
extractSingle(html, selector, type = 'xpath', attribute, options) {
const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false;
if (verbose) {
this.logWithLevel('debug', `🔍 extractSingle - Selector: "${selector}", Type: ${type}, Attribute: ${attribute || 'none'}`);
}
try {
let result;
if (type === 'xpath') {
result = this.extractSingleXPath(html, selector, attribute, verbose);
}
else {
result = this.extractSingleCSS(html, selector, attribute);
}
if (verbose) {
this.logWithLevel('debug', `✅ extractSingle result: ${result ? `"${result.substring(0, 100)}${result.length > 100 ? '...' : ''}"` : 'null'}`);
}
// Apply transformation if provided and result exists
if (result !== null && options?.transform) {
return this.applyTransform(result, options.transform, options.baseUrl);
}
// Return as T if no transform (assumes T extends string when no transform)
return result;
}
catch (error) {
if (this.shouldLog('error')) {
this.logWithLevel('error', '❌ Error in extractSingle:', error);
}
return null;
}
}
/**
* Extract multiple values from HTML using XPath or CSS selectors
*
* Extracts text content or attribute values from all matching elements.
* Returns an array of strings containing all found values. Supports
* both XPath expressions and CSS selectors with attribute extraction.
*
* @param html - HTML content to parse
* @param selector - XPath expression or CSS selector to locate elements
* @param type - Type of selector: 'xpath' (default) or 'css'
* @param attribute - HTML attribute to extract (optional, extracts text content if not specified)
* @param options - Parsing options
* @param options.verbose - Enable verbose logging for debugging
*
* @returns Array of extracted text/attribute values (empty array if no matches)
*
* @example
* ```typescript
* const html = `
* <ul>
* <li><a href="/page1">Page 1</a></li>
* <li><a href="/page2">Page 2</a></li>
* <li><a href="/page3">Page 3</a></li>
* </ul>
* `;
*
* // Extract all link texts using XPath
* const linkTexts = parser.extractMultiple(html, '//a/text()');
* // Result: ["Page 1", "Page 2", "Page 3"]
*
* // Extract all href attributes using XPath
* const hrefs = parser.extractMultiple(html, '//a', 'xpath', 'href');
* // Result: ["/page1", "/page2", "/page3"]
*
* // Extract using CSS selector
* const linksCSS = parser.extractMultiple(html, 'li a', 'css');
* // Result: ["Page 1", "Page 2", "Page 3"]
*
* // With type safety and transformation
* const ids = parser.extractMultiple<number>(html, '//li/@data-id', 'xpath', undefined, {
* transform: (value: string) => parseInt(value)
* });
* // Result: number[]
*
* // With verbose logging
* const results = parser.extractMultiple(html, '//li', 'xpath', undefined, { verbose: true });
* ```
*/
extractMultiple(html, selector, type = 'xpath', attribute, options) {
const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false;
if (verbose) {
this.logWithLevel('debug', `🔍 extractMultiple - Selector: "${selector}", Type: ${type}, Attribute: ${attribute || 'none'}`);
}
try {
let results;
if (type === 'xpath') {
results = this.extractMultipleXPath(html, selector, attribute, verbose);
}
else {
results = this.extractMultipleCSS(html, selector, attribute);
}
if (verbose) {
this.logWithLevel('debug', `✅ extractMultiple found ${results.length} results`);
if (results.length > 0 && results.length <= 5) {
results.forEach((result, index) => {
this.logWithLevel('debug', ` ${index + 1}: "${result.substring(0, 80)}${result.length > 80 ? '...' : ''}"`);
});
}
else if (results.length > 5) {
this.logWithLevel('debug', ` First 3 results:`);
results.slice(0, 3).forEach((result, index) => {
this.logWithLevel('debug', ` ${index + 1}: "${result.substring(0, 80)}${result.length > 80 ? '...' : ''}"`);
});
this.logWithLevel('debug', ` ... and ${results.length - 3} more`);
}
}
// Apply transformation if provided
if (options?.transform) {
return this.applyTransform(results, options.transform, options.baseUrl);
}
// Return as T[] if no transform (assumes T extends string when no transform)
return results;
}
catch (error) {
if (this.shouldLog('error')) {
this.logWithLevel('error', '❌ Error in extractMultiple:', error);
}
return [];
}
}
/**
* Extract text content from HTML elements
*
* Convenience method specifically for extracting text content from elements.
* This is equivalent to calling extractSingle without an attribute parameter.
* Useful when you only need the text content and want clearer intent.
*
* @param html - HTML content to parse
* @param selector - XPath expression or CSS selector to locate the element
* @param type - Type of selector: 'xpath' (default) or 'css'
* @param options - Parsing options
* @param options.verbose - Enable verbose logging for debugging
*
* @returns The extracted text content, or null if no match found
*
* @example
* ```typescript
* const html = '<div><h1>Main Title</h1><p>Description text</p></div>';
*
* // Extract heading text
* const title = parser.extractText(html, '//h1');
* // Result: "Main Title"
*
* // Extract paragraph text using CSS
* const description = parser.extractText(html, 'p', 'css');
* // Result: "Description text"
*
* // With type safety and transformation
* const wordCount = parser.extractText<number>(html, '//p', 'xpath', {
* transform: (text: string) => text.split(' ').length
* });
* // Result: number | null
*
* // With verbose logging
* const text = parser.extractText(html, '//p/text()', 'xpath', { verbose: true });
* ```
*/
extractText(html, selector, type = 'xpath', options) {
const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false;
try {
let result;
if (type === 'xpath') {
result = this.extractSingleXPath(html, selector, undefined, verbose);
}
else {
result = this.extractSingleCSS(html, selector);
}
// Apply transformation if provided and result exists
if (result !== null && options?.transform) {
return this.applyTransform(result, options.transform, options.baseUrl);
}
// Return as T if no transform (assumes T extends string when no transform)
return result;
}
catch (error) {
if (this.shouldLog('error')) {
this.logWithLevel('error', 'Error in extractText:', error);
}
return null;
}
}
/**
* Extract attribute values from multiple HTML elements
*
* Convenience method for extracting a specific attribute from all matching elements.
* This is equivalent to calling extractMultiple with an attribute parameter.
* Returns all attribute values from elements that match the selector.
*
* @param html - HTML content to parse
* @param selector - XPath expression or CSS selector to locate elements
* @param attribute - HTML attribute name to extract
* @param type - Type of selector: 'xpath' (default) or 'css'
* @param options - Parsing options
* @param options.verbose - Enable verbose logging for debugging
*
* @returns Array of attribute values (empty array if no matches or no attribute)
*
* @example
* ```typescript
* const html = `
* <nav>
* <a href="/home" title="Home Page">Home</a>
* <a href="/about" title="About Us">About</a>
* <a href="/contact" title="Contact Form">Contact</a>
* </nav>
* `;
*
* // Extract all href attributes
* const links = parser.extractAttributes(html, '//a', 'href');
* // Result: ["/home", "/about", "/contact"]
*
* // Extract all title attributes
* const titles = parser.extractAttributes(html, '//a', 'title', 'xpath');
* // Result: ["Home Page", "About Us", "Contact Form"]
*
* // Using CSS selector
* const hrefs = parser.extractAttributes(html, 'nav a', 'href', 'css');
* // Result: ["/home", "/about", "/contact"]
*
* // With type safety and transformation
* const ids = parser.extractAttributes<number>(html, '//img', 'data-id', 'xpath', {
* transform: (value: string) => parseInt(value)
* });
* // Result: number[]
* ```
*/
extractAttributes(html, selector, attribute, type = 'xpath', options) {
const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false;
try {
let results;
if (type === 'xpath') {
results = this.extractMultipleXPath(html, selector, attribute, verbose);
}
else {
results = this.extractMultipleCSS(html, selector, attribute);
}
// Apply transformation if provided
if (options?.transform) {
return this.applyTransform(results, options.transform, options.baseUrl);
}
// Return as T[] if no transform (assumes T extends string when no transform)
return results;
}
catch (error) {
if (this.shouldLog('error')) {
this.logWithLevel('error', 'Error in extractAttributes:', error);
}
return [];
}
}
/**
* Check if elements exist in HTML content
*
* Tests whether the specified selector matches any elements in the HTML.
* Useful for conditional logic based on element presence or for validating
* HTML structure before attempting extractions.
*
* @param html - HTML content to search
* @param selector - XPath expression or CSS selector to test
* @param type - Type of selector: 'xpath' (default) or 'css'
* @param options - Parsing options
* @param options.verbose - Enable verbose logging for debugging
*
* @returns true if at least one element matches, false otherwise
*
* @example
* ```typescript
* const html = '<div><h1>Title</h1><p class="content">Text</p></div>';
*
* // Check if title exists
* const hasTitle = parser.exists(html, '//h1');
* // Result: true
*
* // Check if specific class exists
* const hasContent = parser.exists(html, '//p[@class="content"]');
* // Result: true
*
* // Check for non-existent element
* const hasFooter = parser.exists(html, '//footer');
* // Result: false
*
* // Using CSS selector
* const hasContentCSS = parser.exists(html, 'p.content', 'css');
* // Result: true
*
* // Conditional extraction based on existence
* if (parser.exists(html, '//nav')) {
* const navigation = parser.extractStructured(html, navSchema);
* }
* ```
*/
exists(html, selector, type = 'xpath', options) {
const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false;
if (verbose && this.shouldLog('debug')) {
this.logWithLevel('debug', `🔍 exists - Checking selector: "${selector}", Type: ${type}`);
}
try {
let result;
if (type === 'xpath') {
const results = this.evaluateXPath(html, selector, verbose);
result = results.length > 0;
}
else {
const $ = cheerio.load(html);
result = $(selector).length > 0;
}
if (verbose && this.shouldLog('debug')) {
this.logWithLevel('debug', `✅ exists result: ${result ? 'Found' : 'Not found'}`);
}
return result;
}
catch (error) {
if (this.shouldLog('error')) {
this.logWithLevel('error', '❌ Error in exists:', error);
}
return false;
}
}
/**
* Count the number of matching elements in HTML content
*
* Returns the total number of elements that match the specified selector.
* Useful for pagination, validation, or determining the size of data sets
* before processing them.
*
* @param html - HTML content to search
* @param selector - XPath expression or CSS selector to count
* @param type - Type of selector: 'xpath' (default) or 'css'
* @param options - Parsing options
* @param options.verbose - Enable verbose logging for debugging
*
* @returns Number of matching elements (0 if no matches)
*
* @example
* ```typescript
* const html = `
* <ul>
* <li>Item 1</li>
* <li>Item 2</li>
* <li>Item 3</li>
* </ul>
* <div class="highlight">Special</div>
* `;
*
* // Count list items
* const itemCount = parser.count(html, '//li');
* // Result: 3
*
* // Count elements with specific class
* const highlightCount = parser.count(html, '//div[@class="highlight"]');
* // Result: 1
*
* // Count using CSS selector
* const listItemsCSS = parser.count(html, 'ul li', 'css');
* // Result: 3
*
* // Use count for conditional processing
* const articleCount = parser.count(html, '//article');
* if (articleCount > 10) {
* console.log('Large dataset detected, processing in batches');
* }
* ```
*/
count(html, selector, type = 'xpath', options) {
const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false;
if (verbose && this.shouldLog('debug')) {
this.logWithLevel('debug', `🔍 count - Counting selector: "${selector}", Type: ${type}`);
}
try {
let result;
if (type === 'xpath') {
const results = this.evaluateXPath(html, selector, verbose);
result = results.length;
}
else {
const $ = cheerio.load(html);
result = $(selector).length;
}
if (verbose && this.shouldLog('debug')) {
this.logWithLevel('debug', `✅ count result: ${result} elements found`);
}
return result;
}
catch (error) {
if (this.shouldLog('error')) {
this.logWithLevel('error', '❌ Error in count:', error);
}
return 0;
}
}
/**
* Extract structured data from HTML using a schema definition
*
* Applies a schema object to extract multiple related fields from HTML content.
* Each field in the schema defines its own selector, type, optional attribute,
* transformation function, and now supports 'multiple' and 'raw' flags for array and raw HTML extraction.
*
* If a schema field includes `multiple: true`, the extracted value for that field
* will be an array of results (with transform applied to each item if provided).
* If a schema field includes `raw: true`, the extracted value will be the raw HTML of the matched element(s).
* Otherwise, a single value is returned as before.
*
* @param html - HTML content to parse
* @param schema - Schema object defining fields to extract (see ExtractionSchema)
* @param options - Parsing options
* @param options.verbose - Enable verbose logging for debugging
*
* @returns Object with extracted data matching the schema structure. Fields with
* `multiple: true` will be arrays, `raw: true` will be raw HTML, others will be single values.
*
* @example
* ```typescript
* const html = `
* <article>
* <h1>Product Name</h1>
* <span class="price">$29.99</span>
* <img src="/image.jpg" alt="Product">
* <div class="rating" data-stars="4">★★★★☆</div>
* <div class="tags">
* <span class="tag">electronics</span>
* <span class="tag">gadget</span>
* </div>
* </article>
* `;
*
* // Define typed interface
* interface Product {
* title: string;
* price: number;
* image: string;
* rating: number;
* tags: string[]; // <-- array field
* titleHtml: string; // <-- raw HTML field
* }
*
* const productSchema: ExtractionSchema<Product> = {
* title: {
* selector: '//h1/text()',
* type: 'xpath'
* },
* price: {
* selector: '//span[@class="price"]/text()',
* type: 'xpath',
* transform: (price) => parseFloat(price.replace('$', ''))
* },
* image: {
* selector: '//img',
* type: 'xpath',
* attribute: 'src'
* },
* rating: {
* selector: '//div[@class="rating"]',
* type: 'xpath',
* attribute: 'data-stars',
* transform: (stars) => parseInt(stars)
* },
* tags: {
* selector: '//span[@class="tag"]/text()',
* type: 'xpath',
* multiple: true // <-- NEW: array extraction
* },
* titleHtml: {
* selector: '//h1',
* type: 'xpath',
* raw: true // <-- NEW: raw HTML extraction
* }
* };
*
* const product = parser.extractStructured<Product>(html, productSchema);
* // Result: Product type with full type safety
* // {
* // title: "Product Name",
* // price: 29.99,
* // image: "/image.jpg",
* // rating: 4,
* // tags: ["electronics", "gadget"], // <-- array result
* // titleHtml: "<h1>Product Name</h1>" // <-- raw HTML result
* // }
* ```
*/
extractStructured(html, schema, options) {
const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false;
const baseUrl = options?.baseUrl;
const result = {};
if (verbose) {
this.logWithLevel('debug', `🔍 extractStructured - Processing ${Object.keys(schema).length} schema fields`);
}
try {
for (const [key, config] of Object.entries(schema)) {
try {
let value;
const raw = config.raw === true;
if (config.multiple) {
// Use extractMultiple for array fields
if (config.type === 'xpath') {
value = this.extractMultipleXPath(html, config.selector, config.attribute, verbose, raw);
}
else {
if (raw) {
value = this.extractMultipleCSS(html, config.selector, undefined, true);
}
else {
value = this.extractMultipleCSS(html, config.selector, config.attribute);
}
}
// Apply transformation if provided
const transform = config.transform;
if (transform) {
value = this.applyTransform(value, transform, baseUrl);
}
}
else {
// Use extractSingle for single value fields
if (config.type === 'xpath') {
value = this.extractSingleXPath(html, config.selector, config.attribute, verbose, raw);
}
else {
if (raw) {
value = this.extractSingleCSS(html, config.selector, undefined, true);
}
else {
value = this.extractSingleCSS(html, config.selector, config.attribute);
}
}
// Apply transformation if provided
if (value && config.transform) {
value = this.applyTransform(value, config.transform, baseUrl);
}
}
result[key] = value;
if (verbose) {
this.logWithLevel('debug', `✅ Extracted field '${key}': ${value ? 'success' : 'null/empty'}`);
}
}
catch (error) {
if (this.shouldLog('error')) {
this.logWithLevel('error', `❌ Error extracting field '${key}':`, error);
}
result[key] = config.multiple ? [] : null;
}
}
}
catch (error) {
if (this.shouldLog('error')) {
this.logWithLevel('error', '❌ Error in extractStructured:', error);
}
}
return result;
}
/**
* Extract structured data directly from a container node (preserves XPath context)
*
* This method works directly with DOM nodes to preserve the XPath evaluation context,
* allowing relative XPath expressions like './td[2]//a/@href' to work correctly.
*
* @param containerNode - The DOM node to extract data from
* @param schema - Schema object defining fields to extract
* @param options - Parsing options
*/
extractStructuredFromNode(containerNode, schema, options) {
const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false;
const baseUrl = options?.baseUrl;
const result = {};
if (verbose) {
this.logWithLevel('debug', `🔍 extractStructuredFromNode - Processing ${Object.keys(schema).length} schema fields`);
}
try {
for (const [key, config] of Object.entries(schema)) {
try {
let value;
const raw = config.raw === true;
if (config.multiple) {
// Use extractMultiple for array fields
if (config.type === 'xpath') {
value = this.extractMultipleXPathFromNode(containerNode, config.selector, config.attribute, verbose, raw);
}
else {
// CSS selectors need to work with HTML string
const containerHTML = this.getElementHTML(containerNode);
if (raw) {
value = this.extractMultipleCSS(containerHTML, config.selector, undefined, true);
}
else {
value = this.extractMultipleCSS(containerHTML, config.selector, config.attribute);
}
}
// Apply transformation if provided
const transform = config.transform;
if (transform) {
value = this.applyTransform(value, transform, baseUrl);
}
}
else {
// Use extractSingle for single value fields
if (config.type === 'xpath') {
value = this.extractSingleXPathFromNode(containerNode, config.selector, config.attribute, verbose, raw);
}
else {
// CSS selectors need to work with HTML string
const containerHTML = this.getElementHTML(containerNode);
if (raw) {
value = this.extractSingleCSS(containerHTML, config.selector, undefined, true);
}
else {
value = this.extractSingleCSS(containerHTML, config.selector, config.attribute);
}
}
// Apply transformation if provided
if (value && config.transform) {
value = this.applyTransform(value, config.transform, baseUrl);
}
}
result[key] = value;
if (verbose) {
this.logWithLevel('debug', `✅ Extracted field '${key}': ${value ? 'success' : 'null/empty'}`);
}
}
catch (error) {
if (this.shouldLog('error')) {
this.logWithLevel('error', `❌ Error extracting field '${key}':`, error);
}
result[key] = config.multiple ? [] : null;
}
}
}
catch (error) {
if (this.shouldLog('error')) {
this.logWithLevel('error', '❌ Error in extractStructuredFromNode:', error);
}
}
return result;
}
/**
* Extract array of structured data from repeating HTML elements
*
* For each container, applies the schema as in extractStructured. If a schema field
* includes `multiple: true`, the extracted value for that field will be an array of results.
* If a schema field includes `raw: true`, the extracted value will be the raw HTML of the matched element(s).
*
* @param html - HTML content to parse
* @param containerSelector - XPath or CSS selector to find container elements
* @param schema - Schema object defining fields to extract from each container
* @param containerType - Type of container selector: 'xpath' (default) or 'css'
* @param options - Parsing options
* @param options.verbose - Enable verbose logging for debugging
*
* @returns Array of objects with extracted data matching the schema structure. Fields with
* `multiple: true` will be arrays, `raw: true` will be raw HTML, others will be single values.
*
* @example
* ```typescript
* const html = `
* <div class="products">
* <div class="product">
* <h3>Product A</h3>
* <span class="price">$19.99</span>
* <span class="tag">electronics</span>
* <span class="tag">gadget</span>
* </div>
* <div class="product">
* <h3>Product B</h3>
* <span class="price">$29.99</span>
* <span class="tag">accessory</span>
* </div>
* </div>
* `;
*
* // Define typed interface
* interface Product {
* name: string;
* price: number;
* tags: string[]; // <-- array field
* nameHtml: string; // <-- raw HTML field
* }
*
* const productSchema: ExtractionSchema<Product> = {
* name: {
* selector: './/h3/text()',
* type: 'xpath'
* },
* price: {
* selector: './/span[@class="price"]/text()',
* type: 'xpath',
* transform: (value) => parseFloat(value.replace('$', ''))
* },
* tags: {
* selector: './/span[@class="tag"]/text()',
* type: 'xpath',
* multiple: true // <-- NEW: array extraction
* },
*