UNPKG

@hanivanrizky/nestjs-html-parser

Version:

A powerful NestJS HTML parsing service with XPath and CSS selector support, proxy configuration, random user agents, and rich response metadata including headers and status codes

1,205 lines 85.8 kB
"use strict"; var __decorate = (this && this.__decorate) || function (decorators, target, key, desc) { var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d; if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc); else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r; return c > 3 && r && Object.defineProperty(target, key, r), r; }; var __metadata = (this && this.__metadata) || function (k, v) { if (typeof Reflect === "object" && typeof Reflect.metadata === "function") return Reflect.metadata(k, v); }; var __param = (this && this.__param) || function (paramIndex, decorator) { return function (target, key) { decorator(target, key, paramIndex); } }; var HtmlParserService_1; Object.defineProperty(exports, "__esModule", { value: true }); exports.HtmlParserService = void 0; const common_1 = require("@nestjs/common"); const axios_1 = require("axios"); const cheerio = require("cheerio"); const https_proxy_agent_1 = require("https-proxy-agent"); const jsdom_1 = require("jsdom"); const socks_proxy_agent_1 = require("socks-proxy-agent"); const html_parser_config_1 = require("./html-parser.config"); /** * HTML Parser Service for NestJS * * A powerful service for parsing HTML content with support for: * - XPath and CSS selector extraction * - Proxy configuration with authentication * - Random user agent rotation * - Retry logic with configurable delays * - Verbose logging for debugging * - Rich response metadata including headers and status codes * * @example * ```typescript * const parser = new HtmlParserService(); * * // Fetch HTML with options * const response = await parser.fetchHtml('https://example.com', { * timeout: 10000, * useRandomUserAgent: true, * verbose: true * }); * * // Extract data using XPath * const title = parser.extractSingle(response.data, '//title/text()'); * * // Extract structured data * const articles = parser.extractStructuredList(response.data, '//article', { * title: { selector: './/h2/text()', type: 'xpath' }, * link: { selector: './/a', type: 'xpath', attribute: 'href' } * }); * ``` */ let HtmlParserService = HtmlParserService_1 = class HtmlParserService { logger; loggerLevel; /** * Default configuration options for HTML parsing operations */ defaultOptions = { timeout: 10000, userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', useRandomUserAgent: false, retries: 3, retryDelay: 1000, verbose: false, rejectUnauthorized: true, ignoreSSLErrors: false, disableServerIdentityCheck: false, maxRedirects: 5, retryOnErrors: { ssl: false, timeout: true, dns: true, connectionRefused: true, }, }; /** * Initialize the HTML Parser Service */ constructor(loggerLevel) { this.logger = new common_1.Logger(HtmlParserService_1.name, { timestamp: true }); this.loggerLevel = loggerLevel || ['log', 'error', 'debug']; } /** * Helper to check if a log level should be logged * Fixed to properly handle both single and array logger configurations */ shouldLog(level) { // Define the logging level hierarchy const levelHierarchy = [ 'error', 'warn', 'log', 'debug', 'verbose', ]; // Normalize the input level to LogLevel type const targetLevel = level; if (Array.isArray(this.loggerLevel)) { // If loggerLevel is an array, check if the target level is included return this.loggerLevel.includes(targetLevel); } else { // If loggerLevel is a single level, use hierarchy const currentIndex = levelHierarchy.indexOf(this.loggerLevel); const targetIndex = levelHierarchy.indexOf(targetLevel); // Return true if target level has higher or equal priority return (currentIndex !== -1 && targetIndex !== -1 && targetIndex <= currentIndex); } } /** * Helper method to log with proper level checking */ logWithLevel(level, message, ...optionalParams) { if (this.shouldLog(level)) { switch (level) { case 'error': this.logger.error(message, ...optionalParams); break; case 'warn': this.logger.warn(message, ...optionalParams); break; case 'log': this.logger.log(message, ...optionalParams); break; case 'debug': this.logger.debug(message, ...optionalParams); break; case 'verbose': this.logger.verbose(message, ...optionalParams); break; default: this.logger.log(message, ...optionalParams); } } } /** * Suppress console output when verbose is false */ suppressConsole() { // No longer needed as we're using NestJS Logger } /** * Restore console output */ restoreConsole() { // No longer needed as we're using NestJS Logger } /** * Fetch HTML content from a URL with comprehensive configuration options * * Supports proxy configuration, custom headers, user agent rotation, * retry logic, SSL error handling, and rich response metadata. Automatically handles * different proxy types (HTTP, HTTPS, SOCKS4, SOCKS5) and provides * detailed error information on failures. * * @param url - The URL to fetch HTML content from * @param options - Configuration options for the request * @param options.timeout - Request timeout in milliseconds (default: 10000) * @param options.headers - Custom headers to send with the request * @param options.userAgent - Custom user agent string * @param options.useRandomUserAgent - Use a random user agent instead of specified one * @param options.proxy - Proxy configuration for the request * @param options.retries - Number of retry attempts on failure (default: 3) * @param options.retryDelay - Delay between retries in milliseconds (default: 1000) * @param options.verbose - Enable verbose logging for debugging * @param options.rejectUnauthorized - Reject unauthorized SSL certificates (default: true) * @param options.ignoreSSLErrors - Skip SSL certificate verification entirely * @param options.disableServerIdentityCheck - Disable server name indication (SNI) validation * @param options.maxRedirects - Maximum number of redirects to follow (default: 5) * @param options.retryOnErrors - Configure retry behavior for specific error types * * @returns Promise resolving to HtmlFetchResponse with HTML content, headers, and status * * @throws Error when all retry attempts fail * * @example * ```typescript * // Basic usage * const response = await parser.fetchHtml('https://example.com'); * * // Handle SSL errors for sites with invalid certificates * const response = await parser.fetchHtml('https://self-signed-site.com', { * rejectUnauthorized: false, * retryOnErrors: { ssl: true } * }); * * // Ignore SSL completely (use with caution) * const response = await parser.fetchHtml('https://expired-cert-site.com', { * ignoreSSLErrors: true * }); * * // Disable only server identity validation (for hostname mismatches) * const response = await parser.fetchHtml('https://hostname-mismatch-site.com', { * disableServerIdentityCheck: true * }); * * // Robust configuration for unreliable sites * const response = await parser.fetchHtml('https://unreliable-site.com', { * retries: 5, * retryDelay: 2000, * timeout: 15000, * retryOnErrors: { * ssl: true, * timeout: true, * dns: true, * connectionRefused: true * } * }); * ``` */ async fetchHtml(url, options) { const config = { ...this.defaultOptions, ...options }; let lastError; const maxRetries = config.retries ?? this.defaultOptions.retries ?? 3; const retryDelay = config.retryDelay ?? this.defaultOptions.retryDelay ?? 1000; if (config.verbose) { this.logWithLevel('debug', `🌐 Fetching URL: ${url}`); this.logWithLevel('debug', `🔧 Configuration:`, { timeout: config.timeout, retries: maxRetries, rejectUnauthorized: config.rejectUnauthorized, ignoreSSLErrors: config.ignoreSSLErrors, disableServerIdentityCheck: config.disableServerIdentityCheck, maxRedirects: config.maxRedirects, }); } for (let attempt = 0; attempt <= maxRetries; attempt++) { try { if (config.verbose && attempt > 0) { this.logWithLevel('debug', `🔄 Retry attempt ${attempt}/${maxRetries}`); } // Get user agent - either random or specified const userAgent = config.useRandomUserAgent ? await this.getRandomUserAgent() : config.userAgent; // Create axios config with SSL handling const axiosConfig = { timeout: config.timeout, maxRedirects: config.maxRedirects ?? 5, headers: { 'User-Agent': userAgent, ...config.headers, }, // SSL configuration - enhanced to handle modern SSL/TLS issues httpsAgent: new (require('https').Agent)({ rejectUnauthorized: config.ignoreSSLErrors ? false : (config.rejectUnauthorized ?? true), // Use more modern and compatible TLS settings when ignoring SSL errors secureProtocol: config.ignoreSSLErrors ? undefined : undefined, minVersion: config.ignoreSSLErrors ? 'TLSv1' : undefined, maxVersion: config.ignoreSSLErrors ? 'TLSv1.3' : undefined, // Add cipher support for legacy servers ciphers: config.ignoreSSLErrors ? 'ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-SHA256:ECDHE-RSA-AES256-SHA384:ECDHE-RSA-AES128-SHA:ECDHE-RSA-AES256-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA256:AES256-SHA256:AES128-SHA:AES256-SHA:DES-CBC3-SHA' : undefined, // Configure server name indication validation for problematic sites ...(config.disableServerIdentityCheck ? { checkServerIdentity: () => undefined } : {}), // When false or undefined, Node.js uses default tls.checkServerIdentity }), }; // Add proxy configuration if provided if (config.proxy) { axiosConfig.httpAgent = this.createProxyAgent(config.proxy, false); axiosConfig.httpsAgent = this.createProxyAgent(config.proxy, true); } const response = await axios_1.default.get(url, axiosConfig); if (config.verbose) { this.logWithLevel('debug', `✅ Successfully fetched ${url} (${response.status} ${response.statusText})`); } return { data: response.data, headers: this.normalizeHeaders(response.headers), status: response.status, statusText: response.statusText, }; } catch (error) { lastError = error instanceof Error ? error : new Error(String(error)); const errorInfo = this.categorizeError(lastError); if (this.shouldLog('error')) { this.logWithLevel('error', `❌ Attempt ${attempt + 1} failed: ${errorInfo.type} - ${lastError.message}`); } // Check if we should retry based on error type const shouldRetry = this.shouldRetryOnError(errorInfo, config); if (config.verbose) { this.logWithLevel('debug', `🤔 Should retry: ${shouldRetry}, Attempts left: ${maxRetries - attempt}`); } // If this is not the last attempt and we should retry this error type if (attempt < maxRetries && shouldRetry) { if (config.verbose) { this.logWithLevel('debug', `⏳ Waiting ${retryDelay}ms before retry...`); } await this.delay(retryDelay); continue; } // If we shouldn't retry this error type, break early if (!shouldRetry) { if (config.verbose) { this.logWithLevel('debug', `🚫 Not retrying ${errorInfo.type} error`); } break; } } } // Enhanced error message with categorized error info const errorInfo = lastError ? this.categorizeError(lastError) : { type: 'unknown', description: 'Unknown error' }; throw new Error(`Failed to fetch HTML from ${url} after ${maxRetries + 1} attempts. ` + `Error type: ${errorInfo.type}. ${errorInfo.description}. ` + `Last error: ${lastError?.message || 'Unknown error'}`); } /** * Create a proxy agent based on proxy configuration */ createProxyAgent(proxy, isHttps) { if (!proxy.url || proxy.url.trim() === '') { throw new Error('Proxy URL cannot be empty'); } let proxyUrl = proxy.url; try { const url = new URL(proxy.url); // If separate username/password are provided, they take precedence if (proxy.username && proxy.password) { url.username = proxy.username; url.password = proxy.password; } // If URL already contains credentials and no separate creds provided, keep them // (URL constructor automatically parses user:pass@host format) proxyUrl = url.toString(); } catch (error) { // If URL parsing fails, try to construct a basic URL // This handles cases where the URL might be in a non-standard format if (proxy.username && proxy.password) { // Try to add credentials to potentially malformed URL const hasProtocol = proxy.url.includes('://'); if (hasProtocol) { const [protocol, rest] = proxy.url.split('://'); proxyUrl = `${protocol}://${proxy.username}:${proxy.password}@${rest}`; } else { // Assume http if no protocol specified proxyUrl = `http://${proxy.username}:${proxy.password}@${proxy.url}`; } } } // Determine proxy type from URL if not specified const proxyType = proxy.type || this.detectProxyType(proxy.url); switch (proxyType) { case 'socks4': case 'socks5': return new socks_proxy_agent_1.SocksProxyAgent(proxyUrl); case 'http': case 'https': default: return new https_proxy_agent_1.HttpsProxyAgent(proxyUrl); } } /** * Detect proxy type from URL */ detectProxyType(url) { const protocol = url.split('://')[0].toLowerCase(); switch (protocol) { case 'socks4': case 'socks5': return protocol; case 'http': case 'https': return protocol; default: return 'http'; } } /** * Delay function for retries */ delay(ms) { return new Promise((resolve) => setTimeout(resolve, ms)); } /** * Generate a random user agent string * * Returns a realistic user agent string selected randomly from a pool * of current browser user agents. Useful for avoiding detection when * scraping websites that block requests with default user agents. * Falls back to a default user agent if the random generation fails. * * @returns Promise resolving to a random user agent string * * @example * ```typescript * // Get a random user agent * const userAgent = await parser.getRandomUserAgent(); * console.log(userAgent); * // Result: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36..." * * // Use with fetchHtml for better stealth * const response = await parser.fetchHtml('https://example.com', { * useRandomUserAgent: true // This uses getRandomUserAgent() internally * }); * * // Or manually * const customUserAgent = await parser.getRandomUserAgent(); * const response = await parser.fetchHtml('https://example.com', { * userAgent: customUserAgent * }); * ``` */ async getRandomUserAgent() { try { const { randUA } = await Promise.resolve().then(() => require('@ahmedrangel/rand-user-agent')); return randUA(); } catch (error) { // Fallback to default user agent if dynamic import fails return (this.defaultOptions.userAgent || 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); } } /** * Test proxy connection and authentication * * Validates that a proxy configuration is working by attempting * to fetch a test URL through the proxy. Useful for verifying * proxy credentials and connectivity before using it for actual * HTML parsing operations. * * @param proxy - Proxy configuration to test * @param testUrl - URL to use for testing proxy connection (default: 'https://httpbin.org/ip') * * @returns Promise resolving to true if proxy works, false otherwise * * @example * ```typescript * const proxyConfig = { * url: 'http://proxy.example.com:8080', * username: 'user', * password: 'pass' * }; * * const isWorking = await parser.testProxy(proxyConfig); * if (isWorking) { * console.log('Proxy is working!'); * } else { * console.log('Proxy failed or authentication invalid'); * } * * // Test with custom URL * const isWorking = await parser.testProxy(proxyConfig, 'https://example.com'); * ``` */ async testProxy(proxy, testUrl = 'https://httpbin.org/ip') { try { await this.fetchHtml(testUrl, { proxy, timeout: 5000, retries: 0, }); return true; } catch (error) { return false; } } /** * Extract a single value from HTML using XPath or CSS selectors * * Extracts the first matching element's text content or attribute value. * Supports both XPath expressions (recommended) and CSS selectors. * Can extract specific attributes from elements or their text content. * * @param html - HTML content to parse * @param selector - XPath expression or CSS selector to locate the element * @param type - Type of selector: 'xpath' (default) or 'css' * @param attribute - HTML attribute to extract (optional, extracts text content if not specified) * @param options - Parsing options * @param options.verbose - Enable verbose logging for debugging * * @returns The extracted text/attribute value, or null if no match found * * @example * ```typescript * const html = '<div><h1 id="title">Welcome</h1><a href="/home">Home</a></div>'; * * // Extract text content using XPath * const title = parser.extractSingle(html, '//h1[@id="title"]/text()'); * // Result: "Welcome" * * // Extract attribute using XPath * const link = parser.extractSingle(html, '//a', 'xpath', 'href'); * // Result: "/home" * * // Extract using CSS selector * const titleCSS = parser.extractSingle(html, 'h1#title', 'css'); * // Result: "Welcome" * * // With type safety and transformation * const id = parser.extractSingle<number>(html, '//div/@data-id', 'xpath', undefined, { * transform: (value: string) => parseInt(value) * }); * // Result: number | null * * // With verbose logging * const result = parser.extractSingle(html, '//h1/text()', 'xpath', undefined, { verbose: true }); * ``` */ extractSingle(html, selector, type = 'xpath', attribute, options) { const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false; if (verbose) { this.logWithLevel('debug', `🔍 extractSingle - Selector: "${selector}", Type: ${type}, Attribute: ${attribute || 'none'}`); } try { let result; if (type === 'xpath') { result = this.extractSingleXPath(html, selector, attribute, verbose); } else { result = this.extractSingleCSS(html, selector, attribute); } if (verbose) { this.logWithLevel('debug', `✅ extractSingle result: ${result ? `"${result.substring(0, 100)}${result.length > 100 ? '...' : ''}"` : 'null'}`); } // Apply transformation if provided and result exists if (result !== null && options?.transform) { return this.applyTransform(result, options.transform, options.baseUrl); } // Return as T if no transform (assumes T extends string when no transform) return result; } catch (error) { if (this.shouldLog('error')) { this.logWithLevel('error', '❌ Error in extractSingle:', error); } return null; } } /** * Extract multiple values from HTML using XPath or CSS selectors * * Extracts text content or attribute values from all matching elements. * Returns an array of strings containing all found values. Supports * both XPath expressions and CSS selectors with attribute extraction. * * @param html - HTML content to parse * @param selector - XPath expression or CSS selector to locate elements * @param type - Type of selector: 'xpath' (default) or 'css' * @param attribute - HTML attribute to extract (optional, extracts text content if not specified) * @param options - Parsing options * @param options.verbose - Enable verbose logging for debugging * * @returns Array of extracted text/attribute values (empty array if no matches) * * @example * ```typescript * const html = ` * <ul> * <li><a href="/page1">Page 1</a></li> * <li><a href="/page2">Page 2</a></li> * <li><a href="/page3">Page 3</a></li> * </ul> * `; * * // Extract all link texts using XPath * const linkTexts = parser.extractMultiple(html, '//a/text()'); * // Result: ["Page 1", "Page 2", "Page 3"] * * // Extract all href attributes using XPath * const hrefs = parser.extractMultiple(html, '//a', 'xpath', 'href'); * // Result: ["/page1", "/page2", "/page3"] * * // Extract using CSS selector * const linksCSS = parser.extractMultiple(html, 'li a', 'css'); * // Result: ["Page 1", "Page 2", "Page 3"] * * // With type safety and transformation * const ids = parser.extractMultiple<number>(html, '//li/@data-id', 'xpath', undefined, { * transform: (value: string) => parseInt(value) * }); * // Result: number[] * * // With verbose logging * const results = parser.extractMultiple(html, '//li', 'xpath', undefined, { verbose: true }); * ``` */ extractMultiple(html, selector, type = 'xpath', attribute, options) { const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false; if (verbose) { this.logWithLevel('debug', `🔍 extractMultiple - Selector: "${selector}", Type: ${type}, Attribute: ${attribute || 'none'}`); } try { let results; if (type === 'xpath') { results = this.extractMultipleXPath(html, selector, attribute, verbose); } else { results = this.extractMultipleCSS(html, selector, attribute); } if (verbose) { this.logWithLevel('debug', `✅ extractMultiple found ${results.length} results`); if (results.length > 0 && results.length <= 5) { results.forEach((result, index) => { this.logWithLevel('debug', ` ${index + 1}: "${result.substring(0, 80)}${result.length > 80 ? '...' : ''}"`); }); } else if (results.length > 5) { this.logWithLevel('debug', ` First 3 results:`); results.slice(0, 3).forEach((result, index) => { this.logWithLevel('debug', ` ${index + 1}: "${result.substring(0, 80)}${result.length > 80 ? '...' : ''}"`); }); this.logWithLevel('debug', ` ... and ${results.length - 3} more`); } } // Apply transformation if provided if (options?.transform) { return this.applyTransform(results, options.transform, options.baseUrl); } // Return as T[] if no transform (assumes T extends string when no transform) return results; } catch (error) { if (this.shouldLog('error')) { this.logWithLevel('error', '❌ Error in extractMultiple:', error); } return []; } } /** * Extract text content from HTML elements * * Convenience method specifically for extracting text content from elements. * This is equivalent to calling extractSingle without an attribute parameter. * Useful when you only need the text content and want clearer intent. * * @param html - HTML content to parse * @param selector - XPath expression or CSS selector to locate the element * @param type - Type of selector: 'xpath' (default) or 'css' * @param options - Parsing options * @param options.verbose - Enable verbose logging for debugging * * @returns The extracted text content, or null if no match found * * @example * ```typescript * const html = '<div><h1>Main Title</h1><p>Description text</p></div>'; * * // Extract heading text * const title = parser.extractText(html, '//h1'); * // Result: "Main Title" * * // Extract paragraph text using CSS * const description = parser.extractText(html, 'p', 'css'); * // Result: "Description text" * * // With type safety and transformation * const wordCount = parser.extractText<number>(html, '//p', 'xpath', { * transform: (text: string) => text.split(' ').length * }); * // Result: number | null * * // With verbose logging * const text = parser.extractText(html, '//p/text()', 'xpath', { verbose: true }); * ``` */ extractText(html, selector, type = 'xpath', options) { const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false; try { let result; if (type === 'xpath') { result = this.extractSingleXPath(html, selector, undefined, verbose); } else { result = this.extractSingleCSS(html, selector); } // Apply transformation if provided and result exists if (result !== null && options?.transform) { return this.applyTransform(result, options.transform, options.baseUrl); } // Return as T if no transform (assumes T extends string when no transform) return result; } catch (error) { if (this.shouldLog('error')) { this.logWithLevel('error', 'Error in extractText:', error); } return null; } } /** * Extract attribute values from multiple HTML elements * * Convenience method for extracting a specific attribute from all matching elements. * This is equivalent to calling extractMultiple with an attribute parameter. * Returns all attribute values from elements that match the selector. * * @param html - HTML content to parse * @param selector - XPath expression or CSS selector to locate elements * @param attribute - HTML attribute name to extract * @param type - Type of selector: 'xpath' (default) or 'css' * @param options - Parsing options * @param options.verbose - Enable verbose logging for debugging * * @returns Array of attribute values (empty array if no matches or no attribute) * * @example * ```typescript * const html = ` * <nav> * <a href="/home" title="Home Page">Home</a> * <a href="/about" title="About Us">About</a> * <a href="/contact" title="Contact Form">Contact</a> * </nav> * `; * * // Extract all href attributes * const links = parser.extractAttributes(html, '//a', 'href'); * // Result: ["/home", "/about", "/contact"] * * // Extract all title attributes * const titles = parser.extractAttributes(html, '//a', 'title', 'xpath'); * // Result: ["Home Page", "About Us", "Contact Form"] * * // Using CSS selector * const hrefs = parser.extractAttributes(html, 'nav a', 'href', 'css'); * // Result: ["/home", "/about", "/contact"] * * // With type safety and transformation * const ids = parser.extractAttributes<number>(html, '//img', 'data-id', 'xpath', { * transform: (value: string) => parseInt(value) * }); * // Result: number[] * ``` */ extractAttributes(html, selector, attribute, type = 'xpath', options) { const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false; try { let results; if (type === 'xpath') { results = this.extractMultipleXPath(html, selector, attribute, verbose); } else { results = this.extractMultipleCSS(html, selector, attribute); } // Apply transformation if provided if (options?.transform) { return this.applyTransform(results, options.transform, options.baseUrl); } // Return as T[] if no transform (assumes T extends string when no transform) return results; } catch (error) { if (this.shouldLog('error')) { this.logWithLevel('error', 'Error in extractAttributes:', error); } return []; } } /** * Check if elements exist in HTML content * * Tests whether the specified selector matches any elements in the HTML. * Useful for conditional logic based on element presence or for validating * HTML structure before attempting extractions. * * @param html - HTML content to search * @param selector - XPath expression or CSS selector to test * @param type - Type of selector: 'xpath' (default) or 'css' * @param options - Parsing options * @param options.verbose - Enable verbose logging for debugging * * @returns true if at least one element matches, false otherwise * * @example * ```typescript * const html = '<div><h1>Title</h1><p class="content">Text</p></div>'; * * // Check if title exists * const hasTitle = parser.exists(html, '//h1'); * // Result: true * * // Check if specific class exists * const hasContent = parser.exists(html, '//p[@class="content"]'); * // Result: true * * // Check for non-existent element * const hasFooter = parser.exists(html, '//footer'); * // Result: false * * // Using CSS selector * const hasContentCSS = parser.exists(html, 'p.content', 'css'); * // Result: true * * // Conditional extraction based on existence * if (parser.exists(html, '//nav')) { * const navigation = parser.extractStructured(html, navSchema); * } * ``` */ exists(html, selector, type = 'xpath', options) { const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false; if (verbose && this.shouldLog('debug')) { this.logWithLevel('debug', `🔍 exists - Checking selector: "${selector}", Type: ${type}`); } try { let result; if (type === 'xpath') { const results = this.evaluateXPath(html, selector, verbose); result = results.length > 0; } else { const $ = cheerio.load(html); result = $(selector).length > 0; } if (verbose && this.shouldLog('debug')) { this.logWithLevel('debug', `✅ exists result: ${result ? 'Found' : 'Not found'}`); } return result; } catch (error) { if (this.shouldLog('error')) { this.logWithLevel('error', '❌ Error in exists:', error); } return false; } } /** * Count the number of matching elements in HTML content * * Returns the total number of elements that match the specified selector. * Useful for pagination, validation, or determining the size of data sets * before processing them. * * @param html - HTML content to search * @param selector - XPath expression or CSS selector to count * @param type - Type of selector: 'xpath' (default) or 'css' * @param options - Parsing options * @param options.verbose - Enable verbose logging for debugging * * @returns Number of matching elements (0 if no matches) * * @example * ```typescript * const html = ` * <ul> * <li>Item 1</li> * <li>Item 2</li> * <li>Item 3</li> * </ul> * <div class="highlight">Special</div> * `; * * // Count list items * const itemCount = parser.count(html, '//li'); * // Result: 3 * * // Count elements with specific class * const highlightCount = parser.count(html, '//div[@class="highlight"]'); * // Result: 1 * * // Count using CSS selector * const listItemsCSS = parser.count(html, 'ul li', 'css'); * // Result: 3 * * // Use count for conditional processing * const articleCount = parser.count(html, '//article'); * if (articleCount > 10) { * console.log('Large dataset detected, processing in batches'); * } * ``` */ count(html, selector, type = 'xpath', options) { const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false; if (verbose && this.shouldLog('debug')) { this.logWithLevel('debug', `🔍 count - Counting selector: "${selector}", Type: ${type}`); } try { let result; if (type === 'xpath') { const results = this.evaluateXPath(html, selector, verbose); result = results.length; } else { const $ = cheerio.load(html); result = $(selector).length; } if (verbose && this.shouldLog('debug')) { this.logWithLevel('debug', `✅ count result: ${result} elements found`); } return result; } catch (error) { if (this.shouldLog('error')) { this.logWithLevel('error', '❌ Error in count:', error); } return 0; } } /** * Extract structured data from HTML using a schema definition * * Applies a schema object to extract multiple related fields from HTML content. * Each field in the schema defines its own selector, type, optional attribute, * transformation function, and now supports 'multiple' and 'raw' flags for array and raw HTML extraction. * * If a schema field includes `multiple: true`, the extracted value for that field * will be an array of results (with transform applied to each item if provided). * If a schema field includes `raw: true`, the extracted value will be the raw HTML of the matched element(s). * Otherwise, a single value is returned as before. * * @param html - HTML content to parse * @param schema - Schema object defining fields to extract (see ExtractionSchema) * @param options - Parsing options * @param options.verbose - Enable verbose logging for debugging * * @returns Object with extracted data matching the schema structure. Fields with * `multiple: true` will be arrays, `raw: true` will be raw HTML, others will be single values. * * @example * ```typescript * const html = ` * <article> * <h1>Product Name</h1> * <span class="price">$29.99</span> * <img src="/image.jpg" alt="Product"> * <div class="rating" data-stars="4">★★★★☆</div> * <div class="tags"> * <span class="tag">electronics</span> * <span class="tag">gadget</span> * </div> * </article> * `; * * // Define typed interface * interface Product { * title: string; * price: number; * image: string; * rating: number; * tags: string[]; // <-- array field * titleHtml: string; // <-- raw HTML field * } * * const productSchema: ExtractionSchema<Product> = { * title: { * selector: '//h1/text()', * type: 'xpath' * }, * price: { * selector: '//span[@class="price"]/text()', * type: 'xpath', * transform: (price) => parseFloat(price.replace('$', '')) * }, * image: { * selector: '//img', * type: 'xpath', * attribute: 'src' * }, * rating: { * selector: '//div[@class="rating"]', * type: 'xpath', * attribute: 'data-stars', * transform: (stars) => parseInt(stars) * }, * tags: { * selector: '//span[@class="tag"]/text()', * type: 'xpath', * multiple: true // <-- NEW: array extraction * }, * titleHtml: { * selector: '//h1', * type: 'xpath', * raw: true // <-- NEW: raw HTML extraction * } * }; * * const product = parser.extractStructured<Product>(html, productSchema); * // Result: Product type with full type safety * // { * // title: "Product Name", * // price: 29.99, * // image: "/image.jpg", * // rating: 4, * // tags: ["electronics", "gadget"], // <-- array result * // titleHtml: "<h1>Product Name</h1>" // <-- raw HTML result * // } * ``` */ extractStructured(html, schema, options) { const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false; const baseUrl = options?.baseUrl; const result = {}; if (verbose) { this.logWithLevel('debug', `🔍 extractStructured - Processing ${Object.keys(schema).length} schema fields`); } try { for (const [key, config] of Object.entries(schema)) { try { let value; const raw = config.raw === true; if (config.multiple) { // Use extractMultiple for array fields if (config.type === 'xpath') { value = this.extractMultipleXPath(html, config.selector, config.attribute, verbose, raw); } else { if (raw) { value = this.extractMultipleCSS(html, config.selector, undefined, true); } else { value = this.extractMultipleCSS(html, config.selector, config.attribute); } } // Apply transformation if provided const transform = config.transform; if (transform) { value = this.applyTransform(value, transform, baseUrl); } } else { // Use extractSingle for single value fields if (config.type === 'xpath') { value = this.extractSingleXPath(html, config.selector, config.attribute, verbose, raw); } else { if (raw) { value = this.extractSingleCSS(html, config.selector, undefined, true); } else { value = this.extractSingleCSS(html, config.selector, config.attribute); } } // Apply transformation if provided if (value && config.transform) { value = this.applyTransform(value, config.transform, baseUrl); } } result[key] = value; if (verbose) { this.logWithLevel('debug', `✅ Extracted field '${key}': ${value ? 'success' : 'null/empty'}`); } } catch (error) { if (this.shouldLog('error')) { this.logWithLevel('error', `❌ Error extracting field '${key}':`, error); } result[key] = config.multiple ? [] : null; } } } catch (error) { if (this.shouldLog('error')) { this.logWithLevel('error', '❌ Error in extractStructured:', error); } } return result; } /** * Extract structured data directly from a container node (preserves XPath context) * * This method works directly with DOM nodes to preserve the XPath evaluation context, * allowing relative XPath expressions like './td[2]//a/@href' to work correctly. * * @param containerNode - The DOM node to extract data from * @param schema - Schema object defining fields to extract * @param options - Parsing options */ extractStructuredFromNode(containerNode, schema, options) { const verbose = options?.verbose ?? this.defaultOptions.verbose ?? false; const baseUrl = options?.baseUrl; const result = {}; if (verbose) { this.logWithLevel('debug', `🔍 extractStructuredFromNode - Processing ${Object.keys(schema).length} schema fields`); } try { for (const [key, config] of Object.entries(schema)) { try { let value; const raw = config.raw === true; if (config.multiple) { // Use extractMultiple for array fields if (config.type === 'xpath') { value = this.extractMultipleXPathFromNode(containerNode, config.selector, config.attribute, verbose, raw); } else { // CSS selectors need to work with HTML string const containerHTML = this.getElementHTML(containerNode); if (raw) { value = this.extractMultipleCSS(containerHTML, config.selector, undefined, true); } else { value = this.extractMultipleCSS(containerHTML, config.selector, config.attribute); } } // Apply transformation if provided const transform = config.transform; if (transform) { value = this.applyTransform(value, transform, baseUrl); } } else { // Use extractSingle for single value fields if (config.type === 'xpath') { value = this.extractSingleXPathFromNode(containerNode, config.selector, config.attribute, verbose, raw); } else { // CSS selectors need to work with HTML string const containerHTML = this.getElementHTML(containerNode); if (raw) { value = this.extractSingleCSS(containerHTML, config.selector, undefined, true); } else { value = this.extractSingleCSS(containerHTML, config.selector, config.attribute); } } // Apply transformation if provided if (value && config.transform) { value = this.applyTransform(value, config.transform, baseUrl); } } result[key] = value; if (verbose) { this.logWithLevel('debug', `✅ Extracted field '${key}': ${value ? 'success' : 'null/empty'}`); } } catch (error) { if (this.shouldLog('error')) { this.logWithLevel('error', `❌ Error extracting field '${key}':`, error); } result[key] = config.multiple ? [] : null; } } } catch (error) { if (this.shouldLog('error')) { this.logWithLevel('error', '❌ Error in extractStructuredFromNode:', error); } } return result; } /** * Extract array of structured data from repeating HTML elements * * For each container, applies the schema as in extractStructured. If a schema field * includes `multiple: true`, the extracted value for that field will be an array of results. * If a schema field includes `raw: true`, the extracted value will be the raw HTML of the matched element(s). * * @param html - HTML content to parse * @param containerSelector - XPath or CSS selector to find container elements * @param schema - Schema object defining fields to extract from each container * @param containerType - Type of container selector: 'xpath' (default) or 'css' * @param options - Parsing options * @param options.verbose - Enable verbose logging for debugging * * @returns Array of objects with extracted data matching the schema structure. Fields with * `multiple: true` will be arrays, `raw: true` will be raw HTML, others will be single values. * * @example * ```typescript * const html = ` * <div class="products"> * <div class="product"> * <h3>Product A</h3> * <span class="price">$19.99</span> * <span class="tag">electronics</span> * <span class="tag">gadget</span> * </div> * <div class="product"> * <h3>Product B</h3> * <span class="price">$29.99</span> * <span class="tag">accessory</span> * </div> * </div> * `; * * // Define typed interface * interface Product { * name: string; * price: number; * tags: string[]; // <-- array field * nameHtml: string; // <-- raw HTML field * } * * const productSchema: ExtractionSchema<Product> = { * name: { * selector: './/h3/text()', * type: 'xpath' * }, * price: { * selector: './/span[@class="price"]/text()', * type: 'xpath', * transform: (value) => parseFloat(value.replace('$', '')) * }, * tags: { * selector: './/span[@class="tag"]/text()', * type: 'xpath', * multiple: true // <-- NEW: array extraction * }, *