UNPKG

@hanivanrizky/nestjs-html-parser

Version:

A powerful NestJS HTML parsing service with XPath and CSS selector support, proxy configuration, random user agents, and rich response metadata including headers and status codes

351 lines (340 loc) • 14.1 kB
"use strict"; /** * Example: Pagination validation and safe extraction * * This example demonstrates how to safely handle undefined selectors * and validate pagination patterns before extracting data using both * the new extractPagination method and manual extraction methods. */ Object.defineProperty(exports, "__esModule", { value: true }); exports.compareExtractionMethods = compareExtractionMethods; exports.demonstrateEnhancedExtractPagination = demonstrateEnhancedExtractPagination; exports.demonstrateNewExtractPagination = demonstrateNewExtractPagination; exports.demonstrateOldMethodValidation = demonstrateOldMethodValidation; exports.demonstrateUndefinedSelectorError = demonstrateUndefinedSelectorError; const html_parser_service_1 = require("../html-parser.service"); async function demonstrateNewExtractPagination() { const parser = new html_parser_service_1.HtmlParserService(); const html = ` <div class="pageNav pageNav--skipStart pageNav--skipEnd"> <a href="/forums/forums/introduction-greetings.24/page-4" class="pageNav-jump pageNav-jump--prev" >Prev</a > <ul class="pageNav-main"> <li class="pageNav-page"> <a href="/forums/forums/introduction-greetings.24/">1</a> </li> <li class="pageNav-page pageNav-page--earlier"> <a href="/forums/forums/introduction-greetings.24/page-2">2</a> </li> <li class="pageNav-page pageNav-page--earlier"> <a href="/forums/forums/introduction-greetings.24/page-3">3</a> </li> <li class="pageNav-page pageNav-page--earlier"> <a href="/forums/forums/introduction-greetings.24/page-4">4</a> </li> <li class="pageNav-page pageNav-page--current"> <a href="/forums/forums/introduction-greetings.24/page-5">5</a> </li> <li class="pageNav-page pageNav-page--later"> <a href="/forums/forums/introduction-greetings.24/page-6">6</a> </li> <li class="pageNav-page pageNav-page--later"> <a href="/forums/forums/introduction-greetings.24/page-7">7</a> </li> <li class="pageNav-page pageNav-page--skip pageNav-page--skipEnd"> <a data-xf-init="tooltip" data-xf-click="menu" role="button" tabindex="0" aria-expanded="false" aria-haspopup="true" data-original-title="Go to page" aria-label="Go to page" id="js-XFUniqueId2" >…</a > <div class="menu menu--pageJump" data-menu="menu" aria-hidden="true"> <div class="menu-content"> <h4 class="menu-header">Go to page</h4> <div class="menu-row" data-xf-init="page-jump" data-page-url="/forums/forums/introduction-greetings.24/page-%page%" > <div class="inputGroup inputGroup--numbers"> <div class="inputGroup inputGroup--numbers inputNumber inputGroup--joined" data-xf-init="number-box" > <input type="number" pattern="\d*" class="input input--number js-numberBoxTextInput input input--numberNarrow js-pageJumpPage" value="8" min="1" max="344" step="1" required="required" data-menu-autofocus="true" /><button type="button" tabindex="-1" class="inputGroup-text inputNumber-button inputNumber-button--up js-up" data-dir="up" title="Increase" aria-label="Increase" ></button ><button type="button" tabindex="-1" class="inputGroup-text inputNumber-button inputNumber-button--down js-down" data-dir="down" title="Decrease" aria-label="Decrease" ></button> </div> <span class="inputGroup-text" ><button type="button" class="js-pageJumpGo button"> <span class="button-text">Go</span> </button></span > </div> </div> </div> </div> </li> <li class="pageNav-page"> <a href="/forums/forums/introduction-greetings.24/page-344">344</a> </li> </ul> <a href="/forums/forums/introduction-greetings.24/page-6" class="pageNav-jump pageNav-jump--next" >Next</a > </div> `; try { console.log('=== New extractPagination Method ==='); // Using the new extractPagination method - much simpler! // Use the entire pageNav container to extract ALL links (including Prev/Next) const pages = parser.extractPagination(html, '//div[contains(@class, "pageNav")]', 'xpath', { verbose: true, baseUrl: 'https://www.bmw-sg.com', }); console.log('šŸ“„ Extracted pages with new method:', pages); console.log(`āœ… Found ${pages.length} pagination pages`); // Show first few pages pages.slice(0, 3).forEach((page, index) => { console.log(` ${index + 1}. "${page.text}" -> ${page.href}`); }); return pages; } catch (error) { console.error('āŒ Extraction failed:', error); return []; } } async function demonstrateOldMethodValidation() { const parser = new html_parser_service_1.HtmlParserService(); const html = ` <ul class="pageNav-main"> <li class="pageNav-page"><a href="/page-1">1</a></li> <li class="pageNav-page pageNav-page--current"><a href="/page-2">2</a></li> <li class="pageNav-page"><a href="/page-3">3</a></li> </ul> `; // Example of potentially undefined selector pattern const mappedPattern = { PAGE_NODE_PATTERN: { selector: "//li[contains(@class, 'pageNav-page')]", }, }; // Schema for page extraction const pageSchema = { href: { selector: './/a', type: 'xpath', attribute: 'href', }, text: { selector: './/a/text()', type: 'xpath', }, isCurrent: { selector: '.', type: 'xpath', attribute: 'class', transform: (className) => className?.includes('pageNav-page--current') || false, }, isSkip: { selector: '.', type: 'xpath', attribute: 'class', transform: (className) => className?.includes('pageNav-page--skip') || false, }, }; try { console.log('\n=== Old Method with Validation (Manual Schema) ==='); // Safe extraction with validation const containerSelector = mappedPattern['PAGE_NODE_PATTERN']?.selector; if (!containerSelector) { console.log('āŒ No container selector found'); return []; } console.log('āœ… Container selector found:', containerSelector); const pages = parser.extractStructuredList(html, containerSelector, pageSchema, 'xpath', { verbose: true }); console.log('šŸ“„ Extracted pages with old method:', pages); return pages; } catch (error) { console.error('āŒ Extraction failed:', error); return []; } } // Example with undefined selector to show the error case async function demonstrateUndefinedSelectorError() { const parser = new html_parser_service_1.HtmlParserService(); const html = '<div>Test</div>'; const mappedPattern = {}; // No PAGE_NODE_PATTERN try { console.log('\n=== Unsafe Extraction (will show error handling) ==='); // This will cause undefined selector error with old method const containerSelector = mappedPattern['PAGE_NODE_PATTERN']?.selector; console.log('Container selector:', containerSelector); // undefined if (!containerSelector) { console.log('āŒ No container selector found - returning empty array'); return []; } // This code won't be reached due to validation above const pages = parser.extractStructuredList(html, containerSelector, {}, 'xpath'); return pages; } catch (error) { console.error('āŒ Error caught:', error.message); return []; } } // Demonstrate enhanced extractPagination with all links extraction async function demonstrateEnhancedExtractPagination() { const parser = new html_parser_service_1.HtmlParserService(); const html = ` <div class="pageNav pageNav--skipStart pageNav--skipEnd"> <a href="/forums/forums/introduction-greetings.24/page-4" class="pageNav-jump pageNav-jump--prev">Prev</a> <ul class="pageNav-main"> <li class="pageNav-page"><a href="/forums/forums/introduction-greetings.24/">1</a></li> <li class="pageNav-page pageNav-page--earlier"><a href="/forums/forums/introduction-greetings.24/page-2">2</a></li> <li class="pageNav-page pageNav-page--earlier"><a href="/forums/forums/introduction-greetings.24/page-3">3</a></li> <li class="pageNav-page pageNav-page--earlier"><a href="/forums/forums/introduction-greetings.24/page-4">4</a></li> <li class="pageNav-page pageNav-page--current"><a href="/forums/forums/introduction-greetings.24/page-5">5</a></li> <li class="pageNav-page pageNav-page--later"><a href="/forums/forums/introduction-greetings.24/page-6">6</a></li> <li class="pageNav-page pageNav-page--later"><a href="/forums/forums/introduction-greetings.24/page-7">7</a></li> <li class="pageNav-page"><a href="/forums/forums/introduction-greetings.24/page-344">344</a></li> </ul> <a href="/forums/forums/introduction-greetings.24/page-6" class="pageNav-jump pageNav-jump--next">Next</a> </div> `; try { console.log('\n=== Enhanced extractPagination Method - All Links ==='); // Extract ALL links from the entire pagination container const allPages = parser.extractPagination(html, '//div[contains(@class, "pageNav")]', 'xpath', { verbose: true, baseUrl: 'https://www.bmw-sg.com', }); console.log('šŸ“„ All pagination links extracted:', allPages); console.log(`āœ… Found ${allPages.length} total pagination links`); // Show all links allPages.forEach((page, index) => { console.log(` ${index + 1}. "${page.text}" -> ${page.href}`); }); console.log('\n=== Enhanced extractPagination Method - Custom Schema ==='); const detailedPages = parser.extractPagination(html, '//li[@class[contains(., "pageNav-page")]] | //a[@class[contains(., "pageNav-jump")]]', 'xpath', { verbose: true, baseUrl: 'https://www.bmw-sg.com', schema: { href: { selector: './/a | .', type: 'xpath', attribute: 'href', }, text: { selector: './/a/text() | ./text()', type: 'xpath', }, isActive: { selector: '.', type: 'xpath', attribute: 'class', transform: (className) => className?.includes('pageNav-page--current') || false, }, isPrevNext: { selector: '.', type: 'xpath', attribute: 'class', transform: (className) => className?.includes('pageNav-jump') || false, }, className: { selector: '.', type: 'xpath', attribute: 'class', }, }, }); console.log('šŸ“„ Detailed pagination info:', detailedPages); console.log(`āœ… Found ${detailedPages.length} detailed pagination elements`); return allPages; } catch (error) { console.error('āŒ Enhanced extraction failed:', error); return []; } } // Compare both methods async function compareExtractionMethods() { console.log('\n=== Comparison: New vs Old Method ==='); const html = ` <ul class="pageNav-main"> <li class="pageNav-page"><a href="/page-1">Page 1</a></li> <li class="pageNav-page"><a href="/page-2">Page 2</a></li> <li class="pageNav-page"><a href="/page-3">Page 3</a></li> </ul> `; const parser = new html_parser_service_1.HtmlParserService(); const baseUrl = 'https://example.com'; console.log('šŸ“Š New extractPagination method:'); const newMethodPages = parser.extractPagination(html, "//li[contains(@class, 'pageNav-page')]", 'xpath', { baseUrl }); console.log(' Result count:', newMethodPages.length); console.log(' Sample:', newMethodPages[0]); console.log('\nšŸ“Š Old extractStructuredList method:'); const pageSchema = { href: { selector: './/a', type: 'xpath', attribute: 'href', }, text: { selector: './/a/text()', type: 'xpath', }, }; const oldMethodPages = parser.extractStructuredList(html, "//li[contains(@class, 'pageNav-page')]", pageSchema, 'xpath', { baseUrl }); console.log(' Result count:', oldMethodPages.length); console.log(' Sample:', oldMethodPages[0]); console.log('\nāœ… Both methods produce equivalent results!'); console.log('šŸ’” New method is more convenient for common pagination use cases'); } // Run examples if (require.main === module) { (async () => { await demonstrateNewExtractPagination(); await demonstrateEnhancedExtractPagination(); await demonstrateOldMethodValidation(); await demonstrateUndefinedSelectorError(); await compareExtractionMethods(); })(); } //# sourceMappingURL=pagination-validation.js.map