UNPKG

@hanivanrizky/nestjs-html-parser

Version:

A powerful NestJS HTML parsing service with XPath and CSS selector support, proxy configuration, random user agents, and rich response metadata including headers and status codes

github.com/Hanivan/nestjs-html-parser

Hanivan/nestjs-html-parser

591 lines • 23.4 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.ThreadIdExtractorPipe = exports.RegexReplacePipe = exports.RegexExtractionPipe = exports.QueryRemoverPipe = exports.ParseAsUrlPipe = exports.NumNormalizePipe = exports.mappingTransform = exports.DateFormatPipe = void 0; exports.scrapeBmwSgForum = scrapeBmwSgForum; const __1 = require("../"); /** * BMW-SG Forum Scraper Demo * * This example demonstrates a full forum crawling process based on the real BMW-SG forum payload. * It loads HTML from https://www.bmw-sg.com/forums/forums/introduction-greetings.24/page-5 * and applies extraction schemas matching the inspiration payload structure. */ const payload = { data: { endPoint: 'https://www.bmw-sg.com/forums/forums/introduction-greetings.24/page-5', patternReply: [ { key: 'container', pattern: "//article[contains(@class,'message--post')]", returnType: 'text', meta: { isContainer: 'true' }, }, { key: 'author', pattern: './@data-author', returnType: 'text' }, { key: 'replyDate', pattern: './/time[@datetime]/@datetime', returnType: 'text', pipe: [ { locale: 'en', type: 'date-format', format: 'YYYY-MM-DDTHH:mm:ssZZ', timezone: 'asia/singapore', }, ], }, { key: 'replyId', pattern: './@data-content', returnType: 'text', pipe: [ { type: 'regex-replace', regex: '^post-', textReplacement: '', flag: 'g', }, ], }, { key: 'replyHtml', pattern: ".//article[contains(@class,'message-body')]", returnType: 'html', }, { key: 'replyText', pattern: ".//article[contains(@class,'message-body')]/descendant-or-self::node()/text()[normalize-space()]", returnType: 'text', meta: { multiple: 'true' }, }, { key: 'nextPage', pattern: "//ul[contains(@class,'pageNav-main')]/li/a[@href and not(@href='#')]", returnType: 'text', meta: { isPage: 'true' }, }, { key: 'urlReply', pattern: ".//div[contains(@class,'message-attribution-opposite')]//a/@href", returnType: 'text', pipe: [{ type: 'parse-as-url' }], }, ], interval: 45, priority: 'secondary', page: 5, patternPost: [ { key: 'THREAD_ID_PATTERN', pattern: './@class', scope: 'thread', pipe: [{ type: 'regex-extraction', regex: '\\w*$', flag: 'g' }], }, { key: 'THREAD_NODE_PATTERN', pattern: "//div[contains(@class, 'structItem--thread')]", scope: 'index-page', meta: { isThreadNode: 'true' }, }, { key: 'THREAD_TITLE_PATTERN', pattern: ".//div[@class='structItem-title']/a/text()[normalize-space()]", scope: 'thread', }, { key: 'THREAD_LINK_PATTERN', pattern: ".//div[@class='structItem-title']/a/@href", scope: 'thread', pipe: [{ type: 'query-remover', removed: ['s'] }], }, { key: 'THREAD_REPLIES_PATTERN', pattern: './div[3]/dl[1]/dd/text()', scope: 'thread', pipe: [{ type: 'num-normalize' }], }, { key: 'THREAD_VIEWS_PATTERN', pattern: './div[3]/dl[2]/dd/text()', scope: 'thread', pipe: [{ type: 'num-normalize' }], }, { key: 'THREAD_LAST_POST_PATTERN', pattern: './div[4]//time/@datetime', scope: 'thread', pipe: [ { locale: 'en', type: 'date-format', format: 'YYYY-MM-DDTHH:mm:ssZZ', timezone: 'Asia/Singapore', }, ], }, { key: 'PAGINATION_NODE_PATTERN', pattern: "(//div[contains(@class, 'pageNav')])[1]/ul/li", scope: 'index-page', meta: { isPaginateNode: 'true', pagePatterns: { url: './a/@href', text: './a/text()' }, }, }, { key: 'SECTION_TITLE', pattern: '//head/title/text()', scope: 'index-page', }, ], maxItterateTRPage: 10, subForumId: 433, maxItteratePage: 10, langCode: 'en', countryCode: 'sg', numItterate: 4, numRetry: 0, origin: 'bmw-sg.com', mediaId: 902, timeout: 45, engine: 'html', timecheck: 1757488584, }, }; // Pipe classes following inspiration structure class ParseAsUrlPipe { type = 'parse-as-url'; baseUrl; transform(url) { if (!this.baseUrl) { throw new Error('BaseURL is required for ParseAsUrlPipe'); } try { return new URL(url, this.baseUrl).toString(); } catch (error) { throw new Error(`Invalid URL: ${url} with baseUrl: ${this.baseUrl}`); } } } exports.ParseAsUrlPipe = ParseAsUrlPipe; class RegexReplacePipe { type = 'regex-replace'; baseUrl; regex = ''; textReplacement = ''; flag = 'g'; transform(val) { if (typeof val === 'string') { const flag = Array.isArray(this.flag) ? this.flag.join(',') : this.flag; const result = val.replace(new RegExp(this.regex, flag), this.textReplacement); return result; } else { return val; } } } exports.RegexReplacePipe = RegexReplacePipe; class RegexExtractionPipe { type = 'regex-extraction'; baseUrl; regex = ''; flag = 'g'; transform(val) { if (typeof val === 'string') { const flag = Array.isArray(this.flag) ? this.flag.join(',') : this.flag; const match = val.match(new RegExp(this.regex, flag)); return match ? match[0] : ''; } else { return val; } } } exports.RegexExtractionPipe = RegexExtractionPipe; class QueryRemoverPipe { type = 'query-remover'; baseUrl; removed = []; transform(url) { try { const urlObj = new URL(url); this.removed.forEach((param) => { urlObj.searchParams.delete(param); }); return urlObj.toString(); } catch (error) { return url; // Return original if not a valid URL } } } exports.QueryRemoverPipe = QueryRemoverPipe; class NumNormalizePipe { type = 'num-normalize'; baseUrl; transform(numString) { if (typeof numString !== 'string') { return numString; } const val = numString?.toLowerCase().replace(new RegExp(',', 'g'), '.'); let resVal = parseFloat(val); if (val.endsWith('k') || val.endsWith('rb')) { resVal *= 1000; } else if (val.endsWith('m')) { resVal *= 1000000; } if (isNaN(resVal)) { return 0; } return Math.round(resVal); } } exports.NumNormalizePipe = NumNormalizePipe; class DateFormatPipe { type = 'date-format'; baseUrl; locale = 'en'; format = 'YYYY-MM-DDTHH:mm:ssZZ'; timezone = 'Asia/Singapore'; transform(dateString) { // Simple date parser that converts to Unix timestamp if (typeof dateString !== 'string' || !dateString.trim()) { return Date.now() / 1000; } const date = new Date(dateString); return isNaN(date.getTime()) ? Date.now() / 1000 : date.getTime() / 1000; } } exports.DateFormatPipe = DateFormatPipe; class ThreadIdExtractorPipe { type = 'thread-id-extractor'; baseUrl; transform(url) { if (typeof url !== 'string') return url; const match = url.match(/\.(\d+)\//); return match ? match[1] : url; } } exports.ThreadIdExtractorPipe = ThreadIdExtractorPipe; // Mapping function to convert raw pipes to object-based transform format (DRY principle) const mappingTransform = (rawPipes, withItsPayload) => { if (!rawPipes || !Array.isArray(rawPipes)) return []; return rawPipes.map((pipe) => { // Handle different pipe types based on inspiration CleanerType enum switch (pipe.type) { case 'parse-as-url': return { class: ParseAsUrlPipe, }; case 'query-remover': return { class: QueryRemoverPipe, payload: { removed: pipe.removed || withItsPayload?.removed || [], }, }; case 'num-normalize': return { class: NumNormalizePipe, }; case 'date-format': return { class: DateFormatPipe, payload: { locale: pipe.locale || 'en', format: pipe.format || 'YYYY-MM-DDTHH:mm:ssZZ', timezone: pipe.timezone || 'Asia/Singapore', }, }; case 'regex-replace': return { class: RegexReplacePipe, payload: { regex: pipe.regex || '', textReplacement: pipe.textReplacement || '', flag: pipe.flag || 'g', }, }; case 'regex-extraction': return { class: RegexExtractionPipe, payload: { regex: pipe.regex || '', flag: pipe.flag || 'g', }, }; default: console.warn(`Unknown pipe type: ${pipe.type}`); // Return a no-op transform instead of null return { class: class NoOpPipe { transform(val) { return val; } }, }; } }); // Don't filter, always return valid transforms }; exports.mappingTransform = mappingTransform; async function scrapeBmwSgForum(verbose = false) { const parser = new __1.HtmlParserService(); console.log('🏎️ BMW-SG Forum Scraper Demo'); console.log('='.repeat(50)); // inspiration payload configuration const endPoint = payload.data.endPoint; const baseUrl = parser.getOrigin(endPoint); console.log(`🔗 Target URL: ${endPoint}`); console.log(`🏠 Base URL: ${baseUrl}`); console.log(); try { console.log('📥 Fetching HTML from BMW-SG forum...'); // Fetch the HTML content from the real BMW-SG forum const response = await parser.fetchHtml(endPoint, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', }, }); console.log(`✅ HTML fetched successfully (${response.data.length} characters)`); console.log(`📊 Response status: ${response.status}`); console.log(); // 1. Thread extraction based on inspiration patternPost console.log('📝 Thread Extraction (patternPost)'); console.log('-'.repeat(40)); // Build thread schema from payload patterns const getPatternByKey = (key) => payload.data.patternPost.find((p) => p.key === key); const threadSchema = { // THREAD_ID_PATTERN - Extract numeric ID from URL threadId: { selector: './/div[@class="structItem-title"]/a/@href', type: 'xpath', transform: [{ class: ThreadIdExtractorPipe }], }, // THREAD_TITLE_PATTERN threadTitle: { selector: getPatternByKey('THREAD_TITLE_PATTERN')?.pattern || './/div[@class="structItem-title"]/a/text()[normalize-space()]', type: 'xpath', }, // THREAD_LINK_PATTERN threadLink: { selector: getPatternByKey('THREAD_LINK_PATTERN')?.pattern || './/div[@class="structItem-title"]/a/@href', type: 'xpath', transform: [ { class: ParseAsUrlPipe }, // Convert to absolute URL ...mappingTransform(getPatternByKey('THREAD_LINK_PATTERN')?.pipe || [ { type: 'query-remover', removed: ['s'] }, ]), ], }, // THREAD_REPLIES_PATTERN - From structItem-cell--meta threadReplies: { selector: './/div[contains(@class,"structItem-cell--meta")]//dl[dt="Replies"]/dd/text()', type: 'xpath', transform: mappingTransform([{ type: 'num-normalize' }]), }, // THREAD_VIEWS_PATTERN - From structItem-cell--meta threadViews: { selector: './/div[contains(@class,"structItem-cell--meta")]//dl[dt="Views"]/dd/text()', type: 'xpath', transform: mappingTransform([{ type: 'num-normalize' }]), }, // THREAD_LAST_POST_PATTERN - From structItem-cell--latest (updated for real HTML structure) threadLastPost: { selector: './/div[contains(@class,"structItem-cell--latest")]//time/@datetime', type: 'xpath', transform: mappingTransform(getPatternByKey('THREAD_LAST_POST_PATTERN')?.pipe || [ { type: 'date-format', locale: 'en', format: 'YYYY-MM-DDTHH:mm:ssZZ', timezone: 'Asia/Singapore', }, ]), }, }; // Extract threads using THREAD_NODE_PATTERN const threadNodePattern = payload.data.patternPost.find((p) => p.key === 'THREAD_NODE_PATTERN') ?.pattern || '//div[contains(@class, "structItem--thread")]'; const threads = parser.extractStructuredList(response.data, threadNodePattern, // THREAD_NODE_PATTERN threadSchema, 'xpath', { verbose, baseUrl, }); console.log(`✅ Extracted ${threads.length} threads:`); console.log(); threads.forEach((thread, index) => { console.log(`📄 Thread ${index + 1}:`); console.log(` ID: ${thread.threadId}`); console.log(` Title: ${thread.threadTitle}`); console.log(` Link: ${thread.threadLink}`); console.log(` Replies: ${thread.threadReplies}`); console.log(` Views: ${thread.threadViews}`); const timestamp = thread.threadLastPost && !isNaN(thread.threadLastPost) ? thread.threadLastPost : Date.now() / 1000; const lastPostDate = new Date(timestamp * 1000); console.log(` Last Post: ${lastPostDate.toISOString()}`); console.log(); }); // 2. Reply extraction based on inspiration patternReply console.log('💬 Reply Extraction (patternReply)'); console.log('-'.repeat(40)); // Build reply schema from payload patterns const getReplyPatternByKey = (key) => payload.data.patternReply.find((p) => p.key === key); const replySchema = { // Author author: { selector: getReplyPatternByKey('author')?.pattern || './@data-author', type: 'xpath', }, // Reply date replyDate: { selector: getReplyPatternByKey('replyDate')?.pattern || './/time[@datetime]/@datetime', type: 'xpath', transform: mappingTransform(getReplyPatternByKey('replyDate')?.pipe || [ { type: 'date-format', locale: 'en', format: 'YYYY-MM-DDTHH:mm:ssZZ', timezone: 'Asia/Singapore', }, ]), }, // Reply ID replyId: { selector: getReplyPatternByKey('replyId')?.pattern || './@data-content', type: 'xpath', transform: mappingTransform(getReplyPatternByKey('replyId')?.pipe || [ { type: 'regex-replace', regex: '^post-', textReplacement: '', flag: 'g', }, ]), }, // Reply HTML replyHtml: { selector: getReplyPatternByKey('replyHtml')?.pattern || './/article[contains(@class,"message-body")]', type: 'xpath', raw: true, // Return HTML instead of text }, // Reply text (multiple text nodes) replyText: { selector: getReplyPatternByKey('replyText')?.pattern || './/article[contains(@class,"message-body")]/descendant-or-self::node()/text()[normalize-space()]', type: 'xpath', multiple: true, // Extract multiple text nodes }, // URL reply urlReply: { selector: getReplyPatternByKey('urlReply')?.pattern || './/div[contains(@class,"message-attribution-opposite")]//a/@href', type: 'xpath', transform: mappingTransform(getReplyPatternByKey('urlReply')?.pipe || [{ type: 'parse-as-url' }]), }, }; // Extract replies using container pattern from payload const replyContainerPattern = getReplyPatternByKey('container')?.pattern || '//article[contains(@class,"message--post")]'; const replies = parser.extractStructuredList(response.data, replyContainerPattern, // container pattern replySchema, 'xpath', { verbose, baseUrl, }); console.log(`✅ Extracted ${replies.length} replies:`); console.log(); replies.slice(0, 3).forEach((reply, index) => { // Show first 3 replies console.log(`💭 Reply ${index + 1}:`); console.log(` Author: ${reply.author}`); const replyTimestamp = reply.replyDate && !isNaN(reply.replyDate) ? reply.replyDate : Date.now() / 1000; const replyDate = new Date(replyTimestamp * 1000); console.log(` Date: ${replyDate.toISOString()}`); console.log(` ID: ${reply.replyId}`); console.log(` Text (first 100 chars): ${reply.replyText ? reply.replyText.join(' ').substring(0, 100) : 'N/A'}...`); console.log(` URL: ${reply.urlReply}`); console.log(); }); // 3. Pagination and section info console.log('📄 Page Information'); console.log('-'.repeat(40)); // Extract section title using payload pattern const sectionTitlePattern = getPatternByKey('SECTION_TITLE')?.pattern || '//head/title/text()'; const sectionTitle = parser.extractSingle(response.data, sectionTitlePattern, 'xpath'); console.log(`📋 Section Title: ${sectionTitle}`); // Extract pagination info using payload pattern const nextPagePattern = getReplyPatternByKey('nextPage')?.pattern || '//ul[contains(@class,"pageNav-main")]/li/a[@href and not(@href="#")]/@href'; const nextPageUrls = parser.extractMultiple(response.data, nextPagePattern, 'xpath', undefined, { baseUrl, transform: mappingTransform([{ type: 'parse-as-url' }]), }); console.log(`🔗 Available page URLs: ${nextPageUrls.length}`); nextPageUrls.slice(0, 5).forEach((url, index) => { console.log(` ${index + 1}. ${url}`); }); // 4. Summary statistics console.log(); console.log('📊 Scraping Summary'); console.log('-'.repeat(40)); console.log(`🎯 Endpoint: ${endPoint}`); console.log(`📝 Threads extracted: ${threads.length}`); console.log(`💬 Replies extracted: ${replies.length}`); console.log(`📄 Pagination URLs: ${nextPageUrls.length}`); console.log(`📋 Section: ${sectionTitle}`); console.log(`🏷️ Media ID: ${payload.data.mediaId}`); console.log(`🌐 Origin: ${payload.data.origin}`); console.log(`⏱️ Interval: ${payload.data.interval}s`); console.log(`📄 Page: ${payload.data.page}`); console.log(`🔧 Engine: ${payload.data.engine}`); // Show some pipe transformation examples console.log(); console.log('🔧 Live Pipe Transformations'); console.log('-'.repeat(40)); if (threads.length > 0) { const sampleThread = threads[0]; console.log('📝 Thread ID extraction:'); console.log(` Raw class: "${sampleThread.threadId}"`); console.log(` Regex pattern: "\\w*$" → Extract last word`); console.log(); } if (replies.length > 0) { const sampleReply = replies[0]; console.log('💭 Reply ID transformation:'); console.log(` Raw data-content: "post-${sampleReply.replyId}"`); console.log(` Regex replace: "^post-" → "" (remove prefix)`); console.log(` Result: "${sampleReply.replyId}"`); console.log(); } } catch (error) { console.error('❌ Error during BMW-SG forum scraping:', error); throw error; } } // Run the scraper if this file is executed directly if (require.main === module) { (async () => { try { await scrapeBmwSgForum(false); console.log('🎉 BMW-SG forum scraping completed successfully!'); } catch (error) { console.error('💥 Scraping failed:', error); process.exit(1); } })(); } //# sourceMappingURL=bmw-sg.com.js.map