UNPKG

@hanivanrizky/nestjs-html-parser

Version:

A powerful NestJS HTML parsing service with XPath and CSS selector support, proxy configuration, random user agents, and rich response metadata including headers and status codes

591 lines 23.4 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.ThreadIdExtractorPipe = exports.RegexReplacePipe = exports.RegexExtractionPipe = exports.QueryRemoverPipe = exports.ParseAsUrlPipe = exports.NumNormalizePipe = exports.mappingTransform = exports.DateFormatPipe = void 0;
exports.scrapeBmwSgForum = scrapeBmwSgForum;
const __1 = require("../");
/**
 * BMW-SG Forum Scraper Demo
 *
 * This example demonstrates a full forum crawling process based on the real BMW-SG forum payload.
 * It loads HTML from https://www.bmw-sg.com/forums/forums/introduction-greetings.24/page-5
 * and applies extraction schemas matching the inspiration payload structure.
 */
const payload = {
    data: {
        endPoint: 'https://www.bmw-sg.com/forums/forums/introduction-greetings.24/page-5',
        // Patterns applied to individual posts (replies) on a thread page.
        patternReply: [
            {
                key: 'container',
                pattern: "//article[contains(@class,'message--post')]",
                returnType: 'text',
                meta: { isContainer: 'true' },
            },
            { key: 'author', pattern: './@data-author', returnType: 'text' },
            {
                key: 'replyDate',
                pattern: './/time[@datetime]/@datetime',
                returnType: 'text',
                pipe: [
                    {
                        locale: 'en',
                        type: 'date-format',
                        format: 'YYYY-MM-DDTHH:mm:ssZZ',
                        timezone: 'asia/singapore',
                    },
                ],
            },
            {
                key: 'replyId',
                pattern: './@data-content',
                returnType: 'text',
                pipe: [
                    {
                        type: 'regex-replace',
                        regex: '^post-',
                        textReplacement: '',
                        flag: 'g',
                    },
                ],
            },
            {
                key: 'replyHtml',
                pattern: ".//article[contains(@class,'message-body')]",
                returnType: 'html',
            },
            {
                key: 'replyText',
                pattern: ".//article[contains(@class,'message-body')]/descendant-or-self::node()/text()[normalize-space()]",
                returnType: 'text',
                meta: { multiple: 'true' },
            },
            {
                key: 'nextPage',
                pattern: "//ul[contains(@class,'pageNav-main')]/li/a[@href and not(@href='#')]",
                returnType: 'text',
                meta: { isPage: 'true' },
            },
            {
                key: 'urlReply',
                pattern: ".//div[contains(@class,'message-attribution-opposite')]//a/@href",
                returnType: 'text',
                pipe: [{ type: 'parse-as-url' }],
            },
        ],
        interval: 45,
        priority: 'secondary',
        page: 5,
        // Patterns applied to the thread listing (index) page.
        patternPost: [
            {
                key: 'THREAD_ID_PATTERN',
                pattern: './@class',
                scope: 'thread',
                pipe: [{ type: 'regex-extraction', regex: '\\w*$', flag: 'g' }],
            },
            {
                key: 'THREAD_NODE_PATTERN',
                pattern: "//div[contains(@class, 'structItem--thread')]",
                scope: 'index-page',
                meta: { isThreadNode: 'true' },
            },
            {
                key: 'THREAD_TITLE_PATTERN',
                pattern: ".//div[@class='structItem-title']/a/text()[normalize-space()]",
                scope: 'thread',
            },
            {
                key: 'THREAD_LINK_PATTERN',
                pattern: ".//div[@class='structItem-title']/a/@href",
                scope: 'thread',
                pipe: [{ type: 'query-remover', removed: ['s'] }],
            },
            {
                key: 'THREAD_REPLIES_PATTERN',
                pattern: './div[3]/dl[1]/dd/text()',
                scope: 'thread',
                pipe: [{ type: 'num-normalize' }],
            },
            {
                key: 'THREAD_VIEWS_PATTERN',
                pattern: './div[3]/dl[2]/dd/text()',
                scope: 'thread',
                pipe: [{ type: 'num-normalize' }],
            },
            {
                key: 'THREAD_LAST_POST_PATTERN',
                pattern: './div[4]//time/@datetime',
                scope: 'thread',
                pipe: [
                    {
                        locale: 'en',
                        type: 'date-format',
                        format: 'YYYY-MM-DDTHH:mm:ssZZ',
                        timezone: 'Asia/Singapore',
                    },
                ],
            },
            {
                key: 'PAGINATION_NODE_PATTERN',
                pattern: "(//div[contains(@class, 'pageNav')])[1]/ul/li",
                scope: 'index-page',
                meta: {
                    isPaginateNode: 'true',
                    pagePatterns: { url: './a/@href', text: './a/text()' },
                },
            },
            {
                key: 'SECTION_TITLE',
                pattern: '//head/title/text()',
                scope: 'index-page',
            },
        ],
        maxItterateTRPage: 10,
        subForumId: 433,
        maxItteratePage: 10,
        langCode: 'en',
        countryCode: 'sg',
        numItterate: 4,
        numRetry: 0,
        origin: 'bmw-sg.com',
        mediaId: 902,
        timeout: 45,
        engine: 'html',
        timecheck: 1757488584,
    },
};
/**
 * Turns a pipe's `flag` setting into a flags string accepted by `RegExp`.
 *
 * An array such as ['g', 'i'] is concatenated without a separator (a comma
 * is not a valid flag character and makes `new RegExp` throw) and repeated
 * flags are dropped, since `RegExp` rejects duplicates as well.
 *
 * @param {string|string[]|undefined} flag - Flags as a string or a list.
 * @returns {string|undefined} Flags string; non-array input is returned as-is.
 */
function normalizeRegexFlags(flag) {
    if (!Array.isArray(flag)) {
        return flag;
    }
    return [...new Set(flag.join(''))].join('');
}
// Pipe classes following inspiration structure
// NOTE(review): every pipe declares a `baseUrl` field that this file never
// assigns; presumably the parser injects it from the `baseUrl` extraction
// option - confirm against the parser implementation.
/**
 * Resolves a (possibly relative) URL against `baseUrl`.
 */
class ParseAsUrlPipe {
    type = 'parse-as-url';
    baseUrl;
    /**
     * @param {string} url - Absolute or relative URL.
     * @returns {string} Absolute URL string.
     * @throws {Error} When `baseUrl` is not set or the URL cannot be parsed.
     */
    transform(url) {
        if (!this.baseUrl) {
            throw new Error('BaseURL is required for ParseAsUrlPipe');
        }
        try {
            return new URL(url, this.baseUrl).toString();
        }
        catch (error) {
            // Keep the underlying parse error reachable through `cause`.
            throw new Error(`Invalid URL: ${url} with baseUrl: ${this.baseUrl}`, { cause: error });
        }
    }
}
exports.ParseAsUrlPipe = ParseAsUrlPipe;
/**
 * Replaces matches of `regex` in a string with `textReplacement`.
 */
class RegexReplacePipe {
    type = 'regex-replace';
    baseUrl;
    regex = '';
    textReplacement = '';
    flag = 'g';
    /**
     * @param {*} val - Value to process; non-strings are returned unchanged.
     * @returns {*} The string after replacement, or `val` itself.
     */
    transform(val) {
        if (typeof val !== 'string') {
            return val;
        }
        const matcher = new RegExp(this.regex, normalizeRegexFlags(this.flag));
        return val.replace(matcher, this.textReplacement);
    }
}
exports.RegexReplacePipe = RegexReplacePipe;
/**
 * Extracts the first match of `regex` from a string.
 */
class RegexExtractionPipe {
    type = 'regex-extraction';
    baseUrl;
    regex = '';
    flag = 'g';
    /**
     * @param {*} val - Value to process; non-strings are returned unchanged.
     * @returns {*} The first match, '' when nothing matches, or `val` itself.
     */
    transform(val) {
        if (typeof val !== 'string') {
            return val;
        }
        const match = val.match(new RegExp(this.regex, normalizeRegexFlags(this.flag)));
        return match ? match[0] : '';
    }
}
exports.RegexExtractionPipe = RegexExtractionPipe;
/**
 * Deletes the query parameters named in `removed` from an absolute URL.
 */
class QueryRemoverPipe {
    type = 'query-remover';
    baseUrl;
    removed = [];
    /**
     * @param {string} url - URL to clean.
     * @returns {string} The URL without the listed parameters; the input is
     *   returned untouched when it cannot be parsed as an absolute URL.
     */
    transform(url) {
        try {
            const urlObj = new URL(url);
            this.removed.forEach((param) => {
                urlObj.searchParams.delete(param);
            });
            return urlObj.toString();
        }
        catch (error) {
            return url; // Return original if not a valid URL
        }
    }
}
exports.QueryRemoverPipe = QueryRemoverPipe;
/**
 * Converts abbreviated counts such as "1.2K", "3rb" or "2M" to integers.
 */
class NumNormalizePipe {
    type = 'num-normalize';
    baseUrl;
    /**
     * @param {*} numString - Value to process; non-strings are returned unchanged.
     * @returns {*} Rounded integer, 0 when the text has no leading number,
     *   or `numString` itself when it is not a string.
     */
    transform(numString) {
        if (typeof numString !== 'string') {
            return numString;
        }
        // Trim first: surrounding whitespace from text nodes would otherwise
        // hide the k/rb/m suffix from the endsWith() checks below.
        // NOTE(review): every comma becomes a decimal point, so "1,234" yields
        // 1 rather than 1234 - confirm that this matches the source locale.
        const val = numString.trim().toLowerCase().replace(/,/g, '.');
        let resVal = parseFloat(val);
        if (val.endsWith('k') || val.endsWith('rb')) {
            resVal *= 1000;
        }
        else if (val.endsWith('m')) {
            resVal *= 1000000;
        }
        if (Number.isNaN(resVal)) {
            return 0;
        }
        return Math.round(resVal);
    }
}
exports.NumNormalizePipe = NumNormalizePipe;
/**
 * Converts a date string to a Unix timestamp in seconds.
 *
 * `locale`, `format` and `timezone` are stored but not consulted by
 * `transform`; parsing is delegated to the built-in `Date` constructor.
 */
class DateFormatPipe {
    type = 'date-format';
    baseUrl;
    locale = 'en';
    format = 'YYYY-MM-DDTHH:mm:ssZZ';
    timezone = 'Asia/Singapore';
    /**
     * @param {*} dateString - Date text to parse.
     * @returns {number} Seconds since the epoch; the current time is used
     *   when the input is not a string, is blank, or cannot be parsed.
     */
    transform(dateString) {
        // Simple date parser that converts to Unix timestamp
        if (typeof dateString !== 'string' || !dateString.trim()) {
            return Date.now() / 1000;
        }
        const parsedMs = new Date(dateString).getTime();
        return Number.isNaN(parsedMs) ? Date.now() / 1000 : parsedMs / 1000;
    }
}
exports.DateFormatPipe = DateFormatPipe;
/**
 * Pulls the numeric id out of a URL segment shaped like ".12345/".
 */
class ThreadIdExtractorPipe {
    type = 'thread-id-extractor';
    baseUrl;
    /**
     * @param {*} url - URL to inspect; non-strings are returned unchanged.
     * @returns {*} The digits between "." and "/", or `url` when absent.
     */
    transform(url) {
        if (typeof url !== 'string') {
            return url;
        }
        const match = url.match(/\.(\d+)\//);
        return match ? match[1] : url;
    }
}
exports.ThreadIdExtractorPipe = ThreadIdExtractorPipe;
// Mapping function to convert raw pipes to object-based transform format (DRY principle)
/**
 * Maps raw pipe descriptors from the payload to `{ class, payload }` entries.
 *
 * @param {Array<object>} rawPipes - Descriptors carrying a `type` plus
 *   type-specific options.
 * @param {object} [withItsPayload] - Optional fallback options; only
 *   `removed` is read, for 'query-remover' pipes.
 * @returns {Array<{class: Function, payload?: object}>} One entry per input
 *   descriptor; unknown types map to a pass-through pipe. Returns [] when
 *   `rawPipes` is not an array.
 */
const mappingTransform = (rawPipes, withItsPayload) => {
    if (!rawPipes || !Array.isArray(rawPipes)) {
        return [];
    }
    return rawPipes.map((pipe) => {
        // Handle different pipe types based on inspiration CleanerType enum
        // `pipe?.type` lets a null/undefined entry reach the no-op branch
        // instead of throwing.
        switch (pipe?.type) {
            case 'parse-as-url':
                return {
                    class: ParseAsUrlPipe,
                };
            case 'query-remover':
                return {
                    class: QueryRemoverPipe,
                    payload: {
                        removed: pipe.removed || withItsPayload?.removed || [],
                    },
                };
            case 'num-normalize':
                return {
                    class: NumNormalizePipe,
                };
            case 'date-format':
                return {
                    class: DateFormatPipe,
                    payload: {
                        locale: pipe.locale || 'en',
                        format: pipe.format || 'YYYY-MM-DDTHH:mm:ssZZ',
                        timezone: pipe.timezone || 'Asia/Singapore',
                    },
                };
            case 'regex-replace':
                return {
                    class: RegexReplacePipe,
                    payload: {
                        regex: pipe.regex || '',
                        textReplacement: pipe.textReplacement || '',
                        flag: pipe.flag || 'g',
                    },
                };
            case 'regex-extraction':
                return {
                    class: RegexExtractionPipe,
                    payload: {
                        regex: pipe.regex || '',
                        flag: pipe.flag || 'g',
                    },
                };
            default:
                console.warn(`Unknown pipe type: ${pipe?.type}`);
                // Return a no-op transform instead of null
                return {
                    class: class NoOpPipe {
                        transform(val) {
                            return val;
                        }
                    },
                };
        }
    }); // Don't filter, always return valid transforms
};
exports.mappingTransform = mappingTransform;
/** Looks up a thread-listing pattern descriptor in the payload by key. */
function findPostPattern(key) {
    return payload.data.patternPost.find((p) => p.key === key);
}
/** Looks up a reply pattern descriptor in the payload by key. */
function findReplyPattern(key) {
    return payload.data.patternReply.find((p) => p.key === key);
}
/**
 * Returns `value` as epoch seconds, or the current time when it is falsy or
 * not a finite number (so the later `toISOString()` call cannot throw on it).
 */
function toEpochSeconds(value) {
    const numeric = Number(value);
    return value && Number.isFinite(numeric) ? numeric : Date.now() / 1000;
}
/** Builds the per-thread extraction schema from the payload's patternPost. */
function buildThreadSchema() {
    return {
        // THREAD_ID_PATTERN - Extract numeric ID from URL
        threadId: {
            selector: './/div[@class="structItem-title"]/a/@href',
            type: 'xpath',
            transform: [{ class: ThreadIdExtractorPipe }],
        },
        // THREAD_TITLE_PATTERN
        threadTitle: {
            selector: findPostPattern('THREAD_TITLE_PATTERN')?.pattern ||
                './/div[@class="structItem-title"]/a/text()[normalize-space()]',
            type: 'xpath',
        },
        // THREAD_LINK_PATTERN
        threadLink: {
            selector: findPostPattern('THREAD_LINK_PATTERN')?.pattern ||
                './/div[@class="structItem-title"]/a/@href',
            type: 'xpath',
            transform: [
                { class: ParseAsUrlPipe }, // Convert to absolute URL
                ...mappingTransform(findPostPattern('THREAD_LINK_PATTERN')?.pipe || [
                    { type: 'query-remover', removed: ['s'] },
                ]),
            ],
        },
        // THREAD_REPLIES_PATTERN - From structItem-cell--meta
        threadReplies: {
            selector: './/div[contains(@class,"structItem-cell--meta")]//dl[dt="Replies"]/dd/text()',
            type: 'xpath',
            transform: mappingTransform([{ type: 'num-normalize' }]),
        },
        // THREAD_VIEWS_PATTERN - From structItem-cell--meta
        threadViews: {
            selector: './/div[contains(@class,"structItem-cell--meta")]//dl[dt="Views"]/dd/text()',
            type: 'xpath',
            transform: mappingTransform([{ type: 'num-normalize' }]),
        },
        // THREAD_LAST_POST_PATTERN - From structItem-cell--latest (updated for real HTML structure)
        threadLastPost: {
            selector: './/div[contains(@class,"structItem-cell--latest")]//time/@datetime',
            type: 'xpath',
            transform: mappingTransform(findPostPattern('THREAD_LAST_POST_PATTERN')?.pipe || [
                {
                    type: 'date-format',
                    locale: 'en',
                    format: 'YYYY-MM-DDTHH:mm:ssZZ',
                    timezone: 'Asia/Singapore',
                },
            ]),
        },
    };
}
/** Builds the per-reply extraction schema from the payload's patternReply. */
function buildReplySchema() {
    return {
        // Author
        author: {
            selector: findReplyPattern('author')?.pattern || './@data-author',
            type: 'xpath',
        },
        // Reply date
        replyDate: {
            selector: findReplyPattern('replyDate')?.pattern || './/time[@datetime]/@datetime',
            type: 'xpath',
            transform: mappingTransform(findReplyPattern('replyDate')?.pipe || [
                {
                    type: 'date-format',
                    locale: 'en',
                    format: 'YYYY-MM-DDTHH:mm:ssZZ',
                    timezone: 'Asia/Singapore',
                },
            ]),
        },
        // Reply ID
        replyId: {
            selector: findReplyPattern('replyId')?.pattern || './@data-content',
            type: 'xpath',
            transform: mappingTransform(findReplyPattern('replyId')?.pipe || [
                {
                    type: 'regex-replace',
                    regex: '^post-',
                    textReplacement: '',
                    flag: 'g',
                },
            ]),
        },
        // Reply HTML
        replyHtml: {
            selector: findReplyPattern('replyHtml')?.pattern ||
                './/article[contains(@class,"message-body")]',
            type: 'xpath',
            raw: true, // Return HTML instead of text
        },
        // Reply text (multiple text nodes)
        replyText: {
            selector: findReplyPattern('replyText')?.pattern ||
                './/article[contains(@class,"message-body")]/descendant-or-self::node()/text()[normalize-space()]',
            type: 'xpath',
            multiple: true, // Extract multiple text nodes
        },
        // URL reply
        urlReply: {
            selector: findReplyPattern('urlReply')?.pattern ||
                './/div[contains(@class,"message-attribution-opposite")]//a/@href',
            type: 'xpath',
            transform: mappingTransform(findReplyPattern('urlReply')?.pipe || [{ type: 'parse-as-url' }]),
        },
    };
}
/**
 * Fetches the configured BMW-SG forum page, extracts threads, replies, the
 * section title and pagination links with the payload's XPath patterns, and
 * prints a report to the console.
 *
 * @param {boolean} [verbose=false] - Forwarded to the parser's list
 *   extraction options.
 * @returns {Promise<void>}
 * @throws Re-throws any error raised while fetching or extracting, after
 *   logging it.
 */
async function scrapeBmwSgForum(verbose = false) {
    const parser = new __1.HtmlParserService();
    console.log('🏎️ BMW-SG Forum Scraper Demo');
    console.log('='.repeat(50));
    // inspiration payload configuration
    const endPoint = payload.data.endPoint;
    const baseUrl = parser.getOrigin(endPoint);
    console.log(`🔗 Target URL: ${endPoint}`);
    console.log(`🏠 Base URL: ${baseUrl}`);
    console.log();
    try {
        console.log('📥 Fetching HTML from BMW-SG forum...');
        // Fetch the HTML content from the real BMW-SG forum
        const response = await parser.fetchHtml(endPoint, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            },
        });
        console.log(`✅ HTML fetched successfully (${response.data.length} characters)`);
        console.log(`📊 Response status: ${response.status}`);
        console.log();
        // 1. Thread extraction based on inspiration patternPost
        console.log('📝 Thread Extraction (patternPost)');
        console.log('-'.repeat(40));
        // Build thread schema from payload patterns
        const threadSchema = buildThreadSchema();
        // Extract threads using THREAD_NODE_PATTERN
        const threadNodePattern = findPostPattern('THREAD_NODE_PATTERN')?.pattern ||
            '//div[contains(@class, "structItem--thread")]';
        const threads = parser.extractStructuredList(response.data, threadNodePattern, // THREAD_NODE_PATTERN
        threadSchema, 'xpath', {
            verbose,
            baseUrl,
        });
        console.log(`✅ Extracted ${threads.length} threads:`);
        console.log();
        threads.forEach((thread, index) => {
            console.log(`📄 Thread ${index + 1}:`);
            console.log(` ID: ${thread.threadId}`);
            console.log(` Title: ${thread.threadTitle}`);
            console.log(` Link: ${thread.threadLink}`);
            console.log(` Replies: ${thread.threadReplies}`);
            console.log(` Views: ${thread.threadViews}`);
            const lastPostDate = new Date(toEpochSeconds(thread.threadLastPost) * 1000);
            console.log(` Last Post: ${lastPostDate.toISOString()}`);
            console.log();
        });
        // 2. Reply extraction based on inspiration patternReply
        console.log('💬 Reply Extraction (patternReply)');
        console.log('-'.repeat(40));
        // Build reply schema from payload patterns
        const replySchema = buildReplySchema();
        // Extract replies using container pattern from payload
        const replyContainerPattern = findReplyPattern('container')?.pattern ||
            '//article[contains(@class,"message--post")]';
        const replies = parser.extractStructuredList(response.data, replyContainerPattern, // container pattern
        replySchema, 'xpath', {
            verbose,
            baseUrl,
        });
        console.log(`✅ Extracted ${replies.length} replies:`);
        console.log();
        replies.slice(0, 3).forEach((reply, index) => {
            // Show first 3 replies
            console.log(`💭 Reply ${index + 1}:`);
            console.log(` Author: ${reply.author}`);
            const replyDate = new Date(toEpochSeconds(reply.replyDate) * 1000);
            console.log(` Date: ${replyDate.toISOString()}`);
            console.log(` ID: ${reply.replyId}`);
            // Accept a plain string as well as the expected array of text
            // nodes so a single-value result cannot crash the report.
            let replyTextPreview = 'N/A';
            if (Array.isArray(reply.replyText)) {
                replyTextPreview = reply.replyText.join(' ').substring(0, 100);
            }
            else if (reply.replyText) {
                replyTextPreview = String(reply.replyText).substring(0, 100);
            }
            console.log(` Text (first 100 chars): ${replyTextPreview}...`);
            console.log(` URL: ${reply.urlReply}`);
            console.log();
        });
        // 3. Pagination and section info
        console.log('📄 Page Information');
        console.log('-'.repeat(40));
        // Extract section title using payload pattern
        const sectionTitlePattern = findPostPattern('SECTION_TITLE')?.pattern || '//head/title/text()';
        const sectionTitle = parser.extractSingle(response.data, sectionTitlePattern, 'xpath');
        console.log(`📋 Section Title: ${sectionTitle}`);
        // Extract pagination info using payload pattern
        // NOTE(review): the payload's nextPage pattern selects <a> elements,
        // while the fallback below selects their @href - confirm which one
        // extractMultiple needs in order to yield URLs.
        const nextPagePattern = findReplyPattern('nextPage')?.pattern ||
            '//ul[contains(@class,"pageNav-main")]/li/a[@href and not(@href="#")]/@href';
        const nextPageUrls = parser.extractMultiple(response.data, nextPagePattern, 'xpath', undefined, {
            baseUrl,
            transform: mappingTransform([{ type: 'parse-as-url' }]),
        });
        console.log(`🔗 Available page URLs: ${nextPageUrls.length}`);
        nextPageUrls.slice(0, 5).forEach((url, index) => {
            console.log(` ${index + 1}. ${url}`);
        });
        // 4. Summary statistics
        console.log();
        console.log('📊 Scraping Summary');
        console.log('-'.repeat(40));
        console.log(`🎯 Endpoint: ${endPoint}`);
        console.log(`📝 Threads extracted: ${threads.length}`);
        console.log(`💬 Replies extracted: ${replies.length}`);
        console.log(`📄 Pagination URLs: ${nextPageUrls.length}`);
        console.log(`📋 Section: ${sectionTitle}`);
        console.log(`🏷️ Media ID: ${payload.data.mediaId}`);
        console.log(`🌐 Origin: ${payload.data.origin}`);
        console.log(`⏱️ Interval: ${payload.data.interval}s`);
        console.log(`📄 Page: ${payload.data.page}`);
        console.log(`🔧 Engine: ${payload.data.engine}`);
        // Show some pipe transformation examples
        console.log();
        console.log('🔧 Live Pipe Transformations');
        console.log('-'.repeat(40));
        if (threads.length > 0) {
            const sampleThread = threads[0];
            // NOTE(review): the text below describes the payload's
            // THREAD_ID_PATTERN, but threadId is produced by
            // ThreadIdExtractorPipe on the link href.
            console.log('📝 Thread ID extraction:');
            console.log(` Raw class: "${sampleThread.threadId}"`);
            console.log(` Regex pattern: "\\w*$" → Extract last word`);
            console.log();
        }
        if (replies.length > 0) {
            const sampleReply = replies[0];
            console.log('💭 Reply ID transformation:');
            console.log(` Raw data-content: "post-${sampleReply.replyId}"`);
            console.log(` Regex replace: "^post-" → "" (remove prefix)`);
            console.log(` Result: "${sampleReply.replyId}"`);
            console.log();
        }
    }
    catch (error) {
        console.error('❌ Error during BMW-SG forum scraping:', error);
        throw error;
    }
}
// Run the scraper if this file is executed directly
if (require.main === module) {
    (async () => {
        try {
            await scrapeBmwSgForum(false);
            console.log('🎉 BMW-SG forum scraping completed successfully!');
        }
        catch (error) {
            console.error('💥 Scraping failed:', error);
            process.exit(1);
        }
    })();
}
//# sourceMappingURL=bmw-sg.com.js.map