@hanivanrizky/nestjs-html-parser
Version:
A powerful NestJS HTML parsing service with XPath and CSS selector support, proxy configuration, random user agents, and rich response metadata including headers and status codes
211 lines • 12.4 kB
JavaScript
Object.defineProperty(exports, "__esModule", { value: true });
exports.demonstrateWatanocParser = demonstrateWatanocParser;
const html_parser_service_1 = require("../html-parser.service");
/**
 * Shortens `text` for log output.
 *
 * @param {string|null|undefined} text - Value to preview.
 * @param {number} maxLength - Maximum number of characters kept.
 * @returns {string|null} `null` when `text` is null/undefined, `text` itself
 *   when it already fits, otherwise the first `maxLength` characters followed
 *   by `...`.
 */
function previewText(text, maxLength) {
    if (text == null) {
        // Avoids logging the literal string "undefined..." for a missing match.
        return null;
    }
    if (text.length <= maxLength) {
        return text;
    }
    return `${text.slice(0, maxLength)}...`;
}

/**
 * Builds a schema `transform` that returns the first capture group of
 * `pattern` in the extracted value, or `null` when the pattern does not match.
 *
 * @param {RegExp} pattern - Non-global pattern with at least one capture group.
 * @returns {(value: string) => (string|null)} Transform function.
 */
function firstCapture(pattern) {
    return (value) => {
        const match = value.match(pattern);
        return match ? match[1] : null;
    };
}

/**
 * Produces a printable message for anything that was thrown.
 *
 * @param {unknown} error - The caught value.
 * @returns {string} `error.message` for Error instances, `String(error)` otherwise.
 */
function describeError(error) {
    return error instanceof Error ? error.message : String(error);
}

/**
 * Console walkthrough of HtmlParserService against https://watanoc.com/.
 *
 * Fetches the home page once, then logs the results of a series of
 * extractSingle / extractMultiple / extractAttributes / extractStructured /
 * extractStructuredList / exists / count calls using XPath and CSS selectors.
 *
 * Anything thrown after the parser has been constructed is logged with
 * console.error and NOT rethrown, so the returned promise still resolves.
 *
 * @returns {Promise<void>}
 */
async function demonstrateWatanocParser() {
    const parser = new html_parser_service_1.HtmlParserService();
    try {
        // Fetch HTML from watanoc.com - Japanese learning website
        console.log('Fetching HTML from watanoc.com...');
        const response = await parser.fetchHtml('https://watanoc.com/');
        const html = response.data;
        console.log('\n=== Watanoc.com - Japanese Learning Website Examples ===');

        // Site information
        const siteTitle = parser.extractSingle(html, '//title/text()');
        console.log('Site title:', siteTitle);
        const siteDescription = parser.extractSingle(html, '//meta[@name="description"]/@content');
        console.log('Site description:', siteDescription);

        console.log('\n=== Article Extraction Examples ===');
        // Article titles with difficulty levels (XPath)
        const articleTitles = parser.extractMultiple(html, '//h1[contains(@class, "entry-title") or contains(@id, "article")]//text() | //h2[contains(text(), "n4") or contains(text(), "n5") or contains(text(), "n3")]//text()');
        console.log('Article titles (first 5):', articleTitles.slice(0, 5));
        // Headings that mention a difficulty marker (CSS selector + JS filter)
        const levelMarkers = ['n4', 'n5', 'n3'];
        const difficultyLevels = parser
            .extractMultiple(html, 'h1, h2, h3', 'css')
            .filter((title) => levelMarkers.some((marker) => title.includes(marker)));
        console.log('Articles with difficulty levels:', difficultyLevels.slice(0, 5));

        console.log('\n=== Navigation and Category Examples ===');
        // Navigation categories
        const categories = parser.extractMultiple(html, '//nav//a/text() | //ul//li//a[contains(@href, "category") or contains(text(), "しょくじ") or contains(text(), "かんこう") or contains(text(), "イベント")]/text()');
        console.log('Website categories:', categories.slice(0, 10));
        // Category links
        const categoryLinks = parser.extractAttributes(html, '//a[contains(@href, "category") or contains(text(), "しょくじ") or contains(text(), "かんこう")]', 'href');
        console.log('Category links:', categoryLinks.slice(0, 5));

        // Raw HTML extraction: `raw: true` is set on both schema entries
        console.log('\n=== Raw HTML Extraction Example ===');
        const rawSchema = {
            navHtml: {
                selector: '//nav',
                type: 'xpath',
                raw: true,
            },
            firstArticleTitleHtml: {
                selector: '(//h1[contains(@class, "entry-title") or contains(@id, "article")])[1]',
                type: 'xpath',
                raw: true,
            },
        };
        const rawResult = parser.extractStructured(html, rawSchema);
        // Only the first 200 characters are shown; a missing nav logs null.
        console.log('Raw nav HTML:', previewText(rawResult.navHtml, 200));
        console.log('Raw first article title HTML:', rawResult.firstArticleTitleHtml);

        // Section header kept so the demo output is unchanged. The schema
        // object that used to be built here was never passed to the parser.
        console.log('\n=== Article Metadata Extraction ===');

        console.log('\n=== Food Articles (しょくじ) ===');
        // Food-related articles: container selector plus per-item relative selectors
        const foodArticles = parser.extractStructuredList(html, '//div[contains(text(), "肉玉そば") or contains(text(), "油そば") or contains(text(), "牛丼") or contains(text(), "ぎょうざ") or contains(text(), "ラーメン")]/..', {
            title: {
                selector: './/text()[contains(., "そば") or contains(., "丼") or contains(., "ラーメン")]',
                type: 'xpath',
            },
            difficulty: {
                selector: './/text()[contains(., "n4") or contains(., "n5")]',
                type: 'xpath',
                transform: firstCapture(/(n[4-5])/),
            },
            author: {
                selector: './/text()[contains(., "yusuke") or contains(., "さくら") or contains(., "すずき")]',
                type: 'xpath',
            },
        });
        console.log('Food articles found:', foodArticles.length);
        foodArticles.slice(0, 3).forEach((article, index) => {
            console.log(`\nFood Article ${index + 1}:`);
            // `||` is deliberate: empty strings should also display as N/A.
            console.log(` Title: ${article.title || 'N/A'}`);
            console.log(` Difficulty: ${article.difficulty || 'N/A'}`);
            console.log(` Author: ${article.author || 'N/A'}`);
        });

        console.log('\n=== Popular Articles Extraction ===');
        // Popular articles via CSS selectors
        const popularArticlesTitles = parser.extractMultiple(html, 'h4, .popular-article-title, [class*="popular"] h3, [class*="popular"] h4', 'css');
        console.log('Popular article titles:', popularArticlesTitles.slice(0, 5));

        console.log('\n=== Language Detection and Content ===');
        // Japanese learning content
        const japaneseContent = parser.extractMultiple(html, '//text()[contains(., "ひらがな") or contains(., "カタカナ") or contains(., "漢字") or contains(., "です") or contains(., "ます")]');
        console.log('Japanese learning content samples:', japaneseContent.slice(0, 5));
        // English translations
        const englishTranslations = parser.extractMultiple(html, '//text()[contains(., "sandwich") or contains(., "Festival") or contains(., "Noodles") or contains(., "ice")]');
        console.log('English translations found:', englishTranslations.slice(0, 5));

        console.log('\n=== Author and Date Information ===');
        // Author names, de-duplicated for display
        const authors = parser.extractMultiple(html, '//text()[contains(., "yusuke") or contains(., "さくら") or contains(., "すずき") or contains(., "和タのC")]');
        console.log('Authors found:', [...new Set(authors)]);
        // Japanese-formatted dates
        const japaneseDates = parser.extractMultiple(html, '//text()[contains(., "2016年") and contains(., "月") and contains(., "日")]');
        console.log('Article dates (first 5):', japaneseDates.slice(0, 5));

        console.log('\n=== Tag and Category Analysis ===');
        // Learning tags
        const tags = parser.extractMultiple(html, '//text()[contains(., "N4-Pre-intermediate") or contains(., "N5-Beginner") or contains(., "N3-Intermediate") or contains(., "Listening")]');
        console.log('Learning tags:', tags);
        // Country tags
        const countryTags = parser.extractMultiple(html, '//text()[contains(., "ベトナム") or contains(., "インドネシア") or contains(., "タイ") or contains(., "カナダ") or contains(., "アメリカ")]');
        console.log('Country tags:', countryTags);

        console.log('\n=== Comment and Interaction Data ===');
        // Comment indicators
        const comments = parser.extractMultiple(html, '//text()[contains(., "に ") and contains(., "より")]');
        console.log('Comment indicators (first 5):', comments.slice(0, 5));

        console.log('\n=== Advanced: Calendar and Popular Content ===');
        // Calendar cells; whitespace-only entries are dropped before display
        const calendarData = parser.extractMultiple(html, '//table//td/text() | //text()[contains(., "2025年")]');
        console.log('Calendar data found:', calendarData.filter((item) => item.trim()).slice(0, 10));
        // Single-character ranking numbers 1-7
        const rankings = parser.extractMultiple(html, '//ul[@class="recent_entries thumb"]//text()[. >= "1" and . <= "7" and string-length(.) = 1]');
        console.log('Article rankings:', rankings);

        console.log('\n=== Complete Article Structure Example ===');
        // Mixed XPath/CSS schema with per-field transforms
        const comprehensiveSchema = {
            japaneseTitle: {
                selector: '//text()[contains(., "そば") or contains(., "アイス") or contains(., "フェス")]',
                type: 'xpath',
            },
            englishTitle: {
                selector: 'text',
                type: 'css',
                transform: firstCapture(/([A-Za-z\s]+(?:sandwich|Festival|Noodles|ice|foods))/i),
            },
            difficulty: {
                selector: '//text()[contains(., "n3") or contains(., "n4") or contains(., "n5")]',
                type: 'xpath',
                transform: firstCapture(/(n[3-5])/),
            },
            isPopular: {
                selector: '//text()[contains(., "1") or contains(., "2") or contains(., "3")]',
                type: 'xpath',
                // Explicit radix; non-numeric text yields NaN, which compares false.
                transform: (value) => Number.parseInt(value, 10) <= 7,
            },
            hasEnglishTranslation: {
                selector: '//text()[contains(., "br") or contains(., "<br>")]',
                type: 'xpath',
                transform: (value) => value.includes('br'),
            },
        };
        const siteOverview = parser.extractStructured(html, comprehensiveSchema);
        console.log('Site overview analysis:', siteOverview);

        console.log('\n=== Educational Content Analysis ===');
        // Presence checks for different learning elements
        const hasListening = parser.exists(html, '//text()[contains(., "リスニング") or contains(., "Listening")]');
        const hasQuiz = parser.exists(html, '//text()[contains(., "クイズ") or contains(., "Quiz")]');
        const hasAudio = parser.exists(html, '//text()[contains(., "audio") or contains(., "オーディオ")]');
        console.log('Educational features:');
        console.log(` Has listening exercises: ${hasListening}`);
        console.log(` Has quizzes: ${hasQuiz}`);
        console.log(` Has audio content: ${hasAudio}`);
        // Text-node counts per difficulty marker
        const n3Count = parser.count(html, '//text()[contains(., "n3")]');
        const n4Count = parser.count(html, '//text()[contains(., "n4")]');
        const n5Count = parser.count(html, '//text()[contains(., "n5")]');
        console.log('Content distribution by difficulty:');
        console.log(` N3 (Intermediate): ${n3Count} items`);
        console.log(` N4 (Pre-intermediate): ${n4Count} items`);
        console.log(` N5 (Beginner): ${n5Count} items`);
    }
    catch (error) {
        // Best-effort demo: report the failure instead of rejecting.
        console.error('Error demonstrating Watanoc parser:', describeError(error));
    }
}
// Entry-point guard: start the demo only when this module is the one Node was
// launched with (require.main === module); nothing runs when it is required.
if (require.main === module) {
    const announceCompletion = () => console.log('\nWatanoc.com parsing demo completed!');
    demonstrateWatanocParser().then(announceCompletion).catch(console.error);
}
//# sourceMappingURL=watanoc.com.js.map
;