newspk
Version:
A TypeScript-based Node.js library for fetching the latest news from Pakistani news sources (Dawn News) in English and Urdu.
96 lines • 4.03 kB
JavaScript
;
// Module interop shim: reuse a helper already present on `this` when there is
// one; otherwise wrap any value not flagged `__esModule` so that it is reachable
// through a `default` property.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.fetchNews = void 0;
const node_fetch_1 = __importDefault(require("node-fetch"));
const jsdom_1 = require("jsdom");
const striptags_1 = __importDefault(require("striptags"));
/**
 * Fetches the full article content from a given URL.
 *
 * Downloads the page, locates the `.story__content` container and collects the
 * tag-stripped text of each of its direct `<p>` children. Paragraphs are joined
 * with a blank line so that the last word of one paragraph does not run into
 * the first word of the next; paragraphs that are empty after stripping are
 * skipped.
 *
 * @param url - The URL of the article to fetch
 * @returns Promise resolving to the article body text ('' when the container
 *          has no non-empty direct `<p>` children)
 * @throws Error if the request fails, the server answers with a non-OK status,
 *         or the content container is missing from the page. The error is
 *         logged with `console.error` before being rethrown.
 */
const getNewsContent = async (url) => {
    try {
        const response = await (0, node_fetch_1.default)(url);
        // An HTTP error status does not reject the fetch promise, so check it
        // explicitly instead of trying to parse an error page as an article.
        if (!response.ok) {
            throw new Error(`Request failed with status ${response.status} ${response.statusText}`);
        }
        const body = await response.text();
        const dom = new jsdom_1.JSDOM(body);
        const contentElement = dom.window.document.querySelector('.story__content');
        if (!contentElement) {
            throw new Error('Content element not found on page');
        }
        const paragraphs = [];
        // Only direct <p> children count; other children (and nested <p>s) are ignored.
        for (const child of Array.from(contentElement.children)) {
            if (child.tagName !== 'P') {
                continue;
            }
            // Trim after stripping so whitespace left behind by removed tags goes too.
            const text = (0, striptags_1.default)(child.innerHTML).trim();
            if (text !== '') {
                paragraphs.push(text);
            }
        }
        return paragraphs.join('\n\n');
    }
    catch (error) {
        console.error(`Error fetching news content from ${url}:`, error);
        throw error;
    }
};
/**
 * Builds one article record from its listing-page elements and the article page.
 *
 * @param mediaItem - The `media__item` element holding the article link and thumbnail
 * @param storyLink - The `story__link` element whose inner HTML is used as the title
 * @param timestamp - The `timeago` element whose `title` is used as the creation time (may be missing)
 * @param index - Position in the listing, used for the fallback unique id
 * @returns Promise resolving to the article record
 * @throws Error if the link or title element is missing, or the article body cannot be fetched
 */
const buildArticle = async (mediaItem, storyLink, timestamp, index) => {
    // Look the anchor up by selector: `firstChild` may be a whitespace text node.
    const anchor = mediaItem.querySelector('a[href]');
    const articleUrl = anchor?.href;
    if (!articleUrl) {
        throw new Error('Article link not found');
    }
    // Check before fetching so that no request is spent on an article that cannot be titled.
    if (!storyLink) {
        throw new Error('Article title element not found');
    }
    // The fifth '/'-separated segment of the URL is used as the id;
    // fall back to a positional id when the URL has no such segment.
    const uniqueId = articleUrl.split('/')[4] || `article_${index}`;
    const articleBody = await getNewsContent(articleUrl);
    return {
        title: storyLink.innerHTML.trim(),
        thumbnail: anchor.querySelector('img')?.src || '',
        body: articleBody,
        unique_id: uniqueId,
        created_at: timestamp?.title || new Date().toISOString(),
    };
};
/**
 * Fetches latest news articles from Dawn News website.
 *
 * Listing elements are paired by position: the i-th `media__item` is combined
 * with the i-th `story__link` and the i-th `timeago` element.
 * NOTE(review): this assumes the three collections line up one-to-one on the
 * page — confirm against the live markup.
 *
 * Articles are fetched concurrently and returned in listing order. An article
 * that fails to process is logged with `console.warn` and left out, so the
 * result may hold fewer entries than requested.
 *
 * @param limit - Number of articles to fetch (default: 5, max: 15). Values are
 *                clamped to 1..15; a value that is not a number falls back to 5.
 * @param lang - Language for news ('english' or 'urdu') (default: 'urdu').
 *               Any value other than 'urdu' (case-insensitive) selects English.
 * @returns Promise resolving to an array of news articles, each with `title`,
 *          `thumbnail`, `body`, `unique_id` and `created_at`
 * @throws Error if the listing page cannot be fetched (including a non-OK HTTP
 *         status) or parsed. The error is logged with `console.error` first.
 */
const fetchNews = async (limit = 5, lang = 'urdu') => {
    try {
        // Validate input. NaN would make every comparison false and silently
        // yield an empty result, so fall back to the default instead. Rounding
        // up keeps the count a fractional limit produced with `i < limit`.
        const requestedLimit = Number(limit);
        const validLimit = Number.isNaN(requestedLimit)
            ? 5
            : Math.min(Math.max(Math.ceil(requestedLimit), 1), 15);
        const newsUrl = lang.toLowerCase() === 'urdu'
            ? 'https://www.dawnnews.tv/latest-news'
            : 'https://www.dawn.com/latest-news';
        // Fetch the main page
        const response = await (0, node_fetch_1.default)(newsUrl);
        // An HTTP error status does not reject the fetch promise; check it explicitly.
        if (!response.ok) {
            throw new Error(`Request for ${newsUrl} failed with status ${response.status} ${response.statusText}`);
        }
        const body = await response.text();
        // Give the document its URL so that relative links resolve to absolute ones.
        const dom = new jsdom_1.JSDOM(body, { url: newsUrl });
        const { document } = dom.window;
        // Extract article elements
        const storyLinks = document.getElementsByClassName('story__link');
        const timestamps = document.getElementsByClassName('timeago');
        const mediaItems = Array.from(document.getElementsByClassName('media__item')).slice(0, validLimit);
        // Process the articles concurrently; a failure only drops that one article.
        const processed = await Promise.all(mediaItems.map(async (mediaItem, i) => {
            try {
                return await buildArticle(mediaItem, storyLinks[i], timestamps[i], i);
            }
            catch (error) {
                console.warn(`Error processing article at index ${i}:`, error);
                return null;
            }
        }));
        return processed.filter((article) => article !== null);
    }
    catch (error) {
        console.error('Error fetching news:', error);
        throw error;
    }
};
exports.fetchNews = fetchNews;
//# sourceMappingURL=scraper.js.map