UNPKG

newspk

Version:

A TypeScript-based Node.js library for fetching latest news from Pakistani news sources (Dawn News) in English and Urdu.

github.com/MazanLabeeb/newspk

MazanLabeeb/newspk

96 lines • 4.03 kB

JavaScript

"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.fetchNews = void 0;
const node_fetch_1 = __importDefault(require("node-fetch"));
const jsdom_1 = require("jsdom");
const striptags_1 = __importDefault(require("striptags"));

/** Number of articles returned when `limit` is omitted or is not a number. */
const DEFAULT_LIMIT = 5;
/** Upper bound applied to `limit`. */
const MAX_LIMIT = 15;

/**
 * Downloads a page and parses it into a DOM document.
 * @param url - URL of the page to download
 * @returns Promise resolving to the parsed document
 * @throws Error if the request fails or the server answers with a non-2xx status
 */
const fetchDocument = async (url) => {
    const response = await (0, node_fetch_1.default)(url);
    // Without this check an HTTP error page would be parsed as if it were
    // the requested content and the failure would surface later (or never).
    if (!response.ok) {
        throw new Error(`Request to ${url} failed with HTTP status ${response.status}`);
    }
    const body = await response.text();
    return new jsdom_1.JSDOM(body).window.document;
};

/**
 * Fetches the full article content from a given URL.
 *
 * The text is built from the `<p>` elements that are direct children of the
 * `.story__content` element; their markup is stripped and the results are
 * concatenated without a separator.
 *
 * @param url - The URL of the article to fetch
 * @returns Promise resolving to the article body text
 * @throws Error if the request fails, the server answers with a non-2xx
 *         status, or the page has no `.story__content` element. The error is
 *         logged with `console.error` before being rethrown.
 */
const getNewsContent = async (url) => {
    try {
        const document = await fetchDocument(url);
        const contentElement = document.querySelector('.story__content');
        if (!contentElement) {
            throw new Error('Content element not found on page');
        }
        return Array.from(contentElement.children)
            .filter((child) => child.tagName === 'P')
            .map((paragraph) => (0, striptags_1.default)(paragraph.innerHTML.trim()))
            .join('');
    }
    catch (error) {
        console.error(`Error fetching news content from ${url}:`, error);
        throw error;
    }
};

/**
 * Fetches latest news articles from Dawn News website.
 *
 * Articles are processed one after another. An article that cannot be
 * processed is logged with `console.warn` and skipped, so the result may
 * contain fewer entries than requested.
 *
 * @param limit - Number of articles to fetch (default: 5, clamped to 1..15;
 *                a value that is not a number falls back to the default)
 * @param lang - Language for news ('english' or 'urdu') (default: 'urdu');
 *               any value other than 'urdu' (case-insensitive) selects English
 * @returns Promise resolving to an array of news articles, each with
 *          `title`, `thumbnail`, `body`, `unique_id` and `created_at`
 * @throws Error if the listing page cannot be fetched (including a non-2xx
 *         HTTP status) or parsed. The error is logged with `console.error`
 *         before being rethrown.
 */
const fetchNews = async (limit = 5, lang = 'urdu') => {
    try {
        // Validate input. A NaN limit would otherwise survive the clamp and
        // make the loop below silently return no articles.
        const requestedLimit = Number(limit);
        const validLimit = Number.isNaN(requestedLimit)
            ? DEFAULT_LIMIT
            : Math.min(Math.max(requestedLimit, 1), MAX_LIMIT);
        const newsUrl = lang.toLowerCase() === 'urdu'
            ? 'https://www.dawnnews.tv/latest-news'
            : 'https://www.dawn.com/latest-news';
        // Fetch the main page
        const document = await fetchDocument(newsUrl);
        // Extract article elements
        // NOTE(review): the three collections are indexed in parallel, which
        // assumes the i-th link, media item and timestamp belong to the same
        // article — confirm against the live markup.
        const storyLinks = document.getElementsByClassName('story__link');
        const mediaItems = document.getElementsByClassName('media__item');
        const timestamps = document.getElementsByClassName('timeago');
        const articles = [];
        // Process each article
        for (let i = 0; i < validLimit && i < mediaItems.length; i++) {
            try {
                // firstElementChild skips whitespace text nodes, which
                // firstChild would return for pretty-printed markup.
                const linkElement = mediaItems[i].firstElementChild;
                const articleUrl = linkElement?.href;
                if (!articleUrl) {
                    throw new Error('Article link not found in media item');
                }
                // Extract unique ID from URL (fifth '/'-separated segment)
                const uniqueId = articleUrl.split('/')[4] || `article_${i}`;
                // Fetch article content
                const articleBody = await getNewsContent(articleUrl);
                // Extract article data
                const storyLink = storyLinks[i];
                const imgElement = linkElement.firstElementChild?.firstElementChild;
                articles.push({
                    title: storyLink.innerHTML.trim(),
                    thumbnail: imgElement?.src || '',
                    body: articleBody,
                    unique_id: uniqueId,
                    created_at: timestamps[i]?.title || new Date().toISOString(),
                });
            }
            catch (error) {
                // Best effort: one broken article must not fail the whole batch.
                console.warn(`Error processing article at index ${i}:`, error);
                continue;
            }
        }
        return articles;
    }
    catch (error) {
        console.error('Error fetching news:', error);
        throw error;
    }
};
exports.fetchNews = fetchNews;
//# sourceMappingURL=scraper.js.map