UNPKG

bnu-scrapper

Version:

A module to scrape PDF links from the Bengaluru North University syllabus page

87 lines (74 loc) 2.77 kB
const axios = require('axios');
const cheerio = require('cheerio');
const url = require('url');
const https = require('https');

// NOTE(review): `rejectUnauthorized: false` turns off TLS certificate
// verification for every request made through this instance, which exposes the
// scraper to man-in-the-middle tampering. Presumably the target site's
// certificate chain fails verification with Node's default CA store — confirm,
// and prefer supplying the missing CA certificate via the `ca` option instead.
const axiosInstance = axios.create({
  httpsAgent: new https.Agent({ rejectUnauthorized: false }),
});

/**
 * Builds a `{ title, url }` record for an anchor if its href looks like a PDF.
 *
 * An href qualifies when it contains "pdf" anywhere (case-insensitive); this
 * is the same set the original `endsWith('.pdf') || includes('pdf')` test
 * accepted, since the second clause subsumes the first.
 *
 * @param {string|undefined} href - Raw `href` attribute of the anchor.
 * @param {string} title - Anchor text; surrounding whitespace is trimmed.
 * @param {string} baseUrl - Base used to resolve relative hrefs.
 * @returns {{title: string, url: string}|null} The link record, or `null` when
 *   the href is missing, is not PDF-like, or cannot be resolved to a URL.
 */
function buildPdfLink(href, title, baseUrl) {
  if (!href || !href.toLowerCase().includes('pdf')) {
    return null;
  }
  try {
    // WHATWG URL replaces the legacy `url.resolve`. It throws on malformed
    // input, so resolution is guarded per link: one bad href must not discard
    // every other link collected from the page.
    return { title: title.trim(), url: new url.URL(href, baseUrl).href };
  } catch (error) {
    console.error(`Skipping unresolvable link "${href}": ${error.message}`);
    return null;
  }
}

/**
 * Scrapes PDF links from the Bengaluru North University syllabus page.
 */
class BnuPdfScraper {
  constructor() {
    this.baseUrl = 'https://bnu.karnataka.gov.in';
    this.targetUrl = `${this.baseUrl}/page/Students+Corner/Syllabus+and+Text+Books/en`;
    this.userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0';
    // Upper bound for the page request, in milliseconds. Without it a stalled
    // server would leave the scraper waiting indefinitely.
    this.timeoutMs = 30000;
  }

  /**
   * Fetches the syllabus page and collects every PDF-like link found in the
   * archive table and in the list that follows the "UUCMS and Sylabus" anchor.
   *
   * This method is best-effort: any failure (network, timeout, parsing) is
   * logged to stderr and reported as an empty array; it never rejects.
   *
   * @returns {Promise<Array<{title: string, url: string}>>} Links in document
   *   order per selector, archive-table links first.
   */
  async scrapePdfLinks() {
    try {
      const response = await axiosInstance.get(this.targetUrl, {
        headers: { 'User-Agent': this.userAgent },
        timeout: this.timeoutMs,
      });
      const $ = cheerio.load(response.data);

      // The selector text (including the "Sylabus" spelling) must match the
      // page's own markup, so it is kept exactly as is.
      const anchorGroups = [
        $('#table-archive p a'),
        $("a:contains('UUCMS and Sylabus')").next('ul').find('a'),
      ];

      const pdfLinks = [];
      for (const anchors of anchorGroups) {
        anchors.each((i, element) => {
          const anchor = $(element);
          const link = buildPdfLink(anchor.attr('href'), anchor.text(), this.baseUrl);
          if (link !== null) {
            pdfLinks.push(link);
          }
        });
      }
      return pdfLinks;
    } catch (error) {
      console.error(`Error during scraping: ${error.message}`);
      return [];
    }
  }

  /**
   * Runs the scrape and wraps the result with source and timing metadata.
   *
   * Because `scrapePdfLinks` reports failures as an empty array,
   * `totalLinks: 0` may mean either "no PDFs on the page" or "scrape failed".
   *
   * @returns {Promise<{source: string, timestamp: string, totalLinks: number,
   *   links: Array<{title: string, url: string}>}>} JSON-serializable summary.
   * @throws {Error} Rethrows any unexpected error after logging it.
   */
  async scrapeToJson() {
    try {
      const pdfLinks = await this.scrapePdfLinks();
      return {
        source: this.targetUrl,
        timestamp: new Date().toISOString(),
        totalLinks: pdfLinks.length,
        links: pdfLinks,
      };
    } catch (error) {
      console.error(`Error generating JSON: ${error.message}`);
      throw error;
    }
  }
}

module.exports = BnuPdfScraper;

// CLI entry point: print the scrape result as pretty-printed JSON.
if (require.main === module) {
  (async () => {
    try {
      const scraper = new BnuPdfScraper();
      const result = await scraper.scrapeToJson();
      console.log(JSON.stringify(result, null, 2));
    } catch (error) {
      // Handle the rejection here so the promise is never left floating, and
      // signal failure to the calling shell.
      console.error(`Scrape failed: ${error.message}`);
      process.exitCode = 1;
    }
  })();
}