bnu-scrapper
Version:
A module to scrape PDF links from Bengaluru North University syllabus page
87 lines (74 loc) • 2.77 kB
JavaScript
const axios = require('axios');
const cheerio = require('cheerio');
const url = require('url');
const https = require('https');
// NOTE(review): `rejectUnauthorized: false` turns off TLS certificate
// verification for every request sent through this agent, which leaves the
// connection open to man-in-the-middle tampering. Presumably this works around
// a certificate problem on the target site — confirm it is still required
// before relying on the scraped data.
const insecureHttpsAgent = new https.Agent({ rejectUnauthorized: false });

// Shared HTTP client used for all requests made by this module.
const axiosInstance = axios.create({ httpsAgent: insecureHttpsAgent });
/**
 * Converts a raw anchor into a `{ title, url }` record when it points at a PDF.
 *
 * @param {string} baseUrl - Base URL that relative hrefs are resolved against.
 * @param {string|undefined} href - Raw value of the anchor's `href` attribute.
 * @param {string} text - Anchor text; it is trimmed to form the title.
 * @returns {{title: string, url: string}|null} The link record, or `null` when
 *   the href is missing, does not mention "pdf", or cannot be parsed as a URL.
 */
function toPdfLink(baseUrl, href, text) {
  // Any case-insensitive occurrence of "pdf" qualifies; this also covers
  // hrefs that end in ".pdf".
  if (!href || !href.toLowerCase().includes('pdf')) {
    return null;
  }
  try {
    return { title: text.trim(), url: new URL(href, baseUrl).href };
  } catch (error) {
    // `new URL` throws on malformed hrefs. Skip just this link so a single bad
    // anchor does not discard every other link found on the page.
    console.error(`Skipping malformed link "${href}": ${error.message}`);
    return null;
  }
}

/**
 * Scrapes PDF links from the Bengaluru North University syllabus page.
 */
class BnuPdfScraper {
  /**
   * Sets the site base URL, the syllabus page URL, the User-Agent header value
   * and the request timeout used by the scraper.
   */
  constructor() {
    this.baseUrl = 'https://bnu.karnataka.gov.in';
    this.targetUrl = `${this.baseUrl}/page/Students+Corner/Syllabus+and+Text+Books/en`;
    this.userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0';
    // axios applies no timeout by default, so a stalled server would hang the
    // scrape forever; bound the request instead.
    this.timeoutMs = 30000;
  }

  /**
   * Downloads the syllabus page and collects every anchor that looks like a
   * PDF link from the two page regions the scraper knows about.
   *
   * This method never rejects: any failure (network, HTTP status, parsing) is
   * logged to stderr and an empty array is returned.
   *
   * @returns {Promise<Array<{title: string, url: string}>>} Links in page
   *   order, first from the archive table and then from the UUCMS list.
   */
  async scrapePdfLinks() {
    try {
      const response = await axiosInstance.get(this.targetUrl, {
        headers: {
          'User-Agent': this.userAgent,
        },
        timeout: this.timeoutMs,
      });
      const $ = cheerio.load(response.data);
      const pdfLinks = [];

      // Shared visitor for both selections. It must not return `false`,
      // because cheerio's `each` stops iterating when the callback does.
      const collect = (index, element) => {
        const anchor = $(element);
        const link = toPdfLink(this.baseUrl, anchor.attr('href'), anchor.text());
        if (link) {
          pdfLinks.push(link);
        }
      };

      $('#table-archive p a').each(collect);
      // The selector text is matched against the page content, so its spelling
      // ("Sylabus") presumably mirrors the live page — verify before "fixing".
      $("a:contains('UUCMS and Sylabus')").next('ul').find('a').each(collect);

      return pdfLinks;
    } catch (error) {
      console.error(`Error during scraping: ${error.message}`);
      return [];
    }
  }

  /**
   * Runs the scrape and wraps the links in a JSON-serialisable report.
   *
   * @returns {Promise<{source: string, timestamp: string, totalLinks: number,
   *   links: Array<{title: string, url: string}>}>} Report containing the page
   *   URL, an ISO-8601 timestamp, the link count and the links themselves.
   * @throws Rethrows, after logging, any error raised while building the report.
   */
  async scrapeToJson() {
    try {
      const pdfLinks = await this.scrapePdfLinks();
      return {
        source: this.targetUrl,
        timestamp: new Date().toISOString(),
        totalLinks: pdfLinks.length,
        links: pdfLinks,
      };
    } catch (error) {
      console.error(`Error generating JSON: ${error.message}`);
      throw error;
    }
  }
}
module.exports = BnuPdfScraper;

// CLI entry point: when this file is run directly (not required as a module),
// scrape once and print the JSON report to stdout.
if (require.main === module) {
  (async () => {
    const scraper = new BnuPdfScraper();
    const result = await scraper.scrapeToJson();
    console.log(JSON.stringify(result, null, 2));
  })().catch((error) => {
    // Without this handler the promise would be left floating; report the
    // failure and signal it to the calling shell through the exit code.
    console.error(`Scraping failed: ${error.message}`);
    process.exitCode = 1;
  });
}