UNPKG

catalog.js

Version:

A JavaScript / TypeScript API for the Middlebury course catalog.

145 lines 5.21 kB
import xml2js from "xml2js";
import Catalog from "./Catalog.js";
import Param from "./Param.js";
import fs from "fs";

/** Base of the catalog XML search URL; the query string is appended to it. */
const MIDD_URL_BASE =
  "http://catalog.middlebury.edu/offerings/searchxml/catalog-MCUG?";

/** Offering-type codes included in the default search, in request order. */
const DEFAULT_OFFERING_TYPES = [
  "LCT",
  "LAB",
  "DSC",
  "DR1",
  "DR2",
  "PE",
  "PLB",
  "SCR",
  "SEM",
];

/**
 * Search parameters used when the caller supplies none. Names and values are
 * already percent-encoded because `_getScraperURL` appends them verbatim.
 */
const DEFAULT_SEARCH_PARAMS = [
  ...DEFAULT_OFFERING_TYPES.map((code) =>
    new Param("type%5B%5D", `genera%3Aoffering%2F${code}`).getObject()
  ),
  new Param("location%5B%5D", "resource%2Fplace%2Fcampus%2FM").getObject(),
  new Param("search", "Search").getObject(),
];

/** Two-digit season suffix for each season letter accepted in the short `SYY` term form. */
const SEASON_CODES = Object.freeze({
  F: "90",
  W: "10",
  S: "20",
});

/** Century prefix assumed when expanding a two-digit year. */
const CENTURY = "20";

/**
 * A scraper class to scrape the data from the course page.
 *
 * Fetches (or reads from disk) the catalog XML, converts it to a JSON string
 * stored in `raw`, and wraps that string in a `Catalog`.
 */
export default class Scraper {
  /** `Catalog` built from `raw`; undefined until a parse has run. */
  catalog;
  /** JSON string of the xml2js parse result; "" until a scrape has run. */
  raw;
  /** URL (or file path) the data comes from. */
  href;
  /** Term in `YYYYSS` form; only set when a term was supplied. */
  term;
  /** Shallow copies of the search parameter objects used to build `href`. */
  searchParameters;

  /**
   * @param {object} obj
   * @param {string|number} [obj.term] - Term as `YYYYSS` or `SYY` (e.g. "F21").
   * @param {Array<{name: string, value: string}>} [obj.searchParameters] -
   *   Pre-encoded query parameters; defaults to `DEFAULT_SEARCH_PARAMS`.
   * @param {string} [obj.filepath] - Path of a local XML file to use instead.
   * @throws {Error} If neither `term` nor `filepath` is given, or the term
   *   format is invalid.
   */
  constructor(obj) {
    // Tolerate a missing options object so the caller gets the descriptive
    // error below rather than a destructuring TypeError.
    const { term, searchParameters, filepath } = obj ?? {};

    if (!term && !filepath) {
      throw new Error("Term or file must be defined");
    }

    if (term) {
      this.term = this._convertTerm(term);
      // Copy each parameter so later mutation cannot leak into the shared defaults.
      this.searchParameters = (searchParameters || DEFAULT_SEARCH_PARAMS).map(
        (param) => ({ ...param })
      );
      this.href = this._getScraperURL();
      // Empty until a scrape has been performed.
      this.raw = "";
    }

    if (filepath) {
      // When both term and filepath are given, the file path wins as `href`.
      this.href = filepath;
      this.raw = "";
    }
  }

  /**
   * Normalizes a term to the `YYYYSS` form used in the search URL.
   *
   * @param {string|number} term - Either six digits (returned unchanged, as a
   *   string) or a season letter (F, W or S) followed by a two-digit year.
   * @returns {string} The term in `YYYYSS` form, e.g. "F21" -> "202190".
   * @throws {Error} If the term matches neither format or uses an unknown
   *   season letter.
   */
  _convertTerm(term) {
    const value = String(term);

    if (/^[0-9]{6}$/.test(value)) {
      return value;
    }

    // Anchored at both ends: an unanchored match would accept inputs such as
    // "XF21" and build a malformed term from them.
    const shortForm = /^([A-Z])([0-9]{2})$/.exec(value);
    const seasonCode = shortForm ? SEASON_CODES[shortForm[1]] : undefined;
    if (seasonCode === undefined) {
      throw new Error(`Invalid term format: ${value}`);
    }

    return `${CENTURY}${shortForm[2]}${seasonCode}`;
  }

  /**
   * Returns the catalog, scraping and parsing first if that has not happened.
   *
   * NOTE(review): this always goes through `scrape()`, i.e. `fetch(this.href)`.
   * For an instance built only from a `filepath`, `href` is that path —
   * confirm whether such callers are expected to use `getCatalogFromFile`.
   *
   * @returns {Promise<Catalog>}
   */
  async getCatalog() {
    if (!this.raw) {
      await this.scrape();
    }
    if (!this.catalog) {
      await this.parse();
    }
    return this.catalog;
  }

  /**
   * Returns a catalog built from a local XML file.
   *
   * NOTE(review): the file is only read when `raw` is still empty, so an
   * earlier scrape's data is reused with the new path — confirm intended.
   *
   * @param {string} filepath - Path of the XML file.
   * @returns {Promise<Catalog>}
   */
  async getCatalogFromFile(filepath) {
    this.href = filepath;
    if (!this.raw) {
      await this.scrapeFromFile(filepath);
    }
    await this.parseFromFile(filepath);
    return this.catalog;
  }

  /**
   * Reads an XML file from disk and stores its JSON form in `raw`.
   *
   * @param {string} filepath - Path of the XML file.
   * @returns {Promise<Scraper>} This instance.
   */
  async scrapeFromFile(filepath) {
    const xml = fs.readFileSync(filepath, "utf8");
    this.raw = await this._toRawJson(xml);
    return this;
  }

  /**
   * Builds `catalog` from the current `raw`, recording `filepath` as its href.
   *
   * @param {string} filepath - Path recorded on the catalog.
   * @returns {Promise<Scraper>} This instance.
   */
  async parseFromFile(filepath) {
    this.catalog = new Catalog({ raw: this.raw, href: filepath });
    return this;
  }

  /**
   * Fetches `href` and stores the JSON form of the returned XML in `raw`.
   *
   * @returns {Promise<Scraper>} This instance.
   * @throws {Error} If the response status is not OK.
   */
  async scrape() {
    const response = await globalThis.fetch(this.href, {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;",
        "Accept-Encoding": "gzip, deflate, br",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "catalog.middlebury.edu",
      },
    });

    if (!response.ok) {
      throw new Error(`Could not fetch ${this.href}`);
    }

    this.raw = await this._toRawJson(await response.text());
    return this;
  }

  /**
   * Builds `catalog` from the current `raw` and `href`.
   *
   * @returns {Promise<Scraper>} This instance.
   * @throws {Error} If nothing has been scraped yet.
   */
  async parse() {
    if (!this.raw) {
      throw new Error("Scraper must scrape before parsing");
    }
    this.catalog = new Catalog({ raw: this.raw, href: this.href });
    return this;
  }

  /**
   * Converts an XML document to the JSON string kept in `raw`.
   *
   * @param {string} xml - XML text.
   * @returns {Promise<string>} `JSON.stringify` of the xml2js parse result.
   */
  async _toRawJson(xml) {
    const parser = new xml2js.Parser();
    return JSON.stringify(await parser.parseStringPromise(xml));
  }

  /**
   * Builds the search URL from `term` and `searchParameters`.
   *
   * Parameter names and values are appended as-is (no encoding is applied
   * here), so they must already be percent-encoded.
   *
   * @returns {string} The full search URL.
   */
  _getScraperURL() {
    const termPart = this.term ? `term=term-${this.term}` : "";
    const paramPart = (this.searchParameters || [])
      .map((param) => `&${param.name}=${param.value}`)
      .join("");
    return `${MIDD_URL_BASE}${termPart}${paramPart}`;
  }
}
//# sourceMappingURL=Scraper.js.map