UNPKG

pupget

Version:

A simple webscrapping package to extract data from a DSL

119 lines (109 loc) 3.32 kB
const puppeteer = require("puppeteer"); const fs = require("fs"); const path = require("path"); // Init and output data object let pupgetInitData = {}, pupgetOutputData = {}; // Starts scraping //process.argv[2] ? readFromStdin() : scrape().catch(console.error); /** * Verifies if the JSON file was passed through standard input */ function readFromStdin() { fs.readFile(process.argv[2], async (err, data) => { if (err) throw err; pupgetInitData = await JSON.parse(data); scrape().catch(console.error); }); } function readFromFile(file) { fs.readFile(file, async (err, data) => { if (err) throw err; pupgetInitData = await JSON.parse(data); scrape().catch(console.error); }); } /** * Read a JSON text with a valid pupget JSON and fed the main pupget object * @param {string} jsonText a valid pupget JSON file */ function readFromInnerJson(jsonText) { pupgetInitData = JSON.parse(jsonText); } /** * Creates the output folder * @param {string} folder the name of the folder */ function createOutputFolder(folder) { if (!fs.existsSync(folder)) { fs.mkdirSync(folder); } } /** * This function will be executed inside the page * @param {string} selector * @returns CSS selector to use in the DOM query */ function myPageFunction(selector) { // Find the first thing matching our selector on the page const elementNode = document.querySelector(selector); // Get the element's innerHTML const html = elementNode.innerHTML; // Return the innerHTML to the code running in Node return html; } /** * Main function */ async function scrape() { // Create main results folder createOutputFolder(pupgetInitData.output.folder); // Create the browser const browser = await puppeteer.launch(); // Main scraping try { // Create a new page (tab) in the browser const page = await browser.newPage(); // Go to base page await page.goto(pupgetInitData.base_page); // Main steps cycle for (const step of pupgetInitData.steps) { if (step.query.results) { const hrefs = await page.$$eval(step.query.query, (links) => links.map((a) => a.href) ); let id = 0; for (const href of hrefs) { pupgetOutputData = { [step.query.output_field]: {} }; const page = await browser.newPage(); await page.goto(href); const pupgetRoot = pupgetOutputData[step.query.output_field]; for (const field of step.query.results) { pupgetRoot[field.output_field] = await page.evaluate( myPageFunction, field.query ); } // Save file fs.writeFileSync( path.join( pupgetInitData.output.folder, `${pupgetInitData.output.filename}${id++}.json` ), JSON.stringify(pupgetOutputData) ); } } } // Catch and log errors } catch (error) { // Handle errors console.error(error); } finally { // Always close the browser await browser.close(); } } exports.readFromFile = readFromFile; exports.readFromInnerJson = readFromInnerJson; exports.scrape = scrape;