UNPKG

pagehtml

Version:

A tool to grab and process a website's html.

331 lines (302 loc) 11.8 kB
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>JSDoc: Source: src/htmlScraper.js</title>

    <script src="scripts/prettify/prettify.js"> </script>
    <script src="scripts/prettify/lang-css.js"> </script>
    <!--[if lt IE 9]>
      <script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
    <![endif]-->
    <link type="text/css" rel="stylesheet" href="styles/prettify-tomorrow.css">
    <link type="text/css" rel="stylesheet" href="styles/jsdoc-default.css">
</head>

<body>

<div id="main">

    <h1 class="page-title">Source: src/htmlScraper.js</h1>

    <section>
        <article>
            <pre class="prettyprint source linenums"><code>"use strict";

const jsdom = require("jsdom");
const { JSDOM } = jsdom;
const UserAgent = require('user-agents');
const puppeteer = require('rebrowser-puppeteer');
const makeTable = require('../utils/makeTable');
const getLinks = require('../utils/getLinks');
const getElementText = require('../utils/getElement');

/** A class representing HTML pages. */
class PageHTML {
    /**
     * Builds a desktop user-agent profile with the user-agents package and
     * records its user-agent string, screen size and platform.
     * No browser is started here; one is launched lazily by get().
     */
    constructor() {
        this.dom = [];
        this.userAgentObject = new UserAgent({deviceCategory: 'desktop'});
        this.userAgent = this.userAgentObject.data.userAgent.toString();
        this.screenWidth = this.userAgentObject.data.screenWidth;
        this.screenHeight = this.userAgentObject.data.screenHeight;
        this.platform = this.userAgentObject.data.platform;
        this.page = null;
        this.browser = null;
    }

    /**
     * A private method to generate the browser and page instances.
     * An existing browser/page pair is reused; a new one is launched only when
     * either property is null.
     * @returns {Promise.&lt;{browser, page}>} browser and page object, elements of Puppeteer class.
     */
    async #createPage() {
        if (this.browser === null || this.page === null) {
            // NOTE(review): the automation-related flags presumably make the
            // session look less like an automated one - confirm before changing.
            const browser = await puppeteer.launch({
                headless: false,
                defaultViewport: {width: this.screenWidth, height: this.screenHeight},
                ignoreDefaultArgs: ['--enable-automation'],
                args: ['--disable-blink-features=AutomationControlled'],
            });
            try {
                const page = await browser.newPage();
                await page.setUserAgent(this.userAgent);
                this.browser = browser;
                this.page = page;
            } catch (error) {
                // The browser is not stored on the instance yet, so close() could
                // never reach it; shut it down here instead of leaking the process.
                await browser.close();
                throw error;
            }
        }
        return {browser: this.browser, page: this.page};
    }

    /**
     * A function that retrieves data from a webpage and appends a JSDOM
     * snapshot of it to the dom property.
     * @param {string} [url] The target url from which to grab data. When left
     * undefined the page that is currently open is captured again without navigating.
     * Any other type is ignored and nothing is captured.
     * @param {string} [referer] Passed to the navigation as its referer option.
     * @returns {Promise.&lt;jsdom.JSDOM[]>} The dom property: every document object model (dom) captured so far.
     */
    async get(url, referer) {
        const { page } = await this.#createPage();
        if (typeof url === "string") {
            await page.goto(url, {referer: referer});
        } else if (url !== undefined) {
            // Unsupported url type: keep the historical behaviour of capturing nothing.
            return this.dom;
        }
        const html = await page.content();
        this.dom.push(new JSDOM(html));
        return this.dom;
    }

    /**
     * A method to clear the array of the dom property of the class.
     * The array is emptied in place, so references returned earlier by get() are emptied too.
     * @returns {null}
     */
    clear() {
        this.dom.length = 0;
        return null;
    }

    /**
     * Logs a failed extraction the same way for every accessor.
     * @param {Error} error The error thrown by the extractor.
     * @param {number} index The dom index that was being processed.
     * @returns {boolean} true when the error was a TypeError, which is reported as an invalid index.
     */
    #logFailure(error, index) {
        const isInvalidIndex = error instanceof TypeError;
        if (isInvalidIndex) {
            console.log(`Invalid Index: There is no JSDOM at index ${index}.`);
        } else {
            // All other errors.
            console.error("Error:", error.message);
        }
        return isInvalidIndex;
    }

    /**
     * Shared index handling for tables(), links() and content().
     * @param {number|number[]} [index] undefined for every dom entry, a number for one entry,
     * or an array of numbers for several entries.
     * @param {function} extract Called with a single JSDOM; its return value is collected.
     * @returns {*} One extract result for a numeric index, an array of results for an
     * array or undefined index, and undefined when a numeric index fails or the index type is unsupported.
     */
    #collect(index, extract) {
        if (index === undefined) {
            // No try/catch here on purpose: errors propagate to the caller, as before.
            const everything = [];
            for (const dom of this.dom) {
                everything.push(extract(dom));
            }
            return everything;
        }

        if (typeof index === 'number') {
            try {
                return extract(this.dom[index]);
            } catch (error) {
                this.#logFailure(error, index);
                return undefined;
            }
        }

        if (Array.isArray(index)) {
            const selected = [];
            for (const i of index) {
                try {
                    selected.push(extract(this.dom[i]));
                } catch (error) {
                    // An invalid index stops the loop; any other error only skips this entry.
                    if (this.#logFailure(error, i)) {
                        break;
                    }
                }
            }
            return selected;
        }

        return undefined;
    }

    /**
     * A method to return all tables present in an instance of the JSDOM class.
     * @param {number|number[]} [index] The index of the dom property to perform this method on.
     * This may be a single number or an array of numbers. If left undefined it will apply to all indexes.
     * @returns {Array|undefined} for a single index, an array of arrays representing rows and columns
     * of each html table; for an array or undefined index, one such result per dom entry.
     * undefined is returned when a single index cannot be processed.
     * @example
     * ```javascript
     *
     * // create the PageHTML class.
     * let pHtml = new PageHTML();
     *
     * // grab the webpage content.
     * await pHtml.get('https://en.wikipedia.org/wiki/List_of_Formula_One_Grand_Prix_winners');
     *
     * // close the webpage.
     * await pHtml.close();
     *
     * // setting index to 0 returns the tables for the 0th JSDOM element in the pHtml.dom property.
     * // additionally we set the array index to 1 to confirm we only want the second table.
     * let tables = pHtml.tables(0)[1];
     * console.log(tables);
     * ```
     * returns the second html table from the webpage as an array of arrays.
     */
    tables(index) {
        return this.#collect(index, (dom) => makeTable(dom));
    }

    /**
     * A method that returns a links object containing relevant information to any href elements in the instance of a JSDOM class.
     * @param {number|number[]} [index] The index of the dom property to perform this method on.
     * This may be a single number or an array of numbers. If left undefined it will apply to all indexes.
     * @returns {Array.&lt;{href: string,
     * nodeName: string,
     * outerHTML: string,
     * innerHTML: string,
     * parentElement: string}>} an array of link objects for a single index; one such array per
     * dom entry for an array or undefined index.
     *
     * @example
     * ```javascript
     *
     * // create the PageHTML class.
     * let pHtml = new PageHTML();
     *
     * // grab the webpage content.
     * await pHtml.get('https://en.wikipedia.org/wiki/List_of_Formula_One_Grand_Prix_winners');
     *
     * // close the webpage.
     * await pHtml.close();
     *
     * // setting index to 0 returns the link objects for the 0th JSDOM element in the pHtml.dom property.
     * // additionally we set the array index to 2 to confirm we only want the third link.
     * let links = pHtml.links(0)[2];
     * console.log(links);
     * ```
     * returns the third href link from the webpage as a link object.
     */
    links(index) {
        return this.#collect(index, (dom) => getLinks(dom));
    }

    /**
     * A method that returns a content object containing the text content
     * and other relevant parameters for an HTML element.
     * @param {string} elementString Forwarded unchanged to getElementText to pick the
     * element(s); the example below passes a CSS-style selector.
     * @param {number|number[]} [index] The index of the dom property to perform this method on.
     * This may be a single number or an array of numbers. If left undefined it will apply to all indexes.
     * @returns {Array.&lt;{elementText: string,
     * nodeName: string,
     * outerHTML: string,
     * innerHTML: string,
     * parentElement: string}>} an array of element objects for a single index; one such array per
     * dom entry for an array or undefined index.
     *
     * @example
     * ```javascript
     * // create the PageHTML class.
     * let pHtml = new PageHTML();
     *
     * // load the webpage.
     * await pHtml.get('https://www.whatsmyua.info/');
     *
     * // close the connection.
     * await pHtml.close();
     *
     * // return the list element with id rawUa.
     * // setting index to 0 returns the 0th JSDOM element in the pHtml.dom property.
     * // additionally we set the array index to 0 to confirm we only want the first instance.
     * let userAgentDetected = pHtml.content('li#rawUa',0)[0];
     *
     * console.log(userAgentDetected);
     * ```
     * returns the string user agent detected by www.whatsmyua.info
     */
    content(elementString, index) {
        return this.#collect(index, (dom) => getElementText(dom, elementString));
    }

    /**
     * Closes the browser instance of Puppeteer and sets the page and browser properties of the PageHTML class to null.
     * Safe to call when no browser was ever launched or when it has already been closed.
     * @returns {Promise.&lt;void>} settles once the browser has shut down.
     */
    async close() {
        // Clear the properties before the first await so that callers who do not
        // await close() still observe them as null immediately.
        const browser = this.browser;
        this.page = null;
        this.browser = null;
        if (browser !== null) {
            await browser.close();
        }
    }
}

module.exports = PageHTML;
</code></pre>
        </article>
    </section>

</div>

<nav>
    <h2><a href="index.html">Home</a></h2><h3>Classes</h3><ul><li><a href="PageHTML.html">PageHTML</a></li></ul><h3>Global</h3><ul><li><a href="global.html#getElementText">getElementText</a></li><li><a href="global.html#getLinks">getLinks</a></li><li><a href="global.html#makeTable">makeTable</a></li></ul>
</nav>

<br class="clear">

<footer>
    Documentation generated by <a href="https://github.com/jsdoc/jsdoc">JSDoc 4.0.4</a> on Wed Feb 05 2025 14:23:55 GMT-0800 (Pacific Standard Time)
</footer>

<script> prettyPrint(); </script>
<script src="scripts/linenumber.js"> </script>
</body>
</html>