UNPKG

contract-scraper

Version:

A customisable data scraper for the web based on JSON contracts

134 lines (133 loc) 5.59 kB
"use strict";
// ---------------------------------------------------------------------------
// Module-interop / async helpers. These have the shape of compiler-emitted
// (TypeScript-style) helpers and are kept logically unchanged; only the
// formatting differs from the original single-line form.
// ---------------------------------------------------------------------------
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) return mod;
    var result = {};
    if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
    __setModuleDefault(result, mod);
    return result;
};
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const background_image_1 = __importDefault(require("./src/attribute/background-image"));
const link_1 = __importDefault(require("./src/attribute/link"));
const number_1 = __importDefault(require("./src/attribute/number"));
const size_1 = __importDefault(require("./src/attribute/size"));
const text_1 = __importDefault(require("./src/attribute/text"));
const puppeteer_1 = __importDefault(require("./src/fetcher/puppeteer"));
const html_1 = __importDefault(require("./src/provider/html"));
const script_tag_1 = __importDefault(require("./src/provider/script-tag"));
const contract_schema_1 = __importDefault(require("./src/contract-schema"));
const request_1 = __importDefault(require("./src/fetcher/request"));
const cheerio = __importStar(require("cheerio"));

/**
 * Shared pre-flight check used by both public entry points of `Scraper`.
 *
 * Resolves the attribute map, validates the contract against it, then
 * validates the URL. The contract is validated first, but an invalid URL is
 * reported first when both checks fail.
 *
 * @param {Scraper} scraper - The scraper instance to validate.
 * @returns {Object} The merged attribute map from `scraper.getAttributes()`.
 * @throws {Error} When `scraper.urlIsValid()` returns false.
 * @throws {*} The `message` produced by `scraper.contractIsValid()` when it
 *   is truthy. NOTE(review): for a missing contract this is a plain string,
 *   not an Error; kept as-is so existing callers catch the same value.
 */
function getValidatedAttributes(scraper) {
    const attributes = scraper.getAttributes();
    const { message } = scraper.contractIsValid(attributes);
    if (!scraper.urlIsValid()) {
        throw Error(`The URL "${scraper.url}" you have provided is invalid`);
    }
    if (message) {
        throw message;
    }
    return attributes;
}

/**
 * Scrapes a single URL according to a contract object.
 *
 * The contract decides how the page is fetched (`contract.puppeteer`) and how
 * it is parsed (`contract.scriptTagSelector`); attribute handlers are looked
 * up by name in the merged default + custom attribute map.
 */
class Scraper {
    /**
     * @param {string} url - Address of the page to scrape; checked with `new URL()`.
     * @param {Object} contract - Scraping contract, validated against the
     *   schema built by `./src/contract-schema`.
     * @param {Object} [attributes={}] - Custom attribute handlers keyed by
     *   name; entries override the defaults with the same key.
     * @param {*} [puppeteerOptions] - Passed through untouched to the
     *   puppeteer fetcher.
     */
    constructor(url, contract, attributes = {}, puppeteerOptions) {
        this.defaultAttributes = {
            'background-image': background_image_1.default,
            link: link_1.default,
            number: number_1.default,
            size: size_1.default,
            text: text_1.default,
        };
        this.url = url;
        this.contract = contract;
        this.attributes = attributes;
        this.puppeteerOptions = puppeteerOptions;
    }

    /**
     * Validates the scraper, fetches the page and returns the scraped items.
     *
     * Validation failures are thrown synchronously (before any promise is
     * created); fetch/parse failures surface through the returned promise.
     *
     * @returns {Promise<*>} Resolves with whatever the selected provider's
     *   `getScrapedItems()` returns.
     * @throws {Error|*} See `getValidatedAttributes`.
     */
    scrapePage() {
        const attributes = getValidatedAttributes(this);
        const fetcher = this.getFetcher();
        return fetcher.getPage().then((page) => {
            return this.getScrapedItems(page, attributes);
        });
    }

    /**
     * Validates the scraper, fetches the page and returns it together with a
     * cheerio handle loaded from `page.contents`.
     *
     * Unlike `scrapePage`, validation failures reject the returned promise
     * because the whole body runs inside the async wrapper.
     *
     * @returns {Promise<{page: *, $: *}>} The fetched page and the result of
     *   `cheerio.load(page.contents)`.
     */
    getPageContents() {
        return __awaiter(this, void 0, void 0, function* () {
            getValidatedAttributes(this);
            const fetcher = this.getFetcher();
            const page = yield fetcher.getPage();
            return {
                page,
                $: cheerio.load(page.contents),
            };
        });
    }

    /**
     * Delegates extraction to the provider selected by `getProvider`.
     *
     * @param {*} page - Page object returned by a fetcher.
     * @param {Object} attributes - Merged attribute map.
     * @returns {*} The provider's `getScrapedItems()` result.
     */
    getScrapedItems(page, attributes) {
        return this.getProvider(page, attributes).getScrapedItems();
    }

    /**
     * Builds the effective attribute map: defaults overlaid with the custom
     * attributes supplied to the constructor.
     *
     * A fresh object is returned on every call so that `defaultAttributes` is
     * never mutated (previously the custom attributes were written into it).
     *
     * @returns {Object} New object containing default and custom attributes.
     */
    getAttributes() {
        return Object.assign({}, this.defaultAttributes, this.attributes);
    }

    /**
     * @returns {boolean} True when `this.url` can be parsed by `new URL()`.
     */
    urlIsValid() {
        try {
            new URL(this.url);
            return true;
        }
        catch (e) {
            // A parse failure is the expected "invalid" signal, not an error
            // worth propagating.
            return false;
        }
    }

    /**
     * Validates `this.contract` against the schema generated for the given
     * attribute names.
     *
     * @param {Object} attributes - Attribute map whose keys are handed to the
     *   schema factory.
     * @returns {{message: *}} `message` is a fixed string when the contract is
     *   null/undefined, otherwise the `error` field returned by
     *   `schema.validate` (presumably undefined on success; verify against
     *   the schema library).
     */
    contractIsValid(attributes) {
        if (this.contract === null || this.contract === undefined) {
            return {
                message: 'Your contract is invalid, please check the specifications',
            };
        }
        const schema = (0, contract_schema_1.default)(Object.keys(attributes));
        const { error } = schema.validate(this.contract);
        return { message: error };
    }

    /**
     * Chooses the fetcher implementation for this contract.
     *
     * @returns {*} A puppeteer fetcher when `contract.puppeteer === true`
     *   (strictly), otherwise a request fetcher.
     */
    getFetcher() {
        if (this.contract.puppeteer === true) {
            return new puppeteer_1.default(this.url, this.contract.waitForPageLoadSelector, this.puppeteerOptions);
        }
        return new request_1.default(this.url);
    }

    /**
     * Chooses the provider implementation for this contract.
     *
     * @param {*} page - Page object returned by a fetcher.
     * @param {Object} attributes - Merged attribute map.
     * @returns {*} A script-tag provider when `contract.scriptTagSelector` is
     *   truthy, otherwise an HTML provider.
     */
    getProvider(page, attributes) {
        if (this.contract.scriptTagSelector) {
            return new script_tag_1.default(page, this.contract, attributes);
        }
        return new html_1.default(page, this.contract, attributes);
    }
}
exports.default = Scraper;