UNPKG

contract-scraper

Version:

A customisable data scraper for the web based on JSON contracts

86 lines (85 loc) 3.71 kB
"use strict";
// ---------------------------------------------------------------------------
// TypeScript-emitted CommonJS interop helpers. Each one reuses an existing
// definition found on `this` before falling back to the inline implementation.
// ---------------------------------------------------------------------------

// Re-exports property `k` of module `m` on object `o` under the name `k2`
// (defaults to `k`), via a getter where Object.create is available.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));

// Sets `o.default` to `v`.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});

// Returns `mod` untouched when it is flagged `__esModule`; otherwise builds a
// namespace object that binds every own, non-"default" key of `mod` and
// exposes `mod` itself as `default`.
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) return mod;
    var result = {};
    if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
    __setModuleDefault(result, mod);
    return result;
};

// Returns `mod` untouched when it is flagged `__esModule`; otherwise wraps it
// as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};

Object.defineProperty(exports, "__esModule", { value: true });
const cheerio = __importStar(require("cheerio"));
const cheerio_eq_1 = __importDefault(require("cheerio-eq"));
const lodash_get_1 = __importDefault(require("lodash.get"));

/**
 * Provider that reads a JSON payload from the element matched by the
 * contract's `scriptTagSelector` and maps it onto the contract's attributes.
 *
 * The payload is located and parsed once, in the constructor, and kept on
 * `this.contents`; `getScrapedItems()` then walks it with lodash.get paths.
 */
class ScriptTagProvider {
    /**
     * @param {object} page - Fetched page. `page.contents` is handed to
     *   `cheerio.load` and `page.url` is forwarded to every attribute mapper.
     * @param {object} contract - Scraping contract. This class reads
     *   `scriptTagSelector`, `itemSelector` and `attributes` from it.
     * @param {object} attributes - Lookup of attribute type name to a mapper
     *   that is called as `mapper(value, page.url)`.
     * @throws {SyntaxError} If the selected element's contents are not valid
     *   JSON (raised by `JSON.parse`).
     */
    constructor(page, contract, attributes) {
        this.page = page;
        this.contract = contract;
        this.attributes = attributes;
        this.$ = cheerio.load(this.page.contents);
        this.contents = this.parseScriptTagContents();
    }

    /**
     * Resolves one attribute of an item and converts it with the mapper
     * registered for the attribute's type.
     *
     * @param {*} item - Object (or fragment of the parsed payload) to read from.
     * @param {{type: string, selector: string}} options - Attribute definition:
     *   `selector` is a lodash.get path into `item`, `type` names the mapper.
     * @returns {*} Whatever the mapper returns for the resolved value.
     * @throws {Error} If no mapper is registered under `options.type`.
     */
    mapElementToProperty(item, options) {
        const { type, selector } = options;
        const value = (0, lodash_get_1.default)(item, selector);
        const attributeType = this.attributes[type];
        if (attributeType === undefined) {
            throw new Error(`The attribute type ${type} isn't defined, did you pass it to the scraper?`);
        }
        return attributeType(value, this.page.url);
    }

    /**
     * Finds the element matched by `contract.scriptTagSelector` and parses its
     * inner HTML as JSON.
     *
     * @returns {*} The parsed JSON value, or `undefined` when the element is
     *   missing or has no contents.
     * @throws {SyntaxError} If the contents are not valid JSON.
     */
    parseScriptTagContents() {
        const { scriptTagSelector } = this.contract;
        // NOTE(review): cheerio-eq is invoked as (cheerioInstance, selector);
        // presumably it adds `:eq()` support on top of cheerio - confirm.
        const contents = (0, cheerio_eq_1.default)(this.$, scriptTagSelector).html();
        if (!contents || contents.length === 0) {
            return undefined;
        }
        return JSON.parse(contents);
    }

    /**
     * Builds one plain object per item found in the parsed payload.
     *
     * Items are taken from `contract.itemSelector` (a lodash.get path); when
     * that path resolves to a falsy value the whole payload is used instead.
     * A single non-array value is treated as a one-item list.
     *
     * Attributes that declare their own `itemSelector` become arrays of child
     * objects built from `options.attributes`; every other attribute is mapped
     * directly through `mapElementToProperty`.
     *
     * @returns {object[]} Scraped items; empty when no payload was parsed.
     * @throws {Error} If an attribute refers to an unregistered type.
     */
    getScrapedItems() {
        // Nothing was parsed (script tag missing/empty, or JSON `null`):
        // report "no items" rather than a phantom item whose attributes are
        // all derived from `undefined`.
        if (this.contents == null) {
            return [];
        }

        const selectedItems = (0, lodash_get_1.default)(this.contents, this.contract.itemSelector) || this.contents;
        const items = Array.isArray(selectedItems) ? selectedItems : [selectedItems];

        return items.map((item) => {
            const scrapedItem = {};
            for (const [name, options] of Object.entries(this.contract.attributes)) {
                if (options.itemSelector === undefined) {
                    scrapedItem[name] = this.mapElementToProperty(item, options);
                    continue;
                }

                const selectedChildren = (0, lodash_get_1.default)(item, options.itemSelector) || [];
                // Normalise the same way as top-level items so a lone child
                // object is scraped instead of failing on a missing `.forEach`.
                const childElements = Array.isArray(selectedChildren) ? selectedChildren : [selectedChildren];

                scrapedItem[name] = childElements.map((childElement) => {
                    const childValues = {};
                    for (const [childName, childOptions] of Object.entries(options.attributes)) {
                        childValues[childName] = this.mapElementToProperty(childElement, childOptions);
                    }
                    return childValues;
                });
            }
            return scrapedItem;
        });
    }
}
exports.default = ScriptTagProvider;