UNPKG

contract-scraper

Version:

A customisable data scraper for the web based on JSON contracts

103 lines (102 loc) 4.11 kB
"use strict";
// ---------------------------------------------------------------------------
// TypeScript-emitted module interop helpers (generated code, kept verbatim
// apart from formatting).
// ---------------------------------------------------------------------------
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) return mod;
    var result = {};
    if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
    __setModuleDefault(result, mod);
    return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
const cheerio = __importStar(require("cheerio"));

/**
 * Scrapes items out of an HTML page by applying the selectors described in a
 * contract, and converts each scraped value through a caller-supplied
 * attribute-type function.
 */
class HTMLProvider {
    /**
     * @param {object} page - Page to scrape; `page.contents` is loaded into
     *   cheerio and `page.url` is forwarded to attribute-type functions.
     * @param {object} contract - Scraping contract; `contract.itemSelector`
     *   and `contract.attributes` are read by `getScrapedItems`.
     * @param {object} attributes - Lookup of attribute-type name to a function
     *   called as `fn(value, pageUrl)`.
     */
    constructor(page, contract, attributes) {
        this.page = page;
        this.contract = contract;
        this.attributes = attributes;
        this.$ = cheerio.load(this.page.contents);
    }

    /**
     * Resolves the element a property should be read from.
     *
     * @param {*} item - Element acting as the search context.
     * @param {string} [selector] - Selector evaluated within `item`.
     * @returns {*} Cheerio selection for `selector` inside `item`, or a
     *   selection wrapping `item` itself when no selector is given.
     */
    getItemOrParentElement(item, selector) {
        if (selector !== undefined) {
            return this.$(selector, item);
        }
        return this.$(item);
    }

    /**
     * Reads a value from an element.
     *
     * @param {*} element - Element (or selection) to read from.
     * @param {string} [attribute] - When given, the attribute to read.
     * @param {boolean} [raw] - When exactly `true` (and no attribute is
     *   given), return the element's inner HTML instead of its text.
     * @returns {?string} The trimmed attribute value (`null` when the value is
     *   missing or empty), the result of `.html()`, or the trimmed text.
     */
    getElementValue(element, attribute, raw) {
        const wrapped = this.$(element);
        if (attribute !== undefined) {
            const value = wrapped.attr(attribute);
            // Falsy covers both a missing attribute and an empty string.
            return value ? value.trim() : null;
        }
        if (raw === true) {
            return wrapped.html();
        }
        return wrapped.text().trim();
    }

    /**
     * Reads a `data-*` value from an element, optionally drilling into one
     * key of that value.
     *
     * @param {*} element - Element (or selection) to read from.
     * @param {{name: (string|undefined), key: (string|undefined)}} data -
     *   `name` is passed to cheerio's `.data()`; `key`, when both are given,
     *   selects a property of the returned value.
     * @returns {*} `value[key]` when both `name` and `key` are given (`null`
     *   if the data value is `null`/`undefined`); otherwise whatever
     *   `.data(name)` returned.
     */
    getElementDataAttributeKeyValue(element, { name, key }) {
        const value = this.$(element).data(name);
        if (name === undefined || key === undefined) {
            return value;
        }
        // Property access only throws on null/undefined, so guard explicitly
        // rather than using a blanket try/catch as control flow.
        if (value === undefined || value === null) {
            return null;
        }
        return value[key];
    }

    /**
     * Produces the final value of one contract property for an item.
     *
     * @param {*} item - Element the property is scraped from.
     * @param {object} options - Property options: `type`, `selector`,
     *   `attribute`, `data` and `raw`.
     * @returns {*} The data-attribute value when `options.data` is set;
     *   otherwise the result of the attribute-type function for `type`.
     * @throws {Error} When no function is registered for `options.type`.
     */
    mapElementToProperty(item, options) {
        const { type, selector, attribute, data, raw } = options;
        const element = this.getItemOrParentElement(item, selector);
        if (data !== undefined) {
            // TODO parse by attribute type
            return this.getElementDataAttributeKeyValue(element, data);
        }
        // Only needed on the non-data path, so read it after the data check.
        const value = this.getElementValue(element, attribute, raw);
        const getValueAsAttribute = this.attributes[type];
        if (getValueAsAttribute === undefined) {
            throw new Error(`The attribute type ${type} isn't defined, did you pass it to the scraper?`);
        }
        return getValueAsAttribute(value, this.page.url);
    }

    /**
     * Scrapes every element matching the contract's `itemSelector`.
     *
     * A contract attribute that has its own `itemSelector` is treated as a
     * nested list: each matching child element becomes an object built from
     * that attribute's own `attributes`.
     *
     * @returns {object[]} One object per matched element, keyed by contract
     *   attribute name.
     */
    getScrapedItems() {
        const { itemSelector, attributes } = this.contract;
        return this.$(itemSelector).toArray().map((element) => {
            const scrapedItem = {};
            for (const [name, options] of Object.entries(attributes)) {
                if (options.itemSelector === undefined) {
                    scrapedItem[name] = this.mapElementToProperty(element, options);
                    continue;
                }
                const childElements = this.$(options.itemSelector, element).toArray();
                scrapedItem[name] = childElements.map((childElement) => {
                    const childValues = {};
                    for (const [childName, childOptions] of Object.entries(options.attributes)) {
                        childValues[childName] = this.mapElementToProperty(childElement, childOptions);
                    }
                    return childValues;
                });
            }
            return scrapedItem;
        });
    }
}
exports.default = HTMLProvider;