contract-scraper
Version:
A customisable data scraper for the web based on JSON contracts
86 lines (85 loc) • 3.71 kB
JavaScript
;
/**
 * Module interop helper: exposes `source[key]` on `target` under `alias`
 * (or under `key` when no alias is given).
 *
 * Reuses an implementation already present on `this` when there is one.
 * Otherwise, when `Object.create` exists, the property is defined through a
 * descriptor: the source's own descriptor is kept as-is only when it is an
 * accessor on an `__esModule` object or a data property that is neither
 * writable nor configurable; in every other case an enumerable getter that
 * reads `source[key]` on each access is installed instead. Without
 * `Object.create` the value is copied with a plain assignment.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Legacy path: one-off value copy.
        return function (target, source, key, alias) {
            target[alias === undefined ? key : alias] = source[key];
        };
    }
    return function (target, source, key, alias) {
        var exportName = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var needsLiveGetter;
        if (!descriptor) {
            needsLiveGetter = true;
        }
        else if ("get" in descriptor) {
            needsLiveGetter = !source.__esModule;
        }
        else {
            needsLiveGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsLiveGetter) {
            descriptor = {
                enumerable: true,
                get: function () {
                    return source[key];
                }
            };
        }
        Object.defineProperty(target, exportName, descriptor);
    };
})();
/**
 * Module interop helper: stores `value` as the `default` member of `namespace`.
 *
 * Reuses an implementation already present on `this` when there is one.
 * When `Object.create` exists the member is defined as an enumerable property
 * through `Object.defineProperty` (so it is neither writable nor configurable);
 * otherwise it is set with a plain assignment.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (!Object.create
    ? function (namespace, value) {
        namespace["default"] = value;
    }
    : function (namespace, value) {
        var descriptor = { value: value, enumerable: true };
        Object.defineProperty(namespace, "default", descriptor);
    });
/**
 * Module interop helper: returns a namespace-style object for `mod`.
 *
 * Reuses an implementation already present on `this` when there is one.
 * A truthy `mod` flagged with `__esModule` is returned untouched. Otherwise a
 * fresh object is built: every own enumerable key of `mod` except "default"
 * is bound onto it via `__createBinding`, and `mod` itself becomes its
 * `default` member via `__setModuleDefault`.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    var namespace = {};
    if (mod != null) {
        var ownsKey = Object.prototype.hasOwnProperty;
        for (var key in mod) {
            // "default" is always (re)defined below from the module itself.
            if (key === "default") {
                continue;
            }
            // for...in also walks the prototype chain; only own keys are re-exported.
            if (!ownsKey.call(mod, key)) {
                continue;
            }
            __createBinding(namespace, mod, key);
        }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
/**
 * Module interop helper: guarantees the result has a `default` member.
 *
 * Reuses an implementation already present on `this` when there is one.
 * A truthy `mod` flagged with `__esModule` is returned unchanged; anything
 * else is wrapped as `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const cheerio = __importStar(require("cheerio"));
const cheerio_eq_1 = __importDefault(require("cheerio-eq"));
const lodash_get_1 = __importDefault(require("lodash.get"));
/**
 * Scrapes items out of a JSON payload embedded in a page's script tag.
 *
 * On construction the page markup is loaded with cheerio, the element matched
 * by `contract.scriptTagSelector` is read, and its contents are JSON-parsed
 * once. `getScrapedItems` then maps that parsed data through the attribute
 * definitions in the contract.
 */
class ScriptTagProvider {
    /**
     * @param {{contents: string, url: string}} page Page to scrape; `contents`
     *   is handed to `cheerio.load`, `url` is passed to every attribute function.
     * @param {object} contract Scraping contract. Reads `scriptTagSelector`,
     *   the optional `itemSelector`, and `attributes` (name -> options).
     * @param {Object<string, function(*, string): *>} attributes Attribute
     *   functions keyed by type name, called as `fn(value, pageUrl)`.
     * @throws {SyntaxError} If the script tag contents are not valid JSON.
     */
    constructor(page, contract, attributes) {
        this.page = page;
        this.contract = contract;
        this.attributes = attributes;
        this.$ = cheerio.load(this.page.contents);
        this.contents = this.parseScriptTagContents();
    }

    /**
     * Resolves one attribute of an item.
     *
     * @param {*} item Object the selector path is resolved against (lodash.get).
     * @param {{type: string, selector: string}} options Attribute definition.
     * @returns {*} Whatever the attribute function registered for `type` returns.
     * @throws {Error} If no attribute function is registered for `options.type`.
     */
    mapElementToProperty(item, options) {
        const { type, selector } = options;
        const value = (0, lodash_get_1.default)(item, selector);
        const attributeType = this.attributes[type];
        if (attributeType === undefined) {
            throw new Error(`The attribute type ${type} isn't defined, did you pass it to the scraper?`);
        }
        return attributeType(value, this.page.url);
    }

    /**
     * Reads and JSON-parses the script tag selected by the contract.
     *
     * @returns {*} The parsed JSON, or `undefined` when the selector yields no
     *   (or empty) contents.
     * @throws {SyntaxError} If the contents are not valid JSON.
     */
    parseScriptTagContents() {
        const { scriptTagSelector } = this.contract;
        const contents = (0, cheerio_eq_1.default)(this.$, scriptTagSelector).html();
        if (!contents || contents.length === 0) {
            return undefined;
        }
        return JSON.parse(contents);
    }

    /**
     * Builds one scraped object per item found in the parsed script contents.
     *
     * Items are located with `contract.itemSelector`; when that yields nothing
     * the whole parsed payload is used. A single (non-array) value is treated
     * as a one-element list. Attributes that declare their own `itemSelector`
     * produce an array of child objects built from their nested `attributes`.
     *
     * @returns {Array<Object>} Scraped items; empty when no payload was parsed.
     */
    getScrapedItems() {
        // Nothing was parsed (missing/empty script tag, or a JSON `null` payload).
        // Without this guard a single phantom item made entirely of values
        // mapped from `undefined` would be returned.
        if (this.contents == null) {
            return [];
        }
        const selected = (0, lodash_get_1.default)(this.contents, this.contract.itemSelector) || this.contents;
        const items = Array.isArray(selected) ? selected : [selected];
        return items.map((item) => {
            const scrapedItem = {};
            for (const [name, options] of Object.entries(this.contract.attributes)) {
                if (options.itemSelector === undefined) {
                    scrapedItem[name] = this.mapElementToProperty(item, options);
                    continue;
                }
                const found = (0, lodash_get_1.default)(item, options.itemSelector) || [];
                // Mirror the top-level handling: a lone nested object counts as
                // a list of one instead of crashing on a missing `.forEach`.
                const childElements = Array.isArray(found) ? found : [found];
                scrapedItem[name] = childElements.map((childElement) => {
                    const childValues = {};
                    for (const [childName, childOptions] of Object.entries(options.attributes)) {
                        childValues[childName] = this.mapElementToProperty(childElement, childOptions);
                    }
                    return childValues;
                });
            }
            return scrapedItem;
        });
    }
}
// Expose the ScriptTagProvider class as this module's default export.
exports.default = ScriptTagProvider;