// contract-scraper
// Version: (not given in this listing)
// A customisable data scraper for the web based on JSON contracts
// 103 lines (102 loc) • 4.11 kB
// JavaScript
;
/**
 * Module-interop helper: exposes property `k` of `m` on `o`, under the name
 * `k2` when given and `k` otherwise. An already-installed helper on `this`
 * is reused if present.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // No Object.create available: fall back to a plain value copy.
        return function (o, m, k, k2) {
            if (k2 === undefined) {
                k2 = k;
            }
            o[k2] = m[k];
        };
    }
    return function (o, m, k, k2) {
        var exportedName = k2 === undefined ? k : k2;
        var descriptor = Object.getOwnPropertyDescriptor(m, k);
        // The source descriptor is reused only when it is an accessor on an
        // `__esModule` object, or a data property that is neither writable
        // nor configurable. Every other case gets a forwarding getter.
        var needsForwardingGetter = !descriptor ||
            ("get" in descriptor ? !m.__esModule : descriptor.writable || descriptor.configurable);
        if (needsForwardingGetter) {
            descriptor = {
                enumerable: true,
                get: function () {
                    return m[k];
                }
            };
        }
        Object.defineProperty(o, exportedName, descriptor);
    };
})();
/**
 * Module-interop helper: stores `v` as the `default` property of `o`.
 * An already-installed helper on `this` is reused if present.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (o, v) {
            var defaultDescriptor = { enumerable: true, value: v };
            Object.defineProperty(o, "default", defaultDescriptor);
        };
    }
    // No Object.create available: plain assignment.
    return function (o, v) {
        o["default"] = v;
    };
})();
/**
 * Module-interop helper: returns `mod` unchanged when it is flagged
 * `__esModule`; otherwise builds a namespace object that re-exposes every own
 * enumerable key of `mod` except "default" and sets `default` to `mod` itself.
 * An already-installed helper on `this` is reused if present.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    var namespace = {};
    if (mod != null) {
        for (var key in mod) {
            // Skip "default" (set below) and anything inherited.
            if (key === "default" || !Object.prototype.hasOwnProperty.call(mod, key)) {
                continue;
            }
            __createBinding(namespace, mod, key);
        }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
// Flag this module's exports object with `__esModule: true`.
Object.defineProperty(exports, "__esModule", { value: true });
// Load cheerio through __importStar so it is available as a namespace object.
const cheerio = __importStar(require("cheerio"));
/**
 * Extracts structured items from an HTML page according to a contract.
 *
 * The contract supplies an `itemSelector` for the repeated items and an
 * `attributes` map describing how each property of an item is read.
 */
class HTMLProvider {
    /**
     * @param {object} page - Page to scrape; `page.contents` is parsed with
     *   cheerio and `page.url` is handed to attribute handlers.
     * @param {object} contract - Provides `itemSelector` and `attributes`.
     * @param {object} attributes - Map from attribute type name to a handler
     *   called as `handler(value, pageUrl)`.
     */
    constructor(page, contract, attributes) {
        this.page = page;
        this.contract = contract;
        this.attributes = attributes;
        this.$ = cheerio.load(this.page.contents);
    }

    /**
     * Wraps `item` itself, or the matches of `selector` searched with `item`
     * as context when a selector is given.
     *
     * @param {*} item - Element (or selection) to wrap or search within.
     * @param {string} [selector] - Optional selector evaluated against `item`.
     * @returns {*} The selection returned by `this.$`.
     */
    getItemOrParentElement(item, selector) {
        return selector === undefined ? this.$(item) : this.$(selector, item);
    }

    /**
     * Reads a value from an element.
     *
     * @param {*} element - Element or selection to read from.
     * @param {string} [attribute] - When given, the attribute to read.
     * @param {boolean} [raw] - When strictly `true` (and no attribute is
     *   given), return the element's HTML instead of its text.
     * @returns {string|null} The trimmed attribute value (`null` when the
     *   attribute is missing or empty), the result of `.html()`, or the
     *   trimmed text.
     */
    getElementValue(element, attribute, raw) {
        const wrapped = this.$(element);
        if (attribute !== undefined) {
            const attributeValue = wrapped.attr(attribute);
            // Missing and empty attributes are both reported as null.
            return attributeValue ? attributeValue.trim() : null;
        }
        return raw === true ? wrapped.html() : wrapped.text().trim();
    }

    /**
     * Reads the element's data entry `name` and, when both `name` and `key`
     * are given, returns the `key` member of that entry.
     *
     * @param {*} element - Element or selection to read from.
     * @param {{name: (string|undefined), key: (string|undefined)}} data
     * @returns {*} The data entry, the requested member of it, or `null` when
     *   a member was requested but the entry is null/undefined.
     */
    getElementDataAttributeKeyValue(element, { name, key }) {
        const value = this.$(element).data(name);
        if (name === undefined || key === undefined) {
            return value;
        }
        // Member access on null/undefined would throw; report it as null.
        if (value === undefined || value === null) {
            return null;
        }
        return value[key];
    }

    /**
     * Resolves one contract property against an element.
     *
     * @param {*} item - Element the property is read from.
     * @param {object} options - Property description with optional `type`,
     *   `selector`, `attribute`, `data` and `raw`.
     * @returns {*} The data-attribute value when `options.data` is set,
     *   otherwise the result of the handler registered for `options.type`.
     * @throws {Error} When no callable handler is registered for `type`.
     */
    mapElementToProperty(item, options) {
        const { type, selector, attribute, data, raw } = options;
        const element = this.getItemOrParentElement(item, selector);
        if (data !== undefined) {
            // TODO parse by attribute type
            return this.getElementDataAttributeKeyValue(element, data);
        }
        const getValueAsAttribute = this.attributes[type];
        if (typeof getValueAsAttribute !== 'function') {
            throw new Error(`The attribute type ${type} isn't defined, did you pass it to the scraper?`);
        }
        // Read the element only now: data-backed properties never need it.
        const value = this.getElementValue(element, attribute, raw);
        return getValueAsAttribute(value, this.page.url);
    }

    /**
     * Scrapes every element matching the contract's `itemSelector`.
     *
     * A contract attribute that has its own `itemSelector` yields an array
     * with one object per matching child element, built from that
     * attribute's nested `attributes`. Every other attribute yields a single
     * mapped value.
     *
     * @returns {object[]} One object per matched item, keyed by attribute name.
     */
    getScrapedItems() {
        const elements = this.$(this.contract.itemSelector).toArray();
        return elements.map((element) => {
            const scrapedItem = {};
            for (const [name, options] of Object.entries(this.contract.attributes)) {
                if (options.itemSelector === undefined) {
                    scrapedItem[name] = this.mapElementToProperty(element, options);
                    continue;
                }
                const childElements = this.$(options.itemSelector, element).toArray();
                scrapedItem[name] = childElements.map((childElement) => {
                    const childValues = {};
                    for (const [childName, childOptions] of Object.entries(options.attributes)) {
                        childValues[childName] = this.mapElementToProperty(childElement, childOptions);
                    }
                    return childValues;
                });
            }
            return scrapedItem;
        });
    }
}
// Expose HTMLProvider as this module's default export.
exports.default = HTMLProvider;