contract-scraper
Version:
A customisable data scraper for the web based on JSON contracts
134 lines (133 loc) • 5.59 kB
JavaScript
"use strict";
/**
 * Compiler-emitted helper: re-exports property `k` of module `m` on object `o`
 * under the name `k2` (defaults to `k`).
 *
 * An already-installed `this.__createBinding` takes precedence. Otherwise, when
 * `Object.create` exists, the property is installed with `Object.defineProperty`;
 * if not, a plain value copy is used.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? function (o, m, k, k2) {
    var exportName = k2 === undefined ? k : k2;
    var existing = Object.getOwnPropertyDescriptor(m, k);
    var needsGetter;
    if (!existing) {
        // No own property on the source: fall back to a forwarding getter.
        needsGetter = true;
    } else if ("get" in existing) {
        // Accessor property: reuse it only when the source is flagged __esModule.
        needsGetter = !m.__esModule;
    } else {
        // Data property: it may change (writable/configurable), so read it lazily.
        needsGetter = existing.writable || existing.configurable;
    }
    var descriptor = needsGetter
        ? { enumerable: true, get: function () { return m[k]; } }
        : existing;
    Object.defineProperty(o, exportName, descriptor);
} : function (o, m, k, k2) {
    o[k2 === undefined ? k : k2] = m[k];
});
/**
 * Compiler-emitted helper: stores `v` as the `default` export of namespace
 * object `o`.
 *
 * An already-installed `this.__setModuleDefault` takes precedence. Otherwise the
 * implementation is picked once: `Object.defineProperty` (enumerable value) when
 * `Object.create` exists, plain assignment if not.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (o, v) {
            Object.defineProperty(o, "default", { enumerable: true, value: v });
        };
    }
    return function (o, v) {
        o["default"] = v;
    };
})();
/**
 * Compiler-emitted helper behind `import * as ns from "..."`.
 *
 * A module flagged `__esModule` is returned untouched. Anything else is wrapped
 * in a fresh namespace object: every own enumerable key except "default" is
 * bound through `__createBinding`, and the module itself becomes the `default`
 * export through `__setModuleDefault`.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    var namespace = {};
    if (mod != null) {
        for (var key in mod) {
            if (key === "default") {
                continue;
            }
            if (!Object.prototype.hasOwnProperty.call(mod, key)) {
                continue;
            }
            __createBinding(namespace, mod, key);
        }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
/**
 * Compiler-emitted helper that drives a generator function as an async function.
 *
 * An already-installed `this.__awaiter` takes precedence over this definition.
 *
 * @param thisArg    `this` value the generator function is applied with.
 * @param _arguments Arguments passed to the generator function (`[]` when falsy).
 * @param P          Promise constructor to use; falls back to the global `Promise` when falsy.
 * @param generator  Generator function whose yielded values are awaited.
 * @returns A `P` promise resolved with the generator's return value, or rejected
 *          with any error thrown while stepping the generator.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap a yielded value in a P promise unless it already is an instance of P.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
// `P` is defaulted here, before any call to adopt() can happen.
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the fulfilled value; a throw from the generator rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Throw the rejection reason into the generator so its own try/catch can observe it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finish when the generator is done; otherwise wait on the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator (replacing the `generator` function reference with the iterator) and start it.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
/**
 * Compiler-emitted helper behind `import x from "..."`.
 *
 * A module flagged `__esModule` is returned as-is; any other value is wrapped so
 * that it is reachable as the `default` property. An already-installed
 * `this.__importDefault` takes precedence.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const background_image_1 = __importDefault(require("./src/attribute/background-image"));
const link_1 = __importDefault(require("./src/attribute/link"));
const number_1 = __importDefault(require("./src/attribute/number"));
const size_1 = __importDefault(require("./src/attribute/size"));
const text_1 = __importDefault(require("./src/attribute/text"));
const puppeteer_1 = __importDefault(require("./src/fetcher/puppeteer"));
const html_1 = __importDefault(require("./src/provider/html"));
const script_tag_1 = __importDefault(require("./src/provider/script-tag"));
const contract_schema_1 = __importDefault(require("./src/contract-schema"));
const request_1 = __importDefault(require("./src/fetcher/request"));
const cheerio = __importStar(require("cheerio"));
/**
 * Scrapes a web page according to a contract object.
 *
 * The contract selects the fetcher (`contract.puppeteer === true` uses the
 * puppeteer fetcher, anything else the request fetcher) and the provider
 * (`contract.scriptTagSelector` truthy uses the script-tag provider, otherwise
 * the HTML provider).
 */
class Scraper {
    /**
     * @param url              Address of the page to scrape; validated with `new URL(url)`.
     * @param contract         Contract describing what to scrape; validated against the contract schema.
     * @param attributes       Extra attribute handlers keyed by name; entries override the built-in ones.
     * @param puppeteerOptions Passed through unchanged to the puppeteer fetcher.
     */
    constructor(url, contract, attributes = {}, puppeteerOptions) {
        this.defaultAttributes = {
            'background-image': background_image_1.default,
            link: link_1.default,
            number: number_1.default,
            size: size_1.default,
            text: text_1.default,
        };
        this.url = url;
        this.contract = contract;
        this.attributes = attributes;
        this.puppeteerOptions = puppeteerOptions;
    }

    /**
     * Validates the URL and contract, fetches the page and returns the scraped items.
     *
     * Validation failures are thrown synchronously (not as a rejected promise);
     * see {@link Scraper#getValidatedAttributes} for what is thrown.
     *
     * @returns A promise for whatever the selected provider's `getScrapedItems()` returns.
     */
    scrapePage() {
        const attributes = this.getValidatedAttributes();
        const fetcher = this.getFetcher();
        return fetcher.getPage().then((page) => this.getScrapedItems(page, attributes));
    }

    /**
     * Validates the URL and contract, fetches the page and exposes it together
     * with a cheerio instance loaded from `page.contents`.
     *
     * Unlike `scrapePage`, validation failures surface as a rejected promise
     * because the whole body runs inside the async wrapper.
     *
     * @returns A promise for `{ page, $ }`.
     */
    getPageContents() {
        return __awaiter(this, void 0, void 0, function* () {
            this.getValidatedAttributes();
            const fetcher = this.getFetcher();
            const page = yield fetcher.getPage();
            return {
                page,
                $: cheerio.load(page.contents),
            };
        });
    }

    /**
     * Shared validation preamble for `scrapePage` and `getPageContents`.
     *
     * The contract is checked first, but an invalid URL is reported first.
     *
     * @returns The merged attribute map used for the validation.
     * @throws {Error} When `this.url` cannot be parsed by `new URL`.
     * @throws The contract validation message: a plain string when the contract
     *         is null/undefined, otherwise the `error` value produced by the schema.
     */
    getValidatedAttributes() {
        const attributes = this.getAttributes();
        const { message } = this.contractIsValid(attributes);
        if (!this.urlIsValid()) {
            throw Error(`The URL "${this.url}" you have provided is invalid`);
        }
        if (message) {
            // Thrown as-is (may be a string) to keep the value callers already receive.
            throw message;
        }
        return attributes;
    }

    /**
     * Runs the provider chosen for this contract over a fetched page.
     *
     * @param page       Page object produced by a fetcher.
     * @param attributes Attribute map handed to the provider.
     * @returns The provider's `getScrapedItems()` result.
     */
    getScrapedItems(page, attributes) {
        return this.getProvider(page, attributes).getScrapedItems();
    }

    /**
     * Builds the effective attribute map: built-in attributes overlaid with the
     * caller-supplied ones.
     *
     * A fresh object is returned on every call so that `this.defaultAttributes`
     * is never modified by the merge.
     *
     * @returns A new object containing defaults plus overrides.
     */
    getAttributes() {
        return Object.assign({}, this.defaultAttributes, this.attributes);
    }

    /**
     * @returns `true` when `this.url` is accepted by the `URL` constructor, `false` otherwise.
     */
    urlIsValid() {
        try {
            new URL(this.url);
            return true;
        }
        catch (e) {
            // `new URL` throws on unparsable input; that is the "invalid" signal here.
            return false;
        }
    }

    /**
     * Validates `this.contract` against the schema built for the given attribute names.
     *
     * @param attributes Attribute map whose keys are passed to the schema builder.
     * @returns `{ message }` where `message` is a fixed string for a null/undefined
     *          contract, otherwise the `error` field of the schema's `validate()`
     *          result (falsy when the contract passes).
     */
    contractIsValid(attributes) {
        if (this.contract === null || this.contract === undefined) {
            return {
                message: 'Your contract is invalid, please check the specifications',
            };
        }
        // `(0, fn)(...)` calls the imported function without binding `this` to the module object.
        const schema = (0, contract_schema_1.default)(Object.keys(attributes));
        const { error } = schema.validate(this.contract);
        return { message: error };
    }

    /**
     * Picks the fetcher for this contract.
     *
     * @returns A puppeteer fetcher when `contract.puppeteer === true` (given the URL,
     *          `contract.waitForPageLoadSelector` and the puppeteer options),
     *          otherwise a request fetcher for the URL.
     */
    getFetcher() {
        if (this.contract.puppeteer === true) {
            return new puppeteer_1.default(this.url, this.contract.waitForPageLoadSelector, this.puppeteerOptions);
        }
        return new request_1.default(this.url);
    }

    /**
     * Picks the provider for this contract.
     *
     * @param page       Page object produced by a fetcher.
     * @param attributes Attribute map handed to the provider.
     * @returns A script-tag provider when `contract.scriptTagSelector` is truthy,
     *          otherwise an HTML provider.
     */
    getProvider(page, attributes) {
        if (this.contract.scriptTagSelector) {
            return new script_tag_1.default(page, this.contract, attributes);
        }
        return new html_1.default(page, this.contract, attributes);
    }
}
exports.default = Scraper;