UNPKG

hammer-scrape

Version:

Unifies Cheerio and Puppeteer for the most streamlined scraping experience

219 lines 6.79 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); const request_group_cheerio_1 = require("request-group-cheerio"); const web_scraping_engine_1 = require("../web_scraping_engine"); const core_errors_1 = require("../core_errors"); /** * A set of default configuration options to use for the cheerio parsing core */ exports.CHEERIO_PARSING_CORE_DEFAULT = { xml: false, header: undefined, reinitialize: false, html: '', }; /** * A basic cheerio parsing core. * This is very likely the fasest core for parsing that's reliable */ class CheerioParsingCore extends web_scraping_engine_1.ParsingCore { constructor(url) { super(url); this.request = null; this.initialized = false; } dispose() { return new Promise((resolve) => { this.initialized = false; resolve(); }); } getRequest() { if (this.isInitialized()) { return this.request; } else { throw new core_errors_1.CoreNotInitializedError(); } } initialize(data = exports.CHEERIO_PARSING_CORE_DEFAULT) { return new Promise((resolve) => { if (this.isInitialized()) { resolve(); } else { let userAgent = data && typeof data['header'] !== 'undefined' ? 
data.header : null; this.request = new request_group_cheerio_1.CheerioRequest(this.getUrl(), userAgent); this.request.run().then(() => { this.core = this.request.getPage(); this.initialized = true; resolve(); }); } }); } isInitialized() { return this.initialized; } getText(querySelector) { if (this.isInitialized()) { return new Promise((resolve) => { let result = this.raw()(querySelector) .first() .text() .trim(); resolve(result); }); } else { throw new core_errors_1.CoreNotInitializedError(); } } getTextAll(querySelector) { if (this.isInitialized()) { return new Promise((resolve) => { let result = []; let $ = this.raw(); $(querySelector).each((index, element) => { result.push($(element) .text() .trim()); }); resolve(result); }); } else { throw new core_errors_1.CoreNotInitializedError(); } } getAttribute(querySelector, attributeName) { if (this.isInitialized()) { return new Promise((resolve) => { let result = this.raw()(querySelector) .first() .attr(attributeName) .trim(); resolve(result); }); } else { throw new core_errors_1.CoreNotInitializedError(); } } getAttributeAll(querySelector, attributeName) { if (this.isInitialized()) { return new Promise((resolve) => { let results = []; let $ = this.raw(); $(querySelector).each((index, element) => { results.push($(element) .attr(attributeName) .trim()); }); resolve(results); }); } else { throw new core_errors_1.CoreNotInitializedError(); } } getHtml(querySelector) { if (this.isInitialized()) { return new Promise((resolve) => { let html = this.raw()(querySelector) .first() .html(); let result = html ? html.trim() : ''; resolve(result); }); } else { throw new core_errors_1.CoreNotInitializedError(); } } getHtmlAll(querySelector) { if (this.isInitialized()) { return new Promise((resolve) => { let results = []; let $ = this.raw(); $(querySelector).each((index, element) => { let html = $(element).html(); results.push(html ? 
html.trim() : ''); }); resolve(results); }); } else { throw new core_errors_1.CoreNotInitializedError(); } } getSelectOptions(querySelector) { if (this.isInitialized()) { return new Promise((resolve) => { let selectOptions = []; let $ = this.raw(); $(querySelector) .find(querySelector) .each((index, element) => { selectOptions.push({ text: $(element) .text() .trim(), value: $(element).val(), }); }); resolve(selectOptions); }); } else { throw new core_errors_1.CoreNotInitializedError(); } } elementExist(querySelector) { if (this.isInitialized()) { return new Promise((resolve) => { this.elementCount(querySelector).then((totalCount) => { resolve(totalCount > 0 ? true : false); }); }); } else { throw new core_errors_1.CoreNotInitializedError(); } } elementCount(querySelector) { if (this.isInitialized()) { return new Promise((resolve) => { let totalCount = this.raw()(querySelector).length; resolve(totalCount); }); } else { throw new core_errors_1.CoreNotInitializedError(); } } raw() { if (this.isInitialized()) { return this.core; } else { throw new core_errors_1.CoreNotInitializedError(); } } getDocumentHtml() { if (this.isInitialized()) { return new Promise((resolve) => { let html = this.raw() .root() .html(); resolve(html); }); } else { throw new core_errors_1.CoreNotInitializedError(); } } } exports.CheerioParsingCore = CheerioParsingCore; exports.default = CheerioParsingCore; //# sourceMappingURL=cheerio_parsing.js.map