hammer-scrape
Version:
Unifies Cheerio and Puppeteer for the most streamlined scraping experience
219 lines • 6.79 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
const request_group_cheerio_1 = require("request-group-cheerio");
const web_scraping_engine_1 = require("../web_scraping_engine");
const core_errors_1 = require("../core_errors");
/**
* Default configuration options for the cheerio parsing core; used as the
* default `data` argument of `CheerioParsingCore.initialize`.
*
* Of these fields, `initialize` in this file reads only `header`. The other
* fields are not referenced in this file (presumably consumed elsewhere —
* TODO confirm against the other parsing cores).
*/
exports.CHEERIO_PARSING_CORE_DEFAULT = {
xml: false,
// When defined, passed as the second argument to `CheerioRequest` (held in a
// local named `userAgent`); when `undefined`, `initialize` passes `null`.
header: undefined,
reinitialize: false,
html: '',
};
/**
 * Normalises an optional string coming back from cheerio.
 *
 * @param {*} value - Value returned by `.attr()` / `.html()`; may be
 *   `undefined` or `null` when the attribute or element is missing.
 * @returns {string} The trimmed string, or `''` when `value` is not a string.
 */
function trimOrEmptyString(value) {
    return typeof value === 'string' ? value.trim() : '';
}
/**
 * A basic cheerio parsing core.
 * This is very likely the fastest core for parsing that's reliable.
 *
 * Every query method throws `CoreNotInitializedError` synchronously when the
 * core has not been initialized; otherwise it returns a promise.
 */
class CheerioParsingCore extends web_scraping_engine_1.ParsingCore {
    /**
     * @param url - Forwarded to the `ParsingCore` constructor and later read
     *   back through `getUrl()` when the request is created.
     */
    constructor(url) {
        super(url);
        this.request = null;
        this.initialized = false;
    }
    /**
     * Marks the core as uninitialized.
     *
     * @returns {Promise<void>} Resolves once the flag has been cleared.
     */
    dispose() {
        return new Promise((resolve) => {
            this.initialized = false;
            resolve();
        });
    }
    /**
     * @returns The `CheerioRequest` created by `initialize`.
     * @throws {CoreNotInitializedError} When the core is not initialized.
     */
    getRequest() {
        if (!this.isInitialized()) {
            throw new core_errors_1.CoreNotInitializedError();
        }
        return this.request;
    }
    /**
     * Creates and runs the underlying `CheerioRequest`, then stores the loaded
     * page as the core. Does nothing when the core is already initialized.
     *
     * @param data - Options object; only `data.header` is read and it is passed
     *   to `CheerioRequest` as its second argument (`null` when absent).
     * @returns {Promise<void>} Resolves when the page is loaded; rejects when
     *   the request fails (previously the promise never settled in that case).
     */
    initialize(data = exports.CHEERIO_PARSING_CORE_DEFAULT) {
        return new Promise((resolve) => {
            if (this.isInitialized()) {
                resolve();
                return;
            }
            const userAgent = data && typeof data['header'] !== 'undefined' ? data.header : null;
            const request = new request_group_cheerio_1.CheerioRequest(this.getUrl(), userAgent);
            this.request = request;
            // Resolving with the chained promise makes the outer promise adopt
            // its outcome, so a failed run() surfaces as a rejection instead of
            // leaving the caller waiting forever.
            resolve(request.run().then(() => {
                this.core = request.getPage();
                this.initialized = true;
            }));
        });
    }
    /**
     * @returns {boolean} Whether `initialize` has completed and `dispose` has
     *   not been called since.
     */
    isInitialized() {
        return this.initialized;
    }
    /**
     * @param {string} querySelector - Selector passed to the cheerio root.
     * @returns {Promise<string>} Trimmed text of the first matching element.
     * @throws {CoreNotInitializedError} When the core is not initialized.
     */
    getText(querySelector) {
        if (!this.isInitialized()) {
            throw new core_errors_1.CoreNotInitializedError();
        }
        return new Promise((resolve) => {
            const text = this.raw()(querySelector)
                .first()
                .text()
                .trim();
            resolve(text);
        });
    }
    /**
     * @param {string} querySelector - Selector passed to the cheerio root.
     * @returns {Promise<string[]>} Trimmed text of every matching element, in
     *   iteration order.
     * @throws {CoreNotInitializedError} When the core is not initialized.
     */
    getTextAll(querySelector) {
        if (!this.isInitialized()) {
            throw new core_errors_1.CoreNotInitializedError();
        }
        return new Promise((resolve) => {
            const texts = [];
            const $ = this.raw();
            $(querySelector).each((index, element) => {
                texts.push($(element)
                    .text()
                    .trim());
            });
            resolve(texts);
        });
    }
    /**
     * @param {string} querySelector - Selector passed to the cheerio root.
     * @param {string} attributeName - Attribute to read.
     * @returns {Promise<string>} Trimmed attribute value of the first matching
     *   element, or `''` when the element or attribute does not exist (same
     *   fallback `getHtml` uses; previously this rejected with a TypeError).
     * @throws {CoreNotInitializedError} When the core is not initialized.
     */
    getAttribute(querySelector, attributeName) {
        if (!this.isInitialized()) {
            throw new core_errors_1.CoreNotInitializedError();
        }
        return new Promise((resolve) => {
            const value = this.raw()(querySelector)
                .first()
                .attr(attributeName);
            resolve(trimOrEmptyString(value));
        });
    }
    /**
     * @param {string} querySelector - Selector passed to the cheerio root.
     * @param {string} attributeName - Attribute to read on each element.
     * @returns {Promise<string[]>} One trimmed value per matching element;
     *   elements lacking the attribute contribute `''` instead of aborting the
     *   whole call.
     * @throws {CoreNotInitializedError} When the core is not initialized.
     */
    getAttributeAll(querySelector, attributeName) {
        if (!this.isInitialized()) {
            throw new core_errors_1.CoreNotInitializedError();
        }
        return new Promise((resolve) => {
            const values = [];
            const $ = this.raw();
            $(querySelector).each((index, element) => {
                values.push(trimOrEmptyString($(element).attr(attributeName)));
            });
            resolve(values);
        });
    }
    /**
     * @param {string} querySelector - Selector passed to the cheerio root.
     * @returns {Promise<string>} Trimmed inner HTML of the first matching
     *   element, or `''` when `.html()` yields nothing.
     * @throws {CoreNotInitializedError} When the core is not initialized.
     */
    getHtml(querySelector) {
        if (!this.isInitialized()) {
            throw new core_errors_1.CoreNotInitializedError();
        }
        return new Promise((resolve) => {
            const html = this.raw()(querySelector)
                .first()
                .html();
            resolve(html ? html.trim() : '');
        });
    }
    /**
     * @param {string} querySelector - Selector passed to the cheerio root.
     * @returns {Promise<string[]>} Trimmed inner HTML of every matching
     *   element, with `''` for elements whose `.html()` yields nothing.
     * @throws {CoreNotInitializedError} When the core is not initialized.
     */
    getHtmlAll(querySelector) {
        if (!this.isInitialized()) {
            throw new core_errors_1.CoreNotInitializedError();
        }
        return new Promise((resolve) => {
            const fragments = [];
            const $ = this.raw();
            $(querySelector).each((index, element) => {
                const html = $(element).html();
                fragments.push(html ? html.trim() : '');
            });
            resolve(fragments);
        });
    }
    /**
     * Lists the `<option>` elements found beneath the matched element(s).
     *
     * The lookup uses `.find('option')`. The previous `.find(querySelector)`
     * searched for the same selector nested inside itself, which matches
     * nothing for an ordinary `<select>` selector.
     *
     * @param {string} querySelector - Selector of the containing element
     *   (typically a `<select>`).
     * @returns {Promise<Array<{text: string, value: *}>>} Trimmed text and
     *   `.val()` of each option.
     * @throws {CoreNotInitializedError} When the core is not initialized.
     */
    getSelectOptions(querySelector) {
        if (!this.isInitialized()) {
            throw new core_errors_1.CoreNotInitializedError();
        }
        return new Promise((resolve) => {
            const selectOptions = [];
            const $ = this.raw();
            $(querySelector)
                .find('option')
                .each((index, element) => {
                    const option = $(element);
                    selectOptions.push({
                        text: option.text().trim(),
                        value: option.val(),
                    });
                });
            resolve(selectOptions);
        });
    }
    /**
     * @param {string} querySelector - Selector passed to the cheerio root.
     * @returns {Promise<boolean>} `true` when at least one element matches.
     *   Rejections from `elementCount` are propagated.
     * @throws {CoreNotInitializedError} When the core is not initialized.
     */
    elementExist(querySelector) {
        if (!this.isInitialized()) {
            throw new core_errors_1.CoreNotInitializedError();
        }
        return this.elementCount(querySelector).then((totalCount) => totalCount > 0);
    }
    /**
     * @param {string} querySelector - Selector passed to the cheerio root.
     * @returns {Promise<number>} Number of matching elements.
     * @throws {CoreNotInitializedError} When the core is not initialized.
     */
    elementCount(querySelector) {
        if (!this.isInitialized()) {
            throw new core_errors_1.CoreNotInitializedError();
        }
        return new Promise((resolve) => {
            resolve(this.raw()(querySelector).length);
        });
    }
    /**
     * @returns The page object stored by `initialize` (the result of
     *   `request.getPage()`); the other methods call it as a selector function.
     * @throws {CoreNotInitializedError} When the core is not initialized.
     */
    raw() {
        if (!this.isInitialized()) {
            throw new core_errors_1.CoreNotInitializedError();
        }
        return this.core;
    }
    /**
     * @returns {Promise<string>} Result of `.root().html()` on the loaded page.
     * @throws {CoreNotInitializedError} When the core is not initialized.
     */
    getDocumentHtml() {
        if (!this.isInitialized()) {
            throw new core_errors_1.CoreNotInitializedError();
        }
        return new Promise((resolve) => {
            const html = this.raw()
                .root()
                .html();
            resolve(html);
        });
    }
}
// Expose the class as both a named export and the module's default export.
exports.CheerioParsingCore = CheerioParsingCore;
exports.default = CheerioParsingCore;
//# sourceMappingURL=cheerio_parsing.js.map