ayakashi
Version:
The next generation web scraping framework
206 lines (205 loc) • 9.79 kB
JavaScript
;
/**
 * Downlevel async helper: drives a generator to completion, treating every
 * yielded value as something to wait on, and exposes the outcome as a promise.
 * An `__awaiter` already present on `this` is reused instead of defining a new one.
 *
 * @param thisArg - `this` value the generator function is invoked with.
 * @param _arguments - Arguments for the generator function (defaults to none).
 * @param P - Promise constructor to use; falls back to the global `Promise`.
 * @param generator - Generator function whose `yield`s act as await points.
 * @returns A promise settled with the generator's return value, or rejected
 *          with whatever it throws.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    if (!P) {
        P = Promise;
    }
    // Wrap plain values so every yielded value can be chained with `.then`.
    const adopt = (value) => {
        if (value instanceof P) {
            return value;
        }
        return new P((settle) => {
            settle(value);
        });
    };
    return new P((resolve, reject) => {
        // Either finish with the generator's return value or wait on the yielded one.
        const step = (result) => {
            if (result.done) {
                resolve(result.value);
                return;
            }
            adopt(result.value).then(onFulfilled, onRejected);
        };
        // Resume the generator with the awaited value.
        const onFulfilled = (value) => {
            try {
                step(generator.next(value));
            }
            catch (e) {
                reject(e);
            }
        };
        // Re-throw the rejection inside the generator so its try/catch can see it.
        const onRejected = (reason) => {
            try {
                step(generator["throw"](reason));
            }
            catch (e) {
                reject(e);
            }
        };
        generator = generator.apply(thisArg, _arguments || []);
        step(generator.next());
    });
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.attachExtract = void 0;
const opLog_1 = require("../../opLog/opLog");
const util_1 = require("util");
/**
 * Adds the `extract`, `extractFirst` and `extractLast` methods to an ayakashi instance.
 *
 * All three methods resolve `propId` through `this.prop`, trigger the prop and,
 * when it has matches, delegate to `recursiveExtract`:
 * - `extract` resolves to an array with one value per match (`[]` when nothing matched);
 * - `extractFirst` resolves to the value of the first match (`null` when nothing matched);
 * - `extractLast` resolves to the value of the last match (`null` when nothing matched).
 * Each rejects with an `Error` when `this.prop(propId)` returns a falsy value.
 *
 * @param ayakashiInstance - Instance that receives the methods; it is also the
 *        instance handed to `recursiveExtract`.
 */
function attachExtract(ayakashiInstance) {
    const unwrapAll = (wrapped) => wrapped.map(({ result }) => result);
    const unwrapSingle = (wrapped) => wrapped[0].result;
    const emptyList = () => [];
    const nothing = () => null;
    // Builds one public method; the variants only differ in their error message,
    // the match pointer, the value for "no matches" and how results are unwrapped.
    const makeExtractMethod = (invalidPropMessage, matchPointer, emptyValue, unwrap) => {
        return function (propId, extractable = "text") {
            return __awaiter(this, void 0, void 0, function* () {
                const targetProp = this.prop(propId);
                if (!targetProp) {
                    throw new Error(invalidPropMessage);
                }
                const found = yield targetProp.trigger();
                if (found === 0) {
                    return emptyValue();
                }
                const wrapped = yield recursiveExtract(ayakashiInstance, extractable, targetProp, matchPointer);
                return unwrap(wrapped);
            });
        };
    };
    ayakashiInstance.extract = makeExtractMethod(`<extract> needs a valid prop`, undefined, emptyList, unwrapAll);
    ayakashiInstance.extractFirst = makeExtractMethod(`<extractFirst> needs a valid prop`, 1, nothing, unwrapSingle);
    ayakashiInstance.extractLast = makeExtractMethod(`<extractLast> needs a valid prop`, -1, nothing, unwrapSingle);
}
exports.attachExtract = attachExtract;
/**
 * Extracts a value from the matches of `prop` and resolves to one
 * `{ result, isDefault }` wrapper per processed match.
 *
 * The type of `extractable` selects the strategy:
 * - a string that is a key of `ayakashiInstance.extractors`: that extractor runs on
 *   every match; its default (or `""` when the default is `undefined`) is used when
 *   the extracted value is invalid or `undefined`;
 * - any other string: read as a property/attribute name. `data-*` names are resolved
 *   through `dataset` and the literal attribute only, so `data-id` never returns the
 *   element's own `id`;
 * - an array: `extractable[0]` is extracted and `extractable[1]` (when present)
 *   replaces every defaulted result;
 * - a function: called as `fn(match, index)` for every match;
 * - a RegExp: the first regex match against `textContent`, or `""`.
 *
 * NOTE(review): the callbacks handed to `ayakashiInstance.evaluate` read
 * `this.propTable` / `this.extractors`, so they presumably execute in another
 * context (the page). They are therefore kept fully self-contained and the
 * match-pointer narrowing is repeated in each of them on purpose.
 *
 * @param ayakashiInstance - Provides `extractors` and `evaluate`.
 * @param extractable - String, array, function or RegExp describing what to extract.
 * @param prop - Prop whose `id` identifies the matches to read.
 * @param [matchPointer] - `1` processes only the first match, `-1` only the last
 *        one; any other value processes all matches.
 * @returns Promise of an array of `{ result, isDefault }` objects.
 * @throws {Error} "Invalid extractable" for any other kind of `extractable`
 *         (after logging a deprecation warning when it is a non-null object).
 */
function recursiveExtract(ayakashiInstance, extractable, prop, matchPointer) {
    return __awaiter(this, void 0, void 0, function* () {
        const opLog = opLog_1.getOpLog();
        if (typeof extractable === "string") {
            if (extractable in ayakashiInstance.extractors) {
                // Calling the registered extractor first mirrors the original flow;
                // the actual extraction happens inside the evaluated callback.
                yield ayakashiInstance.extractors[extractable]();
                return ayakashiInstance.evaluate(function (scopedPropId, scopedExtractorName, pointer) {
                    //@ts-ignore
                    const extractor = this.extractors[scopedExtractorName]();
                    let matches = this.propTable[scopedPropId].matches;
                    if (pointer === 1) {
                        matches = [matches[0]];
                    }
                    if (pointer === -1) {
                        matches = [matches[matches.length - 1]];
                    }
                    return matches.map(function (match) {
                        const result = extractor.extract(match);
                        if (extractor.isValid(result) && result !== undefined) {
                            return { result: result, isDefault: false };
                        }
                        let def = extractor.useDefault();
                        if (def === undefined) {
                            def = "";
                        }
                        return { result: def, isDefault: true };
                    });
                }, prop.id, extractable, matchPointer);
            }
            else {
                return ayakashiInstance.evaluate(function (scopedPropId, attr, pointer) {
                    let matches = this.propTable[scopedPropId].matches;
                    if (pointer === 1) {
                        matches = [matches[0]];
                    }
                    if (pointer === -1) {
                        matches = [matches[matches.length - 1]];
                    }
                    // Only names that really start with "data-" are data attributes;
                    // a name that merely contains "data-" is an ordinary attribute.
                    const isDataAttr = attr.startsWith("data-");
                    // Attribute name -> dataset key: drop the prefix and turn every
                    // "-x" (dash + lowercase ASCII letter) into "X". Dashes before any
                    // other character stay, e.g. "data-item-2" -> "item-2".
                    const datasetKey = isDataAttr
                        ? attr.slice("data-".length).replace(/-([a-z])/g, function (_dashed, letter) {
                            return letter.toUpperCase();
                        })
                        : attr;
                    return matches.map(function (match) {
                        try {
                            if (isDataAttr) {
                                // Never consult element properties here: the dataset key
                                // ("id", "title", "value", ...) may collide with a real property.
                                if (match.dataset && match.dataset[datasetKey]) {
                                    return { result: match.dataset[datasetKey], isDefault: false };
                                }
                                if (match.getAttribute(attr)) {
                                    return { result: match.getAttribute(attr), isDefault: false };
                                }
                                return { result: "", isDefault: true };
                            }
                            //@ts-ignore
                            if (match[attr]) {
                                //@ts-ignore
                                return { result: match[attr], isDefault: false };
                            }
                            else if (match.getAttribute(attr)) {
                                return { result: match.getAttribute(attr), isDefault: false };
                            }
                            else if (match.dataset && match.dataset[attr]) {
                                return { result: match.dataset[attr], isDefault: false };
                            }
                            else {
                                return { result: "", isDefault: true };
                            }
                        }
                        catch (_e) {
                            // Best effort: a match that cannot be inspected yields the default.
                            return { result: "", isDefault: true };
                        }
                    });
                }, prop.id, extractable, matchPointer);
            }
        }
        else if (Array.isArray(extractable)) {
            const matchResults = yield recursiveExtract(ayakashiInstance, extractable[0], prop, matchPointer);
            return matchResults.map(function (matchResult) {
                // Substitute the user-supplied default only when one was given.
                if (matchResult.isDefault && extractable.length > 1) {
                    return { result: extractable[1], isDefault: true };
                }
                return { result: matchResult.result, isDefault: false };
            });
        }
        else if (typeof extractable === "function") {
            return ayakashiInstance.evaluate(function (scopedPropId, fn, pointer) {
                let matches = this.propTable[scopedPropId].matches;
                if (pointer === 1) {
                    matches = [matches[0]];
                }
                if (pointer === -1) {
                    matches = [matches[matches.length - 1]];
                }
                return matches.map(function (match, index) {
                    return { result: fn(match, index), isDefault: false };
                });
            }, prop.id, extractable, matchPointer);
        }
        // `util.isRegExp` is deprecated and missing from recent Node.js releases;
        // `util.types.isRegExp` is the supported replacement.
        else if (util_1.types.isRegExp(extractable)) {
            return ayakashiInstance.evaluate(function (scopedPropId, regex, pointer) {
                let matches = this.propTable[scopedPropId].matches;
                if (pointer === 1) {
                    matches = [matches[0]];
                }
                if (pointer === -1) {
                    matches = [matches[matches.length - 1]];
                }
                return matches.map(function (match) {
                    let regexResult = "";
                    if (match.textContent) {
                        const regexMatch = match.textContent.match(regex);
                        if (regexMatch && regexMatch[0]) {
                            regexResult = regexMatch[0];
                        }
                    }
                    return { result: regexResult, isDefault: regexResult === "" };
                });
            }, prop.id, extractable, matchPointer);
        }
        else {
            if (typeof extractable === "object" && extractable !== null) {
                opLog.warn("Nested or multiple extractions per prop are deprecated");
                opLog.warn("Learn more here: https://ayakashi-io.github.io/docs/guide/data-extraction.html");
            }
            throw new Error("Invalid extractable");
        }
    });
}