UNPKG

ayakashi

Version:

The next generation web scraping framework

206 lines (205 loc) 9.79 kB
"use strict";
// TypeScript-emitted helper: drives a generator as a Promise chain so that
// `yield` inside the compiled functions below behaves like `await`.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.attachExtract = void 0;
const opLog_1 = require("../../opLog/opLog");
const util_1 = require("util");

/**
 * Installs the `extract`, `extractFirst` and `extractLast` methods on the
 * given instance.
 *
 * Each method looks the prop up with `this.prop(propId)`, triggers it, and
 * delegates the actual extraction to `recursiveExtract`.
 *
 * @param {object} ayakashiInstance - Instance to extend. It must expose
 *   `prop`, `evaluate` and `extractors`.
 */
function attachExtract(ayakashiInstance) {
    /**
     * Extracts a value from every match of the prop.
     *
     * @param {*} propId - Identifier passed to `this.prop`.
     * @param {*} [extractable="text"] - What to extract (see `recursiveExtract`).
     * @returns {Promise<Array>} One value per match; `[]` when the prop has no matches.
     * @throws {Error} When `this.prop(propId)` yields nothing.
     */
    ayakashiInstance.extract = function (propId, extractable = "text") {
        return __awaiter(this, void 0, void 0, function* () {
            const prop = this.prop(propId);
            if (!prop) {
                throw new Error(`<extract> needs a valid prop`);
            }
            const matchCount = yield prop.trigger();
            if (matchCount === 0) {
                return [];
            }
            const results = yield recursiveExtract(ayakashiInstance, extractable, prop);
            return results.map(result => result.result);
        });
    };

    /**
     * Extracts a value from the first match of the prop only.
     *
     * @param {*} propId - Identifier passed to `this.prop`.
     * @param {*} [extractable="text"] - What to extract (see `recursiveExtract`).
     * @returns {Promise<*>} The extracted value, or `null` when the prop has no matches.
     * @throws {Error} When `this.prop(propId)` yields nothing.
     */
    ayakashiInstance.extractFirst = function (propId, extractable = "text") {
        return __awaiter(this, void 0, void 0, function* () {
            const prop = this.prop(propId);
            if (!prop) {
                throw new Error(`<extractFirst> needs a valid prop`);
            }
            const matchCount = yield prop.trigger();
            if (matchCount === 0) {
                return null;
            }
            // A match pointer of 1 restricts the extraction to the first match.
            const results = yield recursiveExtract(ayakashiInstance, extractable, prop, 1);
            return results[0].result;
        });
    };

    /**
     * Extracts a value from the last match of the prop only.
     *
     * @param {*} propId - Identifier passed to `this.prop`.
     * @param {*} [extractable="text"] - What to extract (see `recursiveExtract`).
     * @returns {Promise<*>} The extracted value, or `null` when the prop has no matches.
     * @throws {Error} When `this.prop(propId)` yields nothing.
     */
    ayakashiInstance.extractLast = function (propId, extractable = "text") {
        return __awaiter(this, void 0, void 0, function* () {
            const prop = this.prop(propId);
            if (!prop) {
                throw new Error(`<extractLast> needs a valid prop`);
            }
            const matchCount = yield prop.trigger();
            if (matchCount === 0) {
                return null;
            }
            // A match pointer of -1 restricts the extraction to the last match.
            const results = yield recursiveExtract(ayakashiInstance, extractable, prop, -1);
            return results[0].result;
        });
    };
}
exports.attachExtract = attachExtract;

/**
 * Runs one extraction over the matches of `prop` and returns, per match, an
 * object of the form `{ result, isDefault }`.
 *
 * `extractable` is dispatched on its type:
 *  - string that is a key of `ayakashiInstance.extractors`: that extractor is used;
 *  - any other string: treated as a property / attribute / data-attribute name;
 *  - array `[extractable, defaultValue]`: the first item is extracted and
 *    `defaultValue` (when supplied) replaces results flagged as default;
 *  - function: called as `fn(match, index)`;
 *  - RegExp: matched against each match's `textContent`.
 *
 * NOTE(review): the callbacks handed to `ayakashiInstance.evaluate` read
 * `this.propTable` / `this.extractors`, so they must remain `function`
 * expressions. They are presumably serialized and executed in the page, which
 * is why each one is kept self-contained instead of sharing helpers - confirm
 * before refactoring.
 *
 * @param {object} ayakashiInstance - Instance exposing `evaluate` and `extractors`.
 * @param {*} extractable - See the list above.
 * @param {object} prop - Triggered prop; only `prop.id` is read here.
 * @param {number} [matchPointer] - `1` for the first match only, `-1` for the
 *   last match only; any other value processes every match.
 * @returns {Promise<Array<{result: *, isDefault: boolean}>>}
 * @throws {Error} "Invalid extractable" for unsupported `extractable` values.
 */
function recursiveExtract(ayakashiInstance, extractable, prop, matchPointer) {
    return __awaiter(this, void 0, void 0, function* () {
        const opLog = opLog_1.getOpLog();

        if (typeof extractable === "string") {
            if (extractable in ayakashiInstance.extractors) {
                yield ayakashiInstance.extractors[extractable]();
                return ayakashiInstance.evaluate(function (scopedPropId, scopedExtractorName, pointer) {
                    //@ts-ignore
                    const extractor = this.extractors[scopedExtractorName]();
                    let matches = this.propTable[scopedPropId].matches;
                    if (pointer === 1) {
                        matches = [matches[0]];
                    }
                    if (pointer === -1) {
                        matches = [matches[matches.length - 1]];
                    }
                    return matches.map(function (match) {
                        const result = extractor.extract(match);
                        if (extractor.isValid(result) && result !== undefined) {
                            return { result: result, isDefault: false };
                        }
                        // Invalid result: fall back to the extractor's own
                        // default, or to an empty string if it has none.
                        let def = extractor.useDefault();
                        if (def === undefined) {
                            def = "";
                        }
                        return { result: def, isDefault: true };
                    });
                }, prop.id, extractable, matchPointer);
            }

            return ayakashiInstance.evaluate(function (scopedPropId, attr, pointer) {
                let matches = this.propTable[scopedPropId].matches;
                if (pointer === 1) {
                    matches = [matches[0]];
                }
                if (pointer === -1) {
                    matches = [matches[matches.length - 1]];
                }
                // Converts "data-foo-bar" to the camel-cased `dataset` key "fooBar".
                function formatDataAttribute(dataAttr) {
                    const myDataAttr = dataAttr.replace("data-", "");
                    const formatted = [];
                    let upcased = false;
                    for (let i = 0; i < myDataAttr.length; i += 1) {
                        if (upcased) {
                            // This character was already emitted in upper case.
                            upcased = false;
                            continue;
                        }
                        if (myDataAttr[i] === "-" && (!myDataAttr[i + 1] || myDataAttr[i + 1] !== "-")) {
                            if (myDataAttr[i + 1]) {
                                formatted.push(myDataAttr[i + 1].toUpperCase());
                            }
                            else {
                                formatted.push("-");
                            }
                            upcased = true;
                        }
                        else {
                            formatted.push(myDataAttr[i]);
                        }
                    }
                    return formatted.join("");
                }
                return matches.map(function (match) {
                    try {
                        let myAttr = attr;
                        // Only names that actually start with "data-" are data
                        // attributes; a name that merely contains "data-" must
                        // be looked up verbatim.
                        if (myAttr.startsWith("data-")) {
                            myAttr = formatDataAttribute(attr);
                        }
                        //@ts-ignore
                        if (match[myAttr]) {
                            //@ts-ignore
                            return { result: match[myAttr], isDefault: false };
                        }
                        else if (match.getAttribute(myAttr)) {
                            return { result: match.getAttribute(myAttr), isDefault: false };
                        }
                        else if (match.dataset && match.dataset[myAttr]) {
                            return { result: match.dataset[myAttr], isDefault: false };
                        }
                        else {
                            return { result: "", isDefault: true };
                        }
                    }
                    catch (_e) {
                        // Best effort: a failing lookup yields the default value.
                        return { result: "", isDefault: true };
                    }
                });
            }, prop.id, extractable, matchPointer);
        }

        if (Array.isArray(extractable)) {
            const matchResults = yield recursiveExtract(ayakashiInstance, extractable[0], prop, matchPointer);
            return matchResults.map(function (matchResult) {
                // Substitute the caller-supplied default only when one was given.
                if (matchResult.isDefault && extractable.length > 1) {
                    return { result: extractable[1], isDefault: true };
                }
                return { result: matchResult.result, isDefault: false };
            });
        }

        if (typeof extractable === "function") {
            return ayakashiInstance.evaluate(function (scopedPropId, fn, pointer) {
                let matches = this.propTable[scopedPropId].matches;
                if (pointer === 1) {
                    matches = [matches[0]];
                }
                if (pointer === -1) {
                    matches = [matches[matches.length - 1]];
                }
                return matches.map(function (match, index) {
                    return { result: fn(match, index), isDefault: false };
                });
            }, prop.id, extractable, matchPointer);
        }

        // `util.isRegExp` is deprecated (DEP0055) and no longer exists in
        // recent Node.js releases; `util.types.isRegExp` is its replacement.
        if (util_1.types.isRegExp(extractable)) {
            return ayakashiInstance.evaluate(function (scopedPropId, regex, pointer) {
                let matches = this.propTable[scopedPropId].matches;
                if (pointer === 1) {
                    matches = [matches[0]];
                }
                if (pointer === -1) {
                    matches = [matches[matches.length - 1]];
                }
                return matches.map(function (match) {
                    let regexResult = "";
                    if (match.textContent) {
                        const regexMatch = match.textContent.match(regex);
                        if (regexMatch && regexMatch[0]) {
                            regexResult = regexMatch[0];
                        }
                    }
                    return { result: regexResult, isDefault: regexResult === "" };
                });
            }, prop.id, extractable, matchPointer);
        }

        if (typeof extractable === "object" && extractable !== null) {
            opLog.warn("Nested or multiple extractions per prop are deprecated");
            opLog.warn("Learn more here: https://ayakashi-io.github.io/docs/guide/data-extraction.html");
        }
        throw new Error("Invalid extractable");
    });
}