@skypilot/scraper
Version:
Node-based scriptable web scraper
888 lines (722 loc) • 24.7 kB
JavaScript
"use strict";
// Flag this CommonJS module as transpiled from ES-module syntax so that
// interop helpers (see `_interopRequireDefault`) treat it accordingly.
Object.defineProperty(exports, "__esModule", {
value: true
});
// Declare the named export up front; the class is assigned to it at the end of the file.
exports.PlaywrightScraper = void 0;
var _sugarbowl = require("@skypilot/sugarbowl");
var _playwright = _interopRequireDefault(require("playwright"));
var _sliceArray = require("../../lib/array/sliceArray");
var _pluralize = require("../../lib/string/pluralize");
var _normalizeQuery = require("../../scraper/normalizeQuery");
var _normalizeQueryDict = require("../../scraper/normalizeQueryDict");
var _State = require("../../scraper/State");
var _Logger = require("../../utils/Logger");
var _readConfigs = require("../../utils/readConfigs");
var _resolveHref = require("../../utils/resolveHref");
var _getAttribute = require("./getAttribute");
/**
 * Makes a required module look like an ES module namespace.
 * Modules flagged with `__esModule` are returned untouched; anything else is
 * wrapped so it can be read through `.default`.
 */
function _interopRequireDefault(obj) {
  if (obj && obj.__esModule) {
    return obj;
  }
  return { default: obj };
}
/**
 * Lists an object's own enumerable string keys followed by its own symbol keys.
 * When `enumerableOnly` is truthy, non-enumerable symbols are left out.
 */
function ownKeys(object, enumerableOnly) {
  const keys = Object.keys(object);
  if (!Object.getOwnPropertySymbols) {
    return keys;
  }
  const allSymbols = Object.getOwnPropertySymbols(object);
  const symbols = enumerableOnly
    ? allSymbols.filter(sym => Object.getOwnPropertyDescriptor(object, sym).enumerable)
    : allSymbols;
  for (const sym of symbols) {
    keys.push(sym);
  }
  return keys;
}
/**
 * Copies the properties of every argument after `target` onto `target` and returns it.
 *
 * Sources at odd argument positions are copied key by key (own enumerable
 * string keys and enumerable symbols, via `_defineProperty`); sources at even
 * positions are copied descriptor by descriptor, which also carries over
 * non-enumerable properties. `null`/`undefined` sources count as empty objects.
 */
function _objectSpread(target) {
  const sources = Array.prototype.slice.call(arguments, 1);
  sources.forEach((rawSource, offset) => {
    const source = rawSource != null ? rawSource : {};
    const position = offset + 1;
    if (position % 2 === 1) {
      for (const key of ownKeys(Object(source), true)) {
        _defineProperty(target, key, source[key]);
      }
      return;
    }
    if (Object.getOwnPropertyDescriptors) {
      Object.defineProperties(target, Object.getOwnPropertyDescriptors(source));
      return;
    }
    // Fallback for runtimes without Object.getOwnPropertyDescriptors.
    for (const key of ownKeys(Object(source))) {
      Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(source, key));
    }
  });
  return target;
}
/**
 * Returns a shallow copy of `source` without the keys listed in `excluded`.
 * String keys are handled by `_objectWithoutPropertiesLoose`; this wrapper
 * additionally copies own enumerable symbol keys that are not excluded.
 * A `null`/`undefined` source yields an empty object.
 */
function _objectWithoutProperties(source, excluded) {
  if (source == null) {
    return {};
  }
  const target = _objectWithoutPropertiesLoose(source, excluded);
  if (!Object.getOwnPropertySymbols) {
    return target;
  }
  for (const symbolKey of Object.getOwnPropertySymbols(source)) {
    const isExcluded = excluded.indexOf(symbolKey) >= 0;
    if (isExcluded || !Object.prototype.propertyIsEnumerable.call(source, symbolKey)) {
      continue;
    }
    target[symbolKey] = source[symbolKey];
  }
  return target;
}
/**
 * Returns a new object holding the own enumerable string-keyed properties of
 * `source` whose keys are not listed in `excluded`.
 * A `null`/`undefined` source yields an empty object.
 */
function _objectWithoutPropertiesLoose(source, excluded) {
  if (source == null) {
    return {};
  }
  const target = {};
  for (const key of Object.keys(source)) {
    if (excluded.indexOf(key) < 0) {
      target[key] = source[key];
    }
  }
  return target;
}
/**
 * Sets `obj[key]` to `value` and returns `obj`.
 * If the key is already reachable on the object (own or inherited) it is
 * (re)defined as an own, enumerable, configurable, writable data property;
 * otherwise a plain assignment is used.
 */
function _defineProperty(obj, key, value) {
  if (!(key in obj)) {
    obj[key] = value;
    return obj;
  }
  Object.defineProperty(obj, key, {
    value,
    enumerable: true,
    configurable: true,
    writable: true,
  });
  return obj;
}
// Default `waitUntil` value handed to `page.goto` when the caller does not pass one.
const defaultWaitUntil = 'networkidle';
// The values below are read once, at module load, through the project's
// `readConfigs` helper. NOTE(review): its lookup rules (config files, env,
// what `required` does when the key is missing) are not visible from this file.

// Default number of retries for `goTo` (0 = a single attempt).
const defaultRetryLimit = (0, _readConfigs.readConfigs)('retryLimit', {
defaultValue: 0
});
// Default `delayInMs` browser option; `launch` passes it to Chromium as `slowMo`.
const delayInMs = (0, _readConfigs.readConfigs)('delayInMs', {
defaultValue: 0
});
// Default `headless` browser option.
const headless = (0, _readConfigs.readConfigs)('headless', {
defaultValue: true
});
// Default user agent for the browser context; this key is marked as required.
const userAgent = (0, _readConfigs.readConfigs)('userAgents.default', {
required: true
});
// Selector ends with an XPath attribute step such as `/@href` or `/@data-id`.
// The trailing class is `[^/]*` (zero or more): the previous `[^/]` demanded
// exactly one extra character after the letter run, so one-letter names and
// names containing `-`, `_`, `:` or digits after the first run were rejected.
// Everything the previous pattern matched is still matched.
const attributeSelectorPattern = /\/@([A-Za-z]+)[^/]*$/;
// Selector ends with a text step: `/text()` or `::text()`, optionally followed
// by a numeric predicate such as `[2]`.
const textSelectorPattern = /(\/|::)text\(\)(\[[0-9]+])?$/;
/**
 * Formats an error's message for the log: a multi-line message becomes an
 * array of lines, a single-line message is returned as a string.
 */
function splitErrorMessage(error) {
  const message = error && error.message != null ? String(error.message) : String(error);
  return message.includes('\n') ? message.split('\n') : message;
}

/**
 * Scriptable scraper that drives a Chromium browser through Playwright.
 *
 * The instance keeps a stack of open pages (`pages`; the last one is the
 * "current" page), a `State` object that collects scraped values, an optional
 * database used by `write`, and a logger.
 */
class PlaywrightScraper {
  /**
   * @param {object} [options]
   * @param {object} [options.database] - Object with an `update(collectionName, data)` method; needed by `write`.
   * @param {string} [options.logDir] - Passed to the logger.
   * @param {string} [options.logFileName] - Passed to the logger.
   * @param {number} [options.variableDelayInMs=0] - Upper bound of the random delay inserted before navigations.
   * @param {boolean} [options.verbose] - Passed to the logger.
   * Every other option is treated as a browser option and overrides the
   * configured `delayInMs`, `headless` and `userAgent` defaults.
   */
  constructor(options = {}) {
    _defineProperty(this, "browser", null);
    _defineProperty(this, "browserOptions", {});
    _defineProperty(this, "context", null);
    _defineProperty(this, "logger", void 0);
    _defineProperty(this, "database", null);
    _defineProperty(this, "pages", []);
    _defineProperty(this, "runLevel", 0);
    _defineProperty(this, "state", new _State.State());
    _defineProperty(this, "variableDelayInMs", void 0);
    const {
      database,
      logDir,
      logFileName,
      variableDelayInMs = 0,
      verbose
    } = options;
    const browserOptions = _objectWithoutProperties(options, ["database", "logDir", "logFileName", "variableDelayInMs", "verbose"]);
    if (database) {
      this.database = database;
    }
    this.variableDelayInMs = variableDelayInMs;
    this.browserOptions = _objectSpread({
      delayInMs,
      headless,
      userAgent
    }, browserOptions);
    this.logger = new _Logger.Logger({
      logDir,
      logFileName,
      verbose
    });
  }

  /** The most recently opened page, or `undefined` when no page is open. */
  get currentPage() {
    return this.pages[this.pages.length - 1];
  }

  /** Adds an entry to the log, tagged with the current run nesting level. */
  addToLog(message, options = {}) {
    this.logger.add(message, _objectSpread(_objectSpread({}, options), {}, {
      runLevel: this.runLevel
    }));
  }

  /** Clears the scraper's own state object. */
  clearState() {
    this.state.clear();
  }

  /**
   * Waits for `query.sel` to appear under `query.baseRef` (default: current page), then clicks it.
   *
   * Failures are logged and swallowed unless `options.throwOnWaitTimeout` is set.
   * `options` is also forwarded to the click call, on top of `{ delay: 25, noWaitAfter: true }`.
   *
   * @param {object} query - `{ sel, baseRef? }`.
   * @param {object} [options] - `throwOnWaitTimeout` (default false), `waitTimeoutInMs` (default 30000), plus click options.
   * @throws {Error} When there is no page/base reference to click on.
   */
  async click(query, options = {}) {
    const {
      baseRef = this.currentPage,
      sel
    } = query;
    if (!baseRef) {
      throw new Error('No baseRef');
    }
    const clickOptions = _objectSpread({
      delay: 25,
      noWaitAfter: true
    }, options);
    const {
      throwOnWaitTimeout = false,
      waitTimeoutInMs = 30000
    } = options;
    this.addToLog(`click: waiting for selector: '${sel}'`);
    // Tracks which phase failed so the log does not blame the wait for a failed click.
    let selectorAppeared = false;
    try {
      await baseRef.waitForSelector(sel, {
        timeout: waitTimeoutInMs
      });
      selectorAppeared = true;
      this.addToLog(`click: clicking selector: '${sel}'`);
      await baseRef.click(sel, clickOptions);
    } catch (error) {
      this.addToLog(selectorAppeared ? `click: failed to click selector: '${sel}'` : `Element to be clicked did not appear after ${waitTimeoutInMs}ms`);
      if (throwOnWaitTimeout) {
        throw error;
      }
    }
  }

  /**
   * Closes all pages, then the context, then the browser, and resets the
   * instance so it can be launched again. The browser is closed and the
   * fields are reset even if closing the pages or the context fails.
   */
  async close() {
    try {
      await this.closeAllPages();
      if (this.context != null) {
        await this.context.close();
      }
    } finally {
      try {
        if (this.browser != null) {
          await this.browser.close();
        }
      } finally {
        this.pages = [];
        this.context = null;
        this.browser = null;
      }
    }
  }

  /** Closes every open page, newest first. */
  async closeAllPages() {
    while (this.pages.length > 0) {
      await this.closeLatestPage();
    }
  }

  /** Removes the newest page from the stack and closes it; does nothing when no page is open. */
  async closeLatestPage() {
    const page = this.pages.pop();
    if (page != null) {
      await page.close();
    }
  }

  /**
   * Counts the elements on the current page matching the (normalized) query,
   * after applying `slice` (or `limit` when no slice is given).
   * @returns {Promise<number>} 0 when no page is open.
   */
  async count(query) {
    const {
      sel,
      limit,
      slice
    } = (0, _normalizeQuery.normalizeQuery)(query);
    const page = this.currentPage;
    const handles = (page == null ? undefined : await page.$$(sel)) || [];
    return (0, _sliceArray.sliceArray)(handles, slice || limit).length;
  }

  /**
   * Reads the `href` of the element matched by `query` and navigates to it,
   * resolved against the current document's URL.
   * @returns {Promise<object>} The context returned by `goTo`.
   * @throws {Error} When no page is open or the query yields no `href`.
   */
  async follow(query, navOptions = {}) {
    if (!this.currentPage) {
      throw new Error('No page is open');
    }
    const {
      sel
    } = query;
    const href = await this.getAttribute(_objectSpread(_objectSpread({}, query), {}, {
      attr: 'href'
    }));
    if (!href) {
      throw new Error(`No match for 'follow' query selector: '${sel}@href'`);
    }
    this.addToLog(`follow: ${sel}/="${href}"`);
    const documentHref = await this.currentPage.evaluate(() => document.location.href);
    return this.goTo((0, _resolveHref.resolveHref)(documentHref, href), navOptions);
  }

  /**
   * Resolves a single query to a value (or list of values when `query.scope` is not `'one'`).
   *
   * Selectors ending in an attribute step or `text()` are evaluated as XPath
   * text values; otherwise the attribute named by `query.attr` is read, or the
   * text content when no `attr` is given.
   */
  async get(query) {
    const {
      attr,
      scope = 'one'
    } = query;
    const wantsOne = scope === 'one';
    if (attributeSelectorPattern.test(query.sel) || textSelectorPattern.test(query.sel)) {
      return wantsOne ? this.getTextValue(query) : this.getTextValueAll(query);
    }
    if (wantsOne) {
      return attr ? this.getAttribute(query) : this.getTextContent(query);
    }
    return attr ? this.getAttributeAll(query) : this.getTextContentAll(query);
  }

  /**
   * Describes the current browser and page.
   * @returns {Promise<{browser: {version: *}, page: {url: *, title: *}}>}
   *   Fields are `undefined` when the browser or page does not exist.
   */
  async getContext() {
    const browser = this.browser;
    const page = this.currentPage;
    return {
      browser: {
        version: browser == null ? undefined : await browser.version()
      },
      page: {
        url: page == null ? undefined : await page.url(),
        title: page == null ? undefined : await page.title()
      }
    };
  }

  /** Returns the current page's content, or `null` when no page is open or the content is empty. */
  async getPageContent() {
    const page = this.currentPage;
    return (page == null ? undefined : await page.content()) || null;
  }

  /** Returns the data held by the scraper's own state object. */
  getState() {
    return this.state.get();
  }

  /**
   * Reads attribute `query.attr` from the element selected by `query`.
   * `attr === 'outerHTML'` is delegated to `getOuterHtml`.
   * @returns {Promise<*>} The (optionally transformed) value, or `null` when nothing matches.
   * @throws {Error} When `query.attr` is empty.
   */
  async getAttribute(query) {
    const {
      attr,
      noTrim = false,
      transform
    } = query;
    if (!attr) {
      throw new Error('attr (attribute name) cannot be empty');
    }
    const elementRef = await this.select(query);
    if (!elementRef) {
      return null;
    }
    if (attr === 'outerHTML') {
      return this.getOuterHtml(query);
    }
    const attributeValue = await (0, _getAttribute.getAttribute)(elementRef, attr, {
      noTrim
    });
    return transform ? transform(attributeValue, await this.getContext()) : attributeValue;
  }

  /**
   * Reads attribute `query.attr` from every element selected by `query`.
   * `attr === 'outerHTML'` is delegated to `getOuterHtmlAll`.
   * @throws {Error} When `query.attr` is empty.
   */
  async getAttributeAll(query) {
    const {
      attr,
      noTrim = false,
      transform
    } = query;
    if (!attr) {
      throw new Error('attr (attribute name) cannot be empty');
    }
    if (attr === 'outerHTML') {
      return this.getOuterHtmlAll(query);
    }
    const handles = await this.selectAll(query);
    const attributeValues = await Promise.all(handles.map(handle => (0, _getAttribute.getAttribute)(handle, attr, {
      noTrim
    })));
    if (!transform) {
      return attributeValues;
    }
    return Promise.all(attributeValues.map(async attributeValue => transform(attributeValue, await this.getContext())));
  }

  /** Returns the `outerHTML` of the first element matching `query.sel`, or `null` when the result is empty. */
  async getOuterHtml(query) {
    const {
      baseRef = this.currentPage,
      sel,
      transform
    } = query;
    const outerHtml = (baseRef == null ? undefined : await baseRef.$eval(sel, element => element.outerHTML)) || null;
    return transform ? transform(outerHtml, await this.getContext()) : outerHtml;
  }

  /** Returns the `outerHTML` of every element matching `query.sel`, after `slice`/`limit`. */
  async getOuterHtmlAll(query) {
    const {
      baseRef = this.currentPage,
      limit,
      sel,
      slice,
      transform
    } = query;
    const outerHtmlContents = (baseRef == null ? undefined : await baseRef.$$eval(sel, elements => elements.map(element => element.outerHTML))) || [];
    const slicedContents = (0, _sliceArray.sliceArray)(outerHtmlContents, slice || limit);
    if (!transform) {
      return slicedContents;
    }
    return Promise.all(slicedContents.map(async content => transform(content, await this.getContext())));
  }

  /**
   * Returns the text content of the element selected by `query`, trimmed unless `query.noTrim`.
   * @returns {Promise<*>} `null` when no page is open or nothing matches.
   */
  async getTextContent(query) {
    const {
      noTrim = false,
      transform
    } = query;
    if (!this.currentPage) {
      return null;
    }
    const elementRef = await this.select(query);
    if (!elementRef) {
      return null;
    }
    const textContent = await elementRef.textContent();
    const trimmedContent = !textContent || noTrim ? textContent : textContent.trim();
    return transform ? transform(trimmedContent, await this.getContext()) : trimmedContent;
  }

  /** Returns the text content of every element selected by `query`, trimmed unless `query.noTrim`. */
  async getTextContentAll(query) {
    const {
      noTrim,
      transform
    } = query;
    const handles = await this.selectAll(query);
    const textContents = await Promise.all(handles.map(handle => handle.textContent()));
    const trimmedContents = textContents.map(textContent => !textContent || noTrim ? textContent : textContent.trim());
    if (!transform) {
      return trimmedContents;
    }
    return Promise.all(trimmedContents.map(async content => transform(content, await this.getContext())));
  }

  /**
   * Evaluates `query.sel` as an XPath expression against the current document
   * and returns its string value, trimmed unless `query.noTrim`.
   * Note: the expression is evaluated from the document root; `query.baseRef` is not used.
   * @throws {Error} When no page is open.
   */
  async getTextValue(query) {
    const {
      noTrim = false,
      transform
    } = query;
    if (!this.currentPage) {
      throw new Error('No page is open');
    }
    const result = await this.currentPage.evaluate(selector => document.evaluate(selector, document, null, XPathResult.STRING_TYPE, null).stringValue, query.sel);
    if (!(0, _sugarbowl.isDefinite)(result)) {
      return null;
    }
    const trimmedValue = noTrim ? result : result.trim();
    return transform ? transform(trimmedValue, await this.getContext()) : trimmedValue;
  }

  /**
   * Evaluates `query.sel` as an XPath expression against the current document
   * and returns the values of all matched attribute and text nodes, in
   * document order, trimmed unless `query.noTrim`. Other node types are skipped.
   * @throws {Error} When no page is open.
   */
  async getTextValueAll(query) {
    const {
      noTrim = false,
      transform
    } = query;
    if (!this.currentPage) {
      throw new Error('No page is open');
    }
    // This callback is executed in the page, so it must be self-contained.
    const textValues = await this.currentPage.evaluate(selector => {
      const xPathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_ITERATOR_TYPE, null);
      if (!xPathResult) {
        return [];
      }
      const values = [];
      for (let node = xPathResult.iterateNext(); node; node = xPathResult.iterateNext()) {
        if (node instanceof Attr) {
          values.push(node.value);
        }
        if (node instanceof Text) {
          values.push(node.wholeText);
        }
      }
      return values;
    }, query.sel);
    // Wrap `trim` so `map` cannot hand it the index/array as extra arguments.
    const trimmedValues = noTrim ? textValues : textValues.map(value => (0, _sugarbowl.trim)(value));
    if (!transform) {
      return trimmedValues;
    }
    return Promise.all(trimmedValues.map(async value => transform(value, await this.getContext())));
  }

  /**
   * Navigates to `url`, retrying on failure.
   *
   * @param {string} url
   * @param {object} [options]
   * @param {boolean} [options.addUrlToState] - Store the final URL in `state` under `url`.
   * @param {boolean} [options.newPage] - Open a new page instead of reusing the current one.
   * @param {number} [options.retryLimit] - Retries after the first attempt (default: configured `retryLimit`).
   * @param {object} [options.state] - State object to update (default: the scraper's own).
   * @param {Function} [options.validate] - Called with `url`; if it returns `{ messages }` with at least
   *   one message the navigation is skipped and `{ page: {} }` is returned.
   * @param {boolean} [options.verbose=false] - Also report progress on the console.
   * @param {string} [options.waitUntil='networkidle'] - Passed to `page.goto`.
   * @returns {Promise<object>} The context (see `getContext`) after navigation.
   * @throws {Error} 'Aborted after N attempts' once every attempt has failed.
   */
  async goTo(url, options = {}) {
    const {
      addUrlToState,
      newPage,
      retryLimit = defaultRetryLimit,
      state = this.state,
      validate,
      verbose = false,
      waitUntil = defaultWaitUntil
    } = options;
    this.addToLog(`goTo: '${url}'`);
    if (validate) {
      const {
        messages = []
      } = validate(url) || {};
      if (messages.length > 0) {
        this.addToLog(messages, {
          prefix: 'skipping: '
        });
        return {
          page: {}
        };
      }
    }
    // Random pause between navigations; skipped for the very first page.
    if (this.currentPage && this.variableDelayInMs) {
      await (0, _sugarbowl.sleep)((0, _sugarbowl.generateRandomInt)(0, this.variableDelayInMs));
    }
    const page = !newPage && this.currentPage || (await this.newPage());
    if (!page) {
      throw new Error('No page is open');
    }
    // Always make at least one attempt, even for a zero or negative retry limit.
    const maxAttempts = Math.max(1, retryLimit + 1);
    let done = false;
    let attemptNumber = 0;
    while (!done && attemptNumber < maxAttempts) {
      attemptNumber += 1;
      (0, _sugarbowl.consoleIf)(verbose)([`Navigating to URL: ${url}`, ...(0, _sugarbowl.includeIf)(attemptNumber > 1, `(attempt ${attemptNumber})`)].join(' '));
      try {
        await page.goto(url, {
          waitUntil
        });
        const context = await this.getContext();
        if (context.page.url === 'chrome-error://chromewebdata/') {
          throw new Error('Browser error');
        }
        // Only mark success after the error-page check, so that a browser
        // error page is retried like any other failed attempt.
        done = true;
      } catch (error) {
        this.addToLog({
          code: error.code,
          message: splitErrorMessage(error),
          operation: 'goTo',
          url,
          attemptNumber
        }, {
          prefix: 'warning: '
        });
        if (attemptNumber >= maxAttempts) {
          throw new Error(['Aborted after', (0, _sugarbowl.inflectQuantity)(attemptNumber, 'attempt')].join(' '));
        }
      }
    }
    const actualUrl = (await this.getContext()).page.url;
    if (addUrlToState) {
      state.update({
        url: actualUrl
      });
    }
    if (actualUrl !== url) {
      (0, _sugarbowl.consoleIf)(verbose)(`Redirected to ${actualUrl}`);
      this.addToLog(`[redirected to '${actualUrl}']`);
    }
    return this.getContext();
  }

  /**
   * Tells whether the current page has a match for `query.sel`; with
   * `query.nth`, whether it has at least that many matches.
   */
  async has(query) {
    const {
      sel,
      nth
    } = query;
    if (nth) {
      return (await this.count(query)) >= nth;
    }
    const page = this.currentPage;
    if (page == null) {
      return false;
    }
    return !!(await page.$(sel));
  }

  /** True when a context exists and the browser reports itself as connected. */
  isConnected() {
    return !!(this.context && this.browser && this.browser.isConnected());
  }

  /**
   * Launches Chromium and creates a browser context using the stored browser options.
   * @returns {Promise<{version: *}>} The browser version.
   * @throws {Error} When a browser has already been launched.
   */
  async launch() {
    if (this.browser) {
      throw new Error('The browser has already been launched');
    }
    const {
      delayInMs: slowMo,
      headless,
      userAgent
    } = this.browserOptions;
    this.browser = await _playwright.default.chromium.launch({
      headless,
      slowMo
    });
    this.context = await this.browser.newContext({
      userAgent
    });
    return {
      version: this.browser.version()
    };
  }

  /** Opens a new page (launching the browser first if needed) and makes it the current page. */
  async newPage() {
    if (!this.context) {
      await this.launch();
    }
    const page = await this.context.newPage();
    this.pages.push(page);
    return page;
  }

  /**
   * Resolves every query in `queryDict`, one after another, into an object
   * keyed like the dictionary.
   *
   * @param {object} queryDict - Normalized with `normalizeQueryDict` before use.
   * @param {object} [options] - `baseRef`, `state` (default: own state), `statePath`,
   *   `transform(results, context)`, `updateState` (store the result in `state`).
   * @returns {Promise<*>} The (optionally transformed) results.
   */
  async query(queryDict, options = {}) {
    const {
      baseRef,
      state = this.state,
      statePath,
      transform,
      updateState
    } = options;
    this.addToLog(queryDict, {
      prefix: 'query'
    });
    let results = {};
    for (const [key, query] of Object.entries((0, _normalizeQueryDict.normalizeQueryDict)(queryDict))) {
      const result = await this.get((0, _sugarbowl.omitUndefined)(_objectSpread(_objectSpread({}, query), {}, {
        baseRef
      })));
      results = _objectSpread(_objectSpread({}, results), {}, {
        [key]: result
      });
    }
    const queryResult = transform ? await transform(results, await this.getContext()) : results;
    this.addToLog(queryResult, {
      prefix: 'result'
    });
    if (updateState) {
      state.update(queryResult, {
        statePath
      });
    }
    return queryResult;
  }

  /**
   * Executes a list of commands (`click`, `follow`, `goTo`, `query`,
   * `runOnAll`, `set`, `write`) in order.
   *
   * A failed or skipped navigation stops the remaining commands. Pages opened
   * during the run are closed when it ends, and the nesting level is restored,
   * whether the run succeeds or throws. When the outermost run succeeds the
   * log is written (and displayed if `displayLog` is set).
   *
   * @param {Array|object} commandsOrBuilder - Command array, or an object with a `commands` array.
   * @param {object} [options] - `baseRef`, `collectionName` ('records'), `displayLog`, `nth`,
   *   `retryLimit` (2), `state` (default: own state), `statePath`.
   * @returns {Promise<*>} The data held by `state` after the run.
   * @throws {Error} On an unrecognized action, or whatever a command throws.
   */
  async run(commandsOrBuilder, options = {}) {
    const {
      collectionName = 'records',
      displayLog,
      nth,
      retryLimit = 2,
      state = this.state,
      statePath
    } = options;
    const commands = Array.isArray(commandsOrBuilder) ? commandsOrBuilder : commandsOrBuilder.commands;
    let {
      baseRef
    } = options;
    let stopRun = false;
    // Close exactly the pages this run opens. Deriving this from the page
    // stack (rather than from returned URLs) avoids closing the caller's page
    // when a navigation fails before any page was opened.
    const pageCountAtStart = this.pages.length;
    this.runLevel += 1;
    try {
      if (nth) {
        this.addToLog(`- run ${nth}`);
      }
      for (const command of commands) {
        if (stopRun) {
          break;
        }
        switch (command.action) {
          case 'click':
            await this.click(command.query, command.options);
            break;
          case 'follow':
            {
              const {
                query,
                options: navOptions = {}
              } = command;
              let pageContext;
              try {
                pageContext = await this.follow(_objectSpread({
                  baseRef
                }, query), _objectSpread({
                  addUrlToState: true,
                  newPage: true,
                  retryLimit,
                  state
                }, navOptions));
                if (!pageContext.page.url) {
                  stopRun = true;
                }
              } catch (error) {
                this.addToLog({
                  operation: 'follow',
                  query,
                  code: error.code,
                  message: splitErrorMessage(error),
                  retryLimit
                }, {
                  prefix: 'warning: '
                });
                stopRun = true;
                pageContext = await this.getContext();
              }
              if (pageContext.page && pageContext.page.url) {
                // Element references from the previous page do not apply to the new one.
                baseRef = undefined;
              }
              break;
            }
          case 'goTo':
            {
              const {
                url,
                options: navOptions = {}
              } = command;
              // `state` is forwarded (as for 'follow') so the URL is recorded in
              // this run's state rather than always in the scraper-wide state.
              const mergedNavOptions = _objectSpread({
                addUrlToState: true,
                newPage: true,
                retryLimit,
                state
              }, navOptions);
              let pageContext;
              try {
                pageContext = await this.goTo(url, mergedNavOptions);
                if (!pageContext.page.url) {
                  stopRun = true;
                }
              } catch (error) {
                this.addToLog({
                  operation: 'goTo',
                  url,
                  code: error.code,
                  message: splitErrorMessage(error),
                  retryLimit
                }, {
                  prefix: 'warning: '
                });
                stopRun = true;
                pageContext = await this.getContext();
              }
              if (pageContext.page && pageContext.page.url) {
                baseRef = undefined;
              }
              break;
            }
          case 'query':
            {
              const {
                queryDict,
                options: queryOptions = {}
              } = command;
              await this.query(queryDict, (0, _sugarbowl.omitUndefined)(_objectSpread({
                baseRef,
                state,
                statePath,
                updateState: true
              }, queryOptions)));
              break;
            }
          case 'runOnAll':
            {
              const {
                query,
                commands: nestedCommands,
                options: runOnAllOptions
              } = command;
              await this.runOnAll(query, nestedCommands, (0, _sugarbowl.omitUndefined)(_objectSpread({
                baseRef,
                collectionName,
                state
              }, runOnAllOptions)));
              break;
            }
          case 'set':
            await state.update(command.state, {
              statePath
            });
            break;
          case 'write':
            await this.write(command.collectionName || collectionName, {
              state
            });
            break;
          default:
            throw new Error(`Unrecognized action: ${command.action}`);
        }
      }
    } finally {
      try {
        while (this.pages.length > pageCountAtStart) {
          await this.closeLatestPage();
        }
      } finally {
        this.runLevel -= 1;
      }
    }
    if (this.runLevel === 0) {
      if (displayLog) {
        this.logger.display();
      }
      this.logger.write();
    }
    return state.get();
  }

  /**
   * Runs `commands` once per element matched by `query`, each time with that
   * element as `baseRef` and a clone of `state`, then stores the collected
   * records in `state` under `collectionName`.
   * @returns {Promise<Array>} One record (state data) per matched element.
   */
  async runOnAll(query, commands, options = {}) {
    const {
      baseRef,
      collectionName = 'records',
      state = this.state
    } = options;
    const baseRefs = await this.selectAll(_objectSpread(_objectSpread({}, query), {}, {
      baseRef
    }));
    this.addToLog(['runOnAll:', `selector: '${query.sel}',`, `${(0, _pluralize.pluralize)(baseRefs.length, 'match', 'matches')}`, ...(0, _sugarbowl.includeIf)(query.limit, `(max: ${query.limit})`), ...(0, _sugarbowl.includeIf)(query.slice, `(slice: ${query.slice})`)].join(' '));
    const records = [];
    for (const [index, elementRef] of baseRefs.entries()) {
      const recordState = state.clone();
      await this.run(commands, {
        baseRef: elementRef,
        collectionName,
        nth: index + 1,
        state: recordState
      });
      records.push(recordState.get());
    }
    state.update({
      [collectionName]: records
    });
    return records;
  }

  /**
   * Selects a single element.
   *
   * Without a selector, `query.baseRef` itself is returned (when given).
   * Otherwise the `nth` (1-based, default 1) match of `query.sel` under
   * `baseRef` (default: current page) is returned.
   * @returns {Promise<*>} The element reference, or `null` when there is no base,
   *   `nth` is not a positive integer, or there are fewer than `nth` matches.
   * @throws {Error} When `nth > 1` is combined with a missing selector.
   */
  async select(query) {
    if (!query.sel && query.baseRef) {
      if (query.nth && query.nth > 1) {
        throw new Error('nth cannot be set when no selector is given');
      }
      return query.baseRef;
    }
    const {
      baseRef = this.currentPage,
      sel,
      nth = 1
    } = query;
    if (!baseRef || nth < 1 || !(0, _sugarbowl.isInteger)(nth)) {
      return null;
    }
    if (nth === 1) {
      return baseRef.$(sel);
    }
    const elementRefs = (await baseRef.$$(sel)) || [];
    return elementRefs.length < nth ? null : elementRefs[nth - 1];
  }

  /** Selects every match of `query.sel` under `baseRef` (default: current page), after `slice`/`limit`. */
  async selectAll(query) {
    const {
      baseRef = this.currentPage,
      sel,
      limit,
      slice
    } = query;
    const refs = (baseRef == null ? undefined : await baseRef.$$(sel)) || [];
    return (0, _sliceArray.sliceArray)(refs, slice || limit);
  }

  /** Merges `data` into `options.state` (default: own state) at `options.statePath`. */
  updateState(data, options = {}) {
    const {
      state = this.state,
      statePath
    } = options;
    return state.update(data, {
      statePath
    });
  }

  /**
   * Writes the state's data, stamped with `retrievedAt`, to the database.
   * @param {string} collectionName
   * @param {object} [options] - `state` (default: own state).
   * @throws {Error} When no database is configured, `collectionName` is empty,
   *   or the state holds no data.
   */
  async write(collectionName, options = {}) {
    const {
      state = this.state
    } = options;
    if (!this.database) {
      throw new Error('No database has been declared');
    }
    if (!collectionName) {
      throw new Error('`collectionName` cannot be empty');
    }
    const stateData = state.get();
    // Check the retrieved data: `state` itself is necessarily set once `get()` has been called on it.
    if (!stateData) {
      throw new Error('There is no data to write');
    }
    return this.database.update(collectionName, _objectSpread({
      retrievedAt: new Date().toISOString()
    }, stateData));
  }
}
// Publish the class as the module's named export `PlaywrightScraper`.
exports.PlaywrightScraper = PlaywrightScraper;