UNPKG

@skypilot/scraper

Version:
888 lines (722 loc) 24.7 kB
"use strict";

/*
 * PlaywrightScraper: a command-driven page scraper built on Playwright's Chromium driver.
 *
 * NOTE(review): this file looks like Babel-compiled CommonJS output (see the `__esModule`
 * marker and the `_interopRequireDefault` / `_objectSpread` helpers). The helpers and the
 * `(0, module.fn)(...)` call style are kept as generated so that runtime semantics and the
 * supported syntax level stay unchanged.
 */

Object.defineProperty(exports, "__esModule", { value: true });
exports.PlaywrightScraper = void 0;

var _sugarbowl = require("@skypilot/sugarbowl");
var _playwright = _interopRequireDefault(require("playwright"));
var _sliceArray = require("../../lib/array/sliceArray");
var _pluralize = require("../../lib/string/pluralize");
var _normalizeQuery = require("../../scraper/normalizeQuery");
var _normalizeQueryDict = require("../../scraper/normalizeQueryDict");
var _State = require("../../scraper/State");
var _Logger = require("../../utils/Logger");
var _readConfigs = require("../../utils/readConfigs");
var _resolveHref = require("../../utils/resolveHref");
var _getAttribute = require("./getAttribute");

/** Wraps a non-ES-module export so that it can be read through `.default`. */
function _interopRequireDefault(obj) {
  return obj && obj.__esModule ? obj : { default: obj };
}

/** Returns the own string keys of `object` plus its own symbols (optionally enumerable ones only). */
function ownKeys(object, enumerableOnly) {
  var keys = Object.keys(object);
  if (Object.getOwnPropertySymbols) {
    var symbols = Object.getOwnPropertySymbols(object);
    if (enumerableOnly) {
      symbols = symbols.filter(function (sym) {
        return Object.getOwnPropertyDescriptor(object, sym).enumerable;
      });
    }
    keys.push.apply(keys, symbols);
  }
  return keys;
}

/**
 * Object-spread helper: copies the properties of every argument after `target` onto `target`.
 * Odd-positioned sources are copied by value; even-positioned sources are copied by descriptor.
 */
function _objectSpread(target) {
  for (var i = 1; i < arguments.length; i++) {
    var source = arguments[i] != null ? arguments[i] : {};
    if (i % 2) {
      ownKeys(Object(source), true).forEach(function (key) {
        _defineProperty(target, key, source[key]);
      });
    } else if (Object.getOwnPropertyDescriptors) {
      Object.defineProperties(target, Object.getOwnPropertyDescriptors(source));
    } else {
      ownKeys(Object(source)).forEach(function (key) {
        Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(source, key));
      });
    }
  }
  return target;
}

/** Object-rest helper: returns a copy of `source` without the keys (or symbols) listed in `excluded`. */
function _objectWithoutProperties(source, excluded) {
  if (source == null) return {};
  var target = _objectWithoutPropertiesLoose(source, excluded);
  var key, i;
  if (Object.getOwnPropertySymbols) {
    var sourceSymbolKeys = Object.getOwnPropertySymbols(source);
    for (i = 0; i < sourceSymbolKeys.length; i++) {
      key = sourceSymbolKeys[i];
      if (excluded.indexOf(key) >= 0) continue;
      if (!Object.prototype.propertyIsEnumerable.call(source, key)) continue;
      target[key] = source[key];
    }
  }
  return target;
}

/** String-key-only variant of `_objectWithoutProperties`. */
function _objectWithoutPropertiesLoose(source, excluded) {
  if (source == null) return {};
  var target = {};
  var sourceKeys = Object.keys(source);
  var key, i;
  for (i = 0; i < sourceKeys.length; i++) {
    key = sourceKeys[i];
    if (excluded.indexOf(key) >= 0) continue;
    target[key] = source[key];
  }
  return target;
}

/** Sets `obj[key] = value`, using `Object.defineProperty` when the key already exists on `obj` or its prototype chain. */
function _defineProperty(obj, key, value) {
  if (key in obj) {
    Object.defineProperty(obj, key, { value: value, enumerable: true, configurable: true, writable: true });
  } else {
    obj[key] = value;
  }
  return obj;
}

// Defaults; all but `defaultWaitUntil` are read from configuration once, at module load.
const defaultWaitUntil = 'networkidle';
const defaultRetryLimit = (0, _readConfigs.readConfigs)('retryLimit', { defaultValue: 0 });
const delayInMs = (0, _readConfigs.readConfigs)('delayInMs', { defaultValue: 0 });
const headless = (0, _readConfigs.readConfigs)('headless', { defaultValue: true });
const userAgent = (0, _readConfigs.readConfigs)('userAgents.default', { required: true });

// Selectors matching either pattern are evaluated as XPath string/attribute lookups in `get()`.
// NOTE(review): the trailing `[^/]` requires one extra non-slash character after the letter
// run; `[^/]*` may have been intended — confirm before changing, since it alters matching.
const attributeSelectorPattern = /\/@([A-Za-z]+)[^/]$/;
const textSelectorPattern = /(\/|::)text\(\)(\[[0-9]+])?$/;

/**
 * Drives a Chromium browser through Playwright, runs selector-based queries against the
 * current page, accumulates results in a `State` object, and optionally writes them to a database.
 */
class PlaywrightScraper {
  /**
   * @param {object} [options] - `database`, `logDir`, `logFileName`, `variableDelayInMs` and
   *   `verbose` are consumed here; every other key is kept as a browser option and overrides
   *   the configured `delayInMs`, `headless` and `userAgent` defaults.
   */
  constructor(options = {}) {
    _defineProperty(this, "browser", null);
    _defineProperty(this, "browserOptions", {});
    _defineProperty(this, "context", null);
    _defineProperty(this, "logger", void 0);
    _defineProperty(this, "database", null);
    _defineProperty(this, "pages", []);
    _defineProperty(this, "runLevel", 0);
    _defineProperty(this, "state", new _State.State());
    _defineProperty(this, "variableDelayInMs", void 0);

    const { database, logDir, logFileName, variableDelayInMs = 0, verbose } = options;
    const browserOptions = _objectWithoutProperties(options, ["database", "logDir", "logFileName", "variableDelayInMs", "verbose"]);

    if (database) {
      this.database = database;
    }
    this.variableDelayInMs = variableDelayInMs;
    this.browserOptions = _objectSpread({ delayInMs, headless, userAgent }, browserOptions);
    this.logger = new _Logger.Logger({ logDir, logFileName, verbose });
  }

  /** The most recently opened page, or `undefined` when no page is open. */
  get currentPage() {
    return this.pages[this.pages.length - 1];
  }

  /** Adds `message` to the logger, tagging it with the current `runLevel`. */
  addToLog(message, options = {}) {
    this.logger.add(message, _objectSpread(_objectSpread({}, options), {}, { runLevel: this.runLevel }));
  }

  /** Clears the scraper's own state object. */
  clearState() {
    this.state.clear();
  }

  /**
   * Waits for `query.sel` to appear under `query.baseRef` (default: the current page), then clicks it.
   *
   * @param {object} query - `{ sel, baseRef? }`.
   * @param {object} [options] - merged over `{ delay: 25, noWaitAfter: true }` and passed to the
   *   click call; `waitTimeoutInMs` (default 30000) bounds the wait and `throwOnWaitTimeout`
   *   (default false) decides whether a failure is rethrown or only logged.
   * @throws {Error} when there is no base reference, or on failure when `throwOnWaitTimeout` is set.
   */
  async click(query, options = {}) {
    const { baseRef = this.currentPage, sel } = query;
    if (!baseRef) {
      throw new Error('No baseRef');
    }
    const clickOptions = _objectSpread({ delay: 25, noWaitAfter: true }, options);
    const { throwOnWaitTimeout = false, waitTimeoutInMs = 30000 } = options;

    this.addToLog(`click: waiting for selector: '${sel}'`);
    try {
      await baseRef.waitForSelector(sel, { timeout: waitTimeoutInMs });
      this.addToLog(`click: clicking selector: '${sel}'`);
      await baseRef.click(sel, clickOptions);
    } catch (error) {
      // A failure of the click itself is reported with the same message as a wait timeout.
      this.addToLog(`Element to be clicked did not appear after ${waitTimeoutInMs}ms`);
      if (throwOnWaitTimeout) {
        throw error;
      }
    }
  }

  /** Closes all pages, then the context and the browser (when present), and resets the references. */
  async close() {
    await this.closeAllPages();
    if (this.context != null) {
      await this.context.close();
    }
    if (this.browser != null) {
      await this.browser.close();
    }
    this.pages = [];
    this.context = null;
    this.browser = null;
  }

  /** Closes pages one at a time, newest first, until none remain. */
  async closeAllPages() {
    while (this.pages.length > 0) {
      await this.closeLatestPage();
    }
  }

  /** Removes the newest page from the stack and closes it; does nothing when the stack is empty. */
  async closeLatestPage() {
    const page = this.pages.pop();
    if (page != null) {
      await page.close();
    }
  }

  /**
   * Counts the elements on the current page matching the query, after applying `slice` (or `limit`).
   * @returns {Promise<number>} 0 when no page is open.
   */
  async count(query) {
    const { sel, limit, slice } = (0, _normalizeQuery.normalizeQuery)(query);
    const page = this.currentPage;
    const handles = (page == null ? undefined : await page.$$(sel)) || [];
    return (0, _sliceArray.sliceArray)(handles, slice || limit).length;
  }

  /**
   * Reads the `href` attribute of the element matched by `query`, resolves it against the
   * current document's URL, and navigates to it via `goTo`.
   *
   * @throws {Error} when no page is open or no `href` value is found.
   */
  async follow(query, navOptions = {}) {
    if (!this.currentPage) {
      throw new Error('No page is open');
    }
    const { sel } = query;
    const href = await this.getAttribute(_objectSpread(_objectSpread({}, query), {}, { attr: 'href' }));
    if (!href) {
      throw new Error(`No match for 'follow' query selector: '${sel}@href'`);
    }
    this.addToLog(`follow: ${sel}/@href="${href}"`);
    const documentHref = await this.currentPage.evaluate(() => document.location.href);
    return this.goTo((0, _resolveHref.resolveHref)(documentHref, href), navOptions);
  }

  /**
   * Dispatches a query to the matching getter:
   * selectors ending in an attribute or `text()` step go to the XPath-based text-value getters;
   * otherwise `attr` selects the attribute getters and its absence the text-content getters.
   * `scope` (default `'one'`) picks the single-result or the all-results variant.
   */
  async get(query) {
    const { attr, scope = 'one' } = query;
    const wantsOne = scope === 'one';

    if (attributeSelectorPattern.test(query.sel) || textSelectorPattern.test(query.sel)) {
      return wantsOne ? this.getTextValue(query) : this.getTextValueAll(query);
    }
    if (wantsOne) {
      return attr ? this.getAttribute(query) : this.getTextContent(query);
    }
    return attr ? this.getAttributeAll(query) : this.getTextContentAll(query);
  }

  /**
   * Describes the browser and the current page.
   * @returns {Promise<{browser: {version}, page: {url, title}}>} fields are `undefined` when the
   *   browser or page is missing.
   */
  async getContext() {
    const browser = this.browser;
    const page = this.currentPage;
    return {
      browser: {
        version: browser == null ? undefined : await browser.version()
      },
      page: {
        url: page == null ? undefined : await page.url(),
        title: page == null ? undefined : await page.title()
      }
    };
  }

  /** Returns the current page's content, or `null` when there is no page (or the content is falsy). */
  async getPageContent() {
    const page = this.currentPage;
    return (page == null ? undefined : await page.content()) || null;
  }

  /** Returns the data held by the scraper's state object. */
  getState() {
    return this.state.get();
  }

  /**
   * Reads attribute `query.attr` from the single element matched by `query`.
   * `'outerHTML'` is delegated to `getOuterHtml`. An optional `query.transform` receives the
   * value and the current context.
   *
   * @returns {Promise<*>} `null` when no element matches.
   * @throws {Error} when `attr` is empty.
   */
  async getAttribute(query) {
    const { attr, noTrim = false, transform } = query;
    if (!attr) {
      throw new Error('attr (attribute name) cannot be empty');
    }
    const elementRef = await this.select(query);
    if (!elementRef) {
      return null;
    }
    if (attr === 'outerHTML') {
      return this.getOuterHtml(query);
    }
    const attributeValue = await (0, _getAttribute.getAttribute)(elementRef, attr, { noTrim });
    return transform ? transform(attributeValue, await this.getContext()) : attributeValue;
  }

  /**
   * Reads attribute `query.attr` from every element matched by `query`.
   * `'outerHTML'` is delegated to `getOuterHtmlAll`.
   *
   * @throws {Error} when `attr` is empty.
   */
  async getAttributeAll(query) {
    const { attr, noTrim = false, transform } = query;
    if (!attr) {
      throw new Error('attr (attribute name) cannot be empty');
    }
    if (attr === 'outerHTML') {
      return this.getOuterHtmlAll(query);
    }
    const handles = await this.selectAll(query);
    const attributeValues = await Promise.all(
      handles.map(handle => (0, _getAttribute.getAttribute)(handle, attr, { noTrim }))
    );
    if (!transform) {
      return attributeValues;
    }
    return Promise.all(
      attributeValues.map(async attributeValue => transform(attributeValue, await this.getContext()))
    );
  }

  /** Returns the `outerHTML` of the first element matching `query.sel`, or `null` without a base reference. */
  async getOuterHtml(query) {
    const { baseRef = this.currentPage, sel, transform } = query;
    const outerHtml = (baseRef == null ? undefined : await baseRef.$eval(sel, element => element.outerHTML)) || null;
    return transform ? transform(outerHtml, await this.getContext()) : outerHtml;
  }

  /** Returns the `outerHTML` of all elements matching `query.sel`, after applying `slice` (or `limit`). */
  async getOuterHtmlAll(query) {
    const { baseRef = this.currentPage, limit, sel, slice, transform } = query;
    const outerHtmlContents = (baseRef == null
      ? undefined
      : await baseRef.$$eval(sel, elements => elements.map(element => element.outerHTML))) || [];
    const slicedContents = (0, _sliceArray.sliceArray)(outerHtmlContents, slice || limit);
    if (!transform) {
      return slicedContents;
    }
    return Promise.all(slicedContents.map(async content => transform(content, await this.getContext())));
  }

  /**
   * Returns the text content of the single element matched by `query`, trimmed unless
   * `query.noTrim` is set.
   * @returns {Promise<*>} `null` when no page is open or nothing matches.
   */
  async getTextContent(query) {
    const { noTrim = false, transform } = query;
    if (!this.currentPage) {
      return null;
    }
    const elementRef = await this.select(query);
    if (!elementRef) {
      return null;
    }
    const textContent = await elementRef.textContent();
    const trimmedContent = !textContent || noTrim ? textContent : textContent.trim();
    return transform ? transform(trimmedContent, await this.getContext()) : trimmedContent;
  }

  /** Returns the text content of every element matched by `query`, each trimmed unless `query.noTrim` is set. */
  async getTextContentAll(query) {
    const { noTrim, transform } = query;
    const handles = await this.selectAll(query);
    const textContents = await Promise.all(handles.map(handle => handle.textContent()));
    const trimmedContents = textContents.map(
      textContent => (!textContent || noTrim ? textContent : textContent.trim())
    );
    if (!transform) {
      return trimmedContents;
    }
    return Promise.all(trimmedContents.map(async content => transform(content, await this.getContext())));
  }

  /**
   * Evaluates `query.sel` as an XPath expression in the current page and returns its string value.
   *
   * @returns {Promise<*>} `null` when the result is not "definite" according to `isDefinite`.
   * @throws {Error} when no page is open.
   */
  async getTextValue(query) {
    const { noTrim = false, transform } = query;
    if (!this.currentPage) {
      throw new Error('No page is open');
    }
    // This callback runs inside the browser page, not in Node.
    const result = await this.currentPage.evaluate(
      selector => document.evaluate(selector, document, null, XPathResult.STRING_TYPE, null).stringValue,
      query.sel
    );
    if (!(0, _sugarbowl.isDefinite)(result)) {
      return null;
    }
    const trimmedValue = noTrim ? result : result.trim();
    return transform ? transform(trimmedValue, await this.getContext()) : trimmedValue;
  }

  /**
   * Evaluates `query.sel` as an XPath expression in the current page and returns the values of
   * all matching attribute nodes and text nodes, in document order.
   *
   * @throws {Error} when no page is open.
   */
  async getTextValueAll(query) {
    const { noTrim = false, transform } = query;
    if (!this.currentPage) {
      throw new Error('No page is open');
    }
    // This callback runs inside the browser page, not in Node.
    const textValues = await this.currentPage.evaluate(selector => {
      const xPathResult = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_ITERATOR_TYPE, null);
      if (!xPathResult) {
        return [];
      }
      const values = [];
      let node = xPathResult.iterateNext();
      while (node) {
        if (node instanceof Attr) {
          values.push(node.value);
        }
        if (node instanceof Text) {
          values.push(node.wholeText);
        }
        node = xPathResult.iterateNext();
      }
      return values;
    }, query.sel);

    const trimmedValues = noTrim ? textValues : textValues.map(_sugarbowl.trim);
    if (!transform) {
      return trimmedValues;
    }
    return Promise.all(trimmedValues.map(async value => transform(value, await this.getContext())));
  }

  /**
   * Navigates to `url`, retrying on failure.
   *
   * @param {string} url
   * @param {object} [options]
   *   - `newPage`: open a new page instead of reusing the current one.
   *   - `retryLimit`: retries after the first attempt (default: configured `retryLimit`).
   *   - `validate`: `(url) => ({ messages })`; any message causes the URL to be skipped.
   *   - `addUrlToState` / `state`: store the final URL in `state` (default: the scraper's state).
   *   - `waitUntil`: passed to the page's `goto` (default `'networkidle'`).
   *   - `verbose`: also report progress through `consoleIf`.
   * @returns {Promise<object>} the context from `getContext()`, or `{ page: {} }` when skipped.
   * @throws {Error} when no page can be obtained, or once the final attempt fails.
   */
  async goTo(url, options = {}) {
    const {
      addUrlToState,
      newPage,
      retryLimit = defaultRetryLimit,
      state = this.state,
      validate,
      verbose = false,
      waitUntil = defaultWaitUntil
    } = options;

    this.addToLog(`goTo: '${url}'`);

    if (validate) {
      const { messages = [] } = validate(url) || {};
      if (messages.length > 0) {
        this.addToLog(messages, { prefix: 'skipping: ' });
        return { page: {} };
      }
    }

    // Random pause between navigations; skipped before the very first page is opened.
    if (this.currentPage && this.variableDelayInMs) {
      await (0, _sugarbowl.sleep)((0, _sugarbowl.generateRandomInt)(0, this.variableDelayInMs));
    }

    const page = !newPage && this.currentPage || (await this.newPage());
    if (!page) {
      throw new Error('No page is open');
    }

    let done = false;
    let attemptNumber = 0;
    const maxAttempts = retryLimit + 1;

    while (!done && attemptNumber <= maxAttempts) {
      attemptNumber += 1;
      (0, _sugarbowl.consoleIf)(verbose)(
        [`Navigating to URL: ${url}`, ...(0, _sugarbowl.includeIf)(attemptNumber > 1, `(attempt ${attemptNumber})`)].join(' ')
      );
      try {
        await page.goto(url, { waitUntil });
        const context = await this.getContext();
        if (context.page.url === 'chrome-error://chromewebdata/') {
          throw new Error('Browser error');
        }
        // Mark success only after the error-page check, so that a browser error page is retried.
        done = true;
      } catch (error) {
        this.addToLog({
          code: error.code,
          message: error.message.includes('\n') ? error.message.split('\n') : error.message,
          operation: 'goTo',
          url,
          attemptNumber
        }, { prefix: 'warning: ' });
        // `>=` rather than `===`, so an unusual retryLimit can never end the loop silently.
        if (attemptNumber >= maxAttempts) {
          throw new Error(['Aborted after', (0, _sugarbowl.inflectQuantity)(attemptNumber, 'attempt')].join(' '));
        }
      }
    }

    const actualUrl = (await this.getContext()).page.url;
    if (addUrlToState) {
      state.update({ url: actualUrl });
    }
    if (actualUrl !== url) {
      (0, _sugarbowl.consoleIf)(verbose)(`Redirected to ${actualUrl}`);
      this.addToLog(`[redirected to '${actualUrl}']`);
    }
    return this.getContext();
  }

  /**
   * Tests whether the current page has a match for `query.sel`; with `query.nth`, whether it
   * has at least `nth` matches.
   */
  async has(query) {
    const { sel, nth } = query;
    if (nth) {
      return (await this.count(query)) >= nth;
    }
    const page = this.currentPage;
    return !!(page == null ? undefined : await page.$(sel));
  }

  /** True when a context exists and the browser reports that it is connected. */
  isConnected() {
    return !!(this.context && this.browser && this.browser.isConnected());
  }

  /**
   * Launches Chromium and creates a browser context using the stored browser options
   * (`delayInMs` is passed as `slowMo`).
   *
   * @returns {Promise<{version}>}
   * @throws {Error} when the browser has already been launched.
   */
  async launch() {
    if (this.browser) {
      throw new Error('The browser has already been launched');
    }
    const { delayInMs: slowMo, headless, userAgent } = this.browserOptions;
    this.browser = await _playwright.default.chromium.launch({ headless, slowMo });
    this.context = await this.browser.newContext({ userAgent });
    return { version: this.browser.version() };
  }

  /** Opens a new page (launching the browser first if there is no context) and makes it the current page. */
  async newPage() {
    if (!this.context) {
      await this.launch();
    }
    const page = await this.context.newPage();
    this.pages.push(page);
    return page;
  }

  /**
   * Runs every query in `queryDict` (sequentially, via `get`) and collects the results under
   * the same keys.
   *
   * @param {object} queryDict - normalized with `normalizeQueryDict` before use.
   * @param {object} [options] - `baseRef` is added to each query; `transform` post-processes the
   *   whole result; `updateState` stores the result in `state` at `statePath`.
   * @returns {Promise<object>} the (possibly transformed) results.
   */
  async query(queryDict, options = {}) {
    const { baseRef, state = this.state, statePath, transform, updateState } = options;

    this.addToLog(queryDict, { prefix: 'query' });

    const results = {};
    for (const [key, query] of Object.entries((0, _normalizeQueryDict.normalizeQueryDict)(queryDict))) {
      const result = await this.get(
        (0, _sugarbowl.omitUndefined)(_objectSpread(_objectSpread({}, query), {}, { baseRef }))
      );
      _defineProperty(results, key, result);
    }
    const queryResult = transform ? await transform(results, await this.getContext()) : results;

    this.addToLog(queryResult, { prefix: 'result' });

    if (updateState) {
      state.update(queryResult, { statePath });
    }
    return queryResult;
  }

  /**
   * Executes a list of commands in order. Supported actions: `click`, `follow`, `goTo`,
   * `query`, `runOnAll`, `set` and `write`.
   *
   * A failed `follow` or `goTo` is logged and stops the remaining commands without throwing.
   * Pages counted as opened by this run are closed and `runLevel` is restored even when a
   * command throws. The log is displayed (if `displayLog`) and written when the outermost run
   * completes normally.
   *
   * @param {Array|{commands: Array}} commandsOrBuilder - a command array or an object holding one.
   * @param {object} [options] - `baseRef`, `collectionName` (default `'records'`), `displayLog`,
   *   `nth`, `retryLimit` (default 2), `state` (default: the scraper's state), `statePath`.
   * @returns {Promise<*>} the data held by `state` after the run.
   * @throws {Error} on an unrecognized action, or when a command other than follow/goTo throws.
   */
  async run(commandsOrBuilder, options = {}) {
    const {
      collectionName = 'records',
      displayLog,
      nth,
      retryLimit = 2,
      state = this.state,
      statePath
    } = options;
    const commands = commandsOrBuilder instanceof Array ? commandsOrBuilder : commandsOrBuilder.commands;
    let { baseRef } = options;
    let pagesToCloseCount = 0;
    let stopRun = false;

    // Shared error message shaping for failed navigation commands.
    const splitMessage = error => (error.message.includes('\n') ? error.message.split('\n') : error.message);

    this.runLevel += 1;
    try {
      if (nth) {
        this.addToLog(`- run ${nth}`);
      }

      for (const command of commands) {
        if (stopRun) {
          break;
        }
        switch (command.action) {
          case 'click':
            await this.click(command.query, command.options);
            break;

          case 'follow': {
            const { query, options: navOptions = {} } = command;
            let pageContext;
            try {
              pageContext = await this.follow(
                _objectSpread({ baseRef }, query),
                _objectSpread({ addUrlToState: true, newPage: true, retryLimit, state }, navOptions)
              );
              if (!pageContext.page.url) {
                stopRun = true;
              }
            } catch (error) {
              this.addToLog({
                operation: 'follow',
                query,
                code: error.code,
                message: splitMessage(error),
                retryLimit
              }, { prefix: 'warning: ' });
              stopRun = true;
              pageContext = await this.getContext();
            }
            if (pageContext.page.url) {
              pagesToCloseCount += 1;
              // Later commands address the newly opened page rather than the old element.
              baseRef = undefined;
            }
            break;
          }

          case 'goTo': {
            const { url, options: navOptions = {} } = command;
            const mergedNavOptions = _objectSpread({ addUrlToState: true, newPage: true, retryLimit }, navOptions);
            let pageContext;
            try {
              pageContext = await this.goTo(url, mergedNavOptions);
              if (!pageContext.page.url) {
                stopRun = true;
              }
            } catch (error) {
              this.addToLog({
                operation: 'goTo',
                url,
                code: error.code,
                message: splitMessage(error),
                retryLimit
              }, { prefix: 'warning: ' });
              stopRun = true;
              pageContext = await this.getContext();
            }
            if (pageContext.page != null && pageContext.page.url) {
              pagesToCloseCount += 1;
              baseRef = undefined;
            }
            break;
          }

          case 'query': {
            const { queryDict, options: queryOptions = {} } = command;
            await this.query(
              queryDict,
              (0, _sugarbowl.omitUndefined)(_objectSpread({ baseRef, state, statePath, updateState: true }, queryOptions))
            );
            break;
          }

          case 'runOnAll': {
            const { query, commands: subCommands, options: runOnAllOptions } = command;
            await this.runOnAll(
              query,
              subCommands,
              (0, _sugarbowl.omitUndefined)(_objectSpread({ baseRef, collectionName, state }, runOnAllOptions))
            );
            break;
          }

          case 'set':
            await state.update(command.state, { statePath });
            break;

          case 'write':
            await this.write(command.collectionName || collectionName, { state });
            break;

          default: {
            const { action } = command;
            throw new Error(`Unrecognized action: ${action}`);
          }
        }
      }
    } finally {
      // Always undo this run's bookkeeping; otherwise a thrown command would leave pages open
      // and `runLevel` stuck above zero, and no later run would ever write the log.
      while (pagesToCloseCount > 0) {
        await this.closeLatestPage();
        pagesToCloseCount -= 1;
      }
      this.runLevel -= 1;
    }

    if (this.runLevel === 0) {
      if (displayLog) {
        this.logger.display();
      }
      this.logger.write();
    }
    return state.get();
  }

  /**
   * Runs `commands` once for each element matched by `query`, each time with that element as
   * `baseRef` and with a clone of `state`; the resulting records are stored in `state` under
   * `collectionName`.
   *
   * @returns {Promise<Array>} one record per matched element.
   */
  async runOnAll(query, commands, options = {}) {
    const { baseRef, collectionName = 'records', state = this.state } = options;
    const baseRefs = await this.selectAll(_objectSpread(_objectSpread({}, query), {}, { baseRef }));

    this.addToLog([
      'runOnAll:',
      `selector: '${query.sel}',`,
      `${(0, _pluralize.pluralize)(baseRefs.length, 'match', 'matches')}`,
      ...(0, _sugarbowl.includeIf)(query.limit, `(max: ${query.limit})`),
      ...(0, _sugarbowl.includeIf)(query.slice, `(slice: ${query.slice})`)
    ].join(' '));

    const records = [];
    for (const [index, elementRef] of baseRefs.entries()) {
      const recordState = state.clone();
      await this.run(commands, { baseRef: elementRef, collectionName, nth: index + 1, state: recordState });
      records.push(recordState.get());
    }
    state.update({ [collectionName]: records });
    return records;
  }

  /**
   * Resolves a query to a single element reference.
   * Without a selector, `query.baseRef` itself is returned. Otherwise the `nth` (1-based,
   * default 1) match of `query.sel` under `baseRef` (default: the current page) is returned.
   *
   * @returns {Promise<*>} `null` when there is no base reference, `nth` is not a positive
   *   integer, or there are fewer than `nth` matches.
   * @throws {Error} when `nth > 1` is combined with a `baseRef` but no selector.
   */
  async select(query) {
    if (!query.sel && query.baseRef) {
      if (query.nth && query.nth > 1) {
        throw new Error('nth cannot be set when no selector is given');
      }
      return query.baseRef;
    }

    const { baseRef = this.currentPage, sel, nth = 1 } = query;
    if (!baseRef || nth < 1 || !(0, _sugarbowl.isInteger)(nth)) {
      return null;
    }
    if (nth === 1) {
      return baseRef.$(sel);
    }
    const elementRefs = (await baseRef.$$(sel)) || [];
    if (elementRefs.length < nth) {
      return null;
    }
    return elementRefs[nth - 1];
  }

  /** Returns all element references matching `query.sel` under `baseRef`, after applying `slice` (or `limit`). */
  async selectAll(query) {
    const { baseRef = this.currentPage, sel, limit, slice } = query;
    const refs = (baseRef == null ? undefined : await baseRef.$$(sel)) || [];
    return (0, _sliceArray.sliceArray)(refs, slice || limit);
  }

  /** Updates `options.state` (default: the scraper's state) with `data` at `options.statePath`. */
  updateState(data, options = {}) {
    const { state = this.state, statePath } = options;
    return state.update(data, { statePath });
  }

  /**
   * Writes the state's data, stamped with `retrievedAt` (ISO timestamp), to the database.
   *
   * @throws {Error} when no database was supplied, `collectionName` is empty, or the state
   *   holds no data.
   */
  async write(collectionName, options = {}) {
    const { state = this.state } = options;
    if (!this.database) {
      throw new Error('No database has been declared');
    }
    if (!collectionName) {
      throw new Error('`collectionName` cannot be empty');
    }
    const stateData = state.get();
    // Check the retrieved data, not the state container (which is necessarily set by now).
    if (!stateData) {
      throw new Error('There is no data to write');
    }
    return this.database.update(
      collectionName,
      _objectSpread({ retrievedAt: new Date().toISOString() }, stateData)
    );
  }
}

exports.PlaywrightScraper = PlaywrightScraper;