/*
 * stepwright
 * Version:
 * A powerful web scraping library built with Playwright
 * 526 lines • 24.7 kB
 * JavaScript
 */
;
// Interop helper: reuse an existing `__importDefault` found on `this`, otherwise
// define one that hands back modules flagged `__esModule` untouched and wraps
// anything else as `{ default: value }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.executeStepInContext = executeStepInContext;
exports.executeStep = executeStep;
exports.executeStepList = executeStepList;
const scraper_1 = require("./scraper");
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
const utils_1 = require("./utils");
// Import global types
require("./global-types");
/**
 * Execute a step in the context of a specific element.
 *
 * Only the `data` action is resolved relative to `contextElement`; every other
 * action is delegated unchanged to `executeStep` and runs against `page`.
 *
 * For `data`, the extracted value is stored in `collector` under
 * `step.key || step.id || 'data'`. When the element is missing or extraction
 * throws, an empty string is stored instead and no error is raised.
 *
 * @param {object} page - The page object (used for delegation and the post-step wait).
 * @param {object} contextElement - The context element; its `locator()` scopes the lookup.
 * @param {object} step - The step object.
 * @param {object} collector - The collector object that receives extracted data.
 *
 * @since v1.0.0
 * @author Muhammad Umer Farooq <umer@lablnet.com>
 *
 * @returns {Promise<void>} - Nothing.
 * @company Framework Island
 */
async function executeStepInContext(page, contextElement, step, collector) {
    console.log(`➡️ Step \`${step.id}\` (${step.action}) in context`);
    if (step.action !== 'data') {
        // For other actions, fall back to regular executeStep. It already applies
        // the post-step `step.wait`, so return here to avoid waiting twice.
        await executeStep(page, step, collector);
        return;
    }
    const key = step.key || step.id || 'data';
    try {
        // Build the selector used to find the element within the context.
        const target = step.object ?? '';
        let selector;
        switch (step.object_type) {
            case 'class':
                selector = `.${target}`;
                break;
            case 'id':
                selector = `#${target}`;
                break;
            case 'xpath':
                selector = `xpath=${target}`;
                break;
            default:
                // 'tag' and any unknown type use the object verbatim.
                selector = target;
        }
        const targetElement = contextElement.locator(selector);
        const count = await targetElement.count();
        if (count === 0) {
            console.log(` ⚠️ Element not found in context: ${step.object} - skipping data extraction`);
            collector[key] = '';
            return;
        }
        // Get data from the first matching element.
        const firstMatch = targetElement.first();
        let value;
        switch (step.data_type) {
            case 'text':
                value = (await firstMatch.textContent()) ?? '';
                break;
            case 'html':
                value = await firstMatch.innerHTML();
                break;
            case 'value':
                // NOTE(review): 'value' reads the `href` attribute here; confirm this is
                // intended and matches what scraper.getData returns for 'value'.
                value = (await firstMatch.getAttribute('href')) ?? '';
                break;
            default:
                value = await firstMatch.innerText();
        }
        collector[key] = value;
        console.log(`Step Data: ${key}: ${value}`);
    }
    catch (err) {
        console.log(` ⚠️ Data extraction failed in context for ${step.object}: ${err.message}`);
        collector[key] = '';
    }
    if (step.wait && step.wait > 0) {
        await page.waitForTimeout(step.wait);
    }
}
/**
 * Execute a step.
 *
 * Dispatches on `step.action`:
 *  - `navigate`, `input`, `click`, `data`: thin wrappers around the scraper helpers.
 *  - `download`, `savePDF`, `printToPDF`: write a file to `step.value` and record the
 *    saved path (or `null` on failure) in `collector`.
 *  - `foreach`: runs `step.subSteps` once per matching element, storing each
 *    iteration's data under `collector['item_<idx>']`.
 *  - `open`: opens the link in a child tab, runs `step.subSteps` there, merges the
 *    collected data back and closes the tab.
 *  - `scroll`: scrolls vertically by `step.value` pixels (viewport height by default).
 * Unknown actions are ignored. Most element-level failures are logged rather than thrown.
 *
 * @param {object} page - The page object.
 * @param {object} step - The step object.
 * @param {object} collector - The collector object that receives extracted data.
 *
 * @since v1.0.0
 * @author Muhammad Umer Farooq <umer@lablnet.com>
 *
 * @returns {Promise<void>} - Nothing.
 * @throws {Error} When a required field (`value`, `object`, `subSteps`) is missing, or
 * when a failing step / sub-step has `terminateonerror` set.
 * @company Framework Island
 */
async function executeStep(page, step, collector) {
    console.log(`➡️ Step \`${step.id}\` (${step.action})`);
    switch (step.action) {
        case 'navigate': {
            await (0, scraper_1.navigate)(page, step.value ?? '');
            break;
        }
        case 'input': {
            await (0, scraper_1.input)(page, step.object_type, step.object ?? '', step.value ?? '', step.wait ?? 0);
            break;
        }
        case 'click': {
            try {
                // Check if element exists first
                const locator = (0, utils_1.locatorFor)(page, step.object_type, step.object ?? '');
                const count = await locator.count();
                if (count === 0) {
                    console.log(` ⚠️ Element not found: ${step.object} - skipping click action`);
                    return;
                }
                await (0, scraper_1.click)(page, step.object_type, step.object ?? '');
            }
            catch (err) {
                console.log(` ⚠️ Click failed for ${step.object}: ${err.message}`);
                // Don't throw error, just continue
            }
            break;
        }
        case 'data': {
            const key = step.key || step.id || 'data';
            try {
                // Check if element exists first
                const locator = (0, utils_1.locatorFor)(page, step.object_type, step.object ?? '');
                const count = await locator.count();
                if (count === 0) {
                    console.log(` ⚠️ Element not found: ${step.object} - skipping data extraction`);
                    collector[key] = ''; // Set to empty string when element not found
                    return;
                }
                const value = await (0, scraper_1.getData)(page, step.object_type, step.object ?? '', step.data_type ?? 'default', step.wait ?? 0);
                collector[key] = value;
                console.log(`Step Data: ${key}: ${value}`);
            }
            catch (err) {
                console.log(` ⚠️ Data extraction failed for ${step.object}: ${err.message}`);
                collector[key] = ''; // Set to empty string when data can't be extracted
                // Don't throw error, just continue
            }
            break;
        }
        case 'download': {
            // Save downloaded file to the path provided in step.value
            if (!step.value) {
                throw new Error(`download step ${step.id} requires 'value' as target filepath`);
            }
            // Determine the key under which to store the downloaded file path
            const collectorKey = step.key || step.id || 'file';
            let savedPath = null;
            try {
                const targetLocator = await (0, scraper_1.elem)(page, step.object_type, step.object ?? '');
                if (targetLocator) {
                    const isVisible = await targetLocator.isVisible().catch(() => false);
                    if (isVisible) {
                        // Await the listener and the click together: if the click rejects,
                        // the pending download wait is still observed instead of surfacing
                        // later as an unhandled rejection.
                        const [dl] = await Promise.all([
                            page.waitForEvent('download', { timeout: 10000 }),
                            targetLocator.click(),
                        ]);
                        const savePath = step.value;
                        const dir = path_1.default.dirname(savePath);
                        if (!fs_1.default.existsSync(dir)) {
                            fs_1.default.mkdirSync(dir, { recursive: true });
                        }
                        await dl.saveAs(savePath);
                        savedPath = savePath;
                        console.log(` 📥 Saved to ${savePath}`);
                    }
                    else {
                        console.log(` 📥 Element not visible or not found: ${step.object}`);
                    }
                }
                else {
                    console.log(` 📥 Element not found: ${step.object}`);
                }
            }
            catch (err) {
                console.log(` 📥 Download failed for ${step.object}: ${err.message}`);
                // Don't throw error, just continue
            }
            finally {
                // Record the file path (or null if not downloaded) in the collector
                collector[collectorKey] = savedPath;
            }
            break;
        }
        case 'foreach': {
            if (!step.object)
                throw new Error('foreach step requires object as locator');
            if (!step.subSteps || step.subSteps.length === 0) {
                throw new Error('foreach step requires subSteps');
            }
            const locatorAll = (0, utils_1.locatorFor)(page, step.object_type, step.object);
            // A timeout of 0 disables the timeout in Playwright, so only a positive
            // step.wait overrides the 5s default.
            const attachTimeout = step.wait && step.wait > 0 ? step.wait : 5000;
            try {
                await locatorAll.first().waitFor({ state: 'attached', timeout: attachTimeout });
            }
            catch {
                // Best effort: if nothing attaches in time, the count below is 0 and
                // the loop simply does not run.
            }
            const count = await locatorAll.count();
            console.log(` 🔁 foreach found ${count} items for selector ${step.object}`);
            for (let idx = 0; idx < count; idx++) {
                const current = locatorAll.nth(idx);
                await current.scrollIntoViewIfNeeded();
                // Create a separate collector for each iteration
                const itemCollector = {};
                // For each subStep clone and replace placeholders
                for (const s of step.subSteps) {
                    const cloned = (0, utils_1.cloneStepWithIndex)(s, idx);
                    try {
                        // Execute step in the context of the current item
                        await executeStepInContext(page, current, cloned, itemCollector);
                    }
                    catch (err) {
                        console.log(`⚠️ sub-step '${cloned.id}' failed: ${err.message}`);
                        if (cloned.terminateonerror)
                            throw err;
                    }
                }
                // Store the item collector with a unique key for this iteration
                collector[`item_${idx}`] = itemCollector;
                // If we have collected data for this item, emit it immediately for streaming
                if (Object.keys(itemCollector).length > 0) {
                    console.log(` 📋 Collected data for item ${idx}:`, Object.keys(itemCollector));
                    // The streaming callback is read from a global because this function
                    // has no direct access to the parent context's onResult handler.
                    if (global.onResultCallback) {
                        try {
                            await global.onResultCallback(itemCollector, idx);
                        }
                        catch (err) {
                            console.log(` ⚠️ Callback failed for item ${idx}: ${err}`);
                        }
                    }
                }
            }
            break;
        }
        case 'open': {
            if (!step.object)
                throw new Error('open step requires object locator');
            if (!step.subSteps || step.subSteps.length === 0)
                throw new Error('open step needs subSteps');
            console.log(` 🔗 Opening link/tab from selector ${step.object}`);
            let newPage = null;
            try {
                // locate link and check if it exists
                const linkLoc = (0, utils_1.locatorFor)(page, step.object_type, step.object);
                const count = await linkLoc.count();
                if (count === 0) {
                    console.log(` ⚠️ Element not found: ${step.object} - skipping open action`);
                    return;
                }
                let href = await linkLoc.getAttribute('href');
                const context = page.context();
                if (href) {
                    // absolute or relative URL
                    if (!href.startsWith('http')) {
                        href = new URL(href, page.url()).toString();
                    }
                    newPage = await context.newPage();
                    await newPage.goto(href, { waitUntil: 'networkidle' });
                }
                else {
                    // fallback: click with modifier to open new tab. The page listener and
                    // the click are awaited together so a failed click cannot leave the
                    // listener's rejection unhandled.
                    const [popup] = await Promise.all([
                        context.waitForEvent('page'),
                        linkLoc.click({ modifiers: ['Meta'] }).catch(() => linkLoc.click()),
                    ]);
                    newPage = popup;
                    await newPage.waitForLoadState('networkidle');
                }
                // Sub-steps start from a copy of the parent collector so they can read
                // values gathered before the tab was opened.
                const innerCollected = { ...collector };
                for (const s of step.subSteps) {
                    const cloned = { ...s };
                    try {
                        await executeStep(newPage, cloned, innerCollected);
                    }
                    catch (err) {
                        console.log(` ⚠️ Sub-step in open failed: ${err.message}`);
                        if (cloned.terminateonerror)
                            throw err;
                    }
                }
                // merge into collector
                Object.assign(collector, innerCollected);
            }
            catch (err) {
                console.log(` ⚠️ Open action failed for ${step.object}: ${err.message}`);
                if (step.terminateonerror)
                    throw err;
            }
            finally {
                // Always release the child tab, including when navigation or a
                // terminating sub-step failed.
                if (newPage) {
                    try {
                        await newPage.close();
                        console.log(' 🔙 Closed child tab');
                    }
                    catch (closeErr) {
                        console.log(` ⚠️ Failed to close child tab: ${closeErr.message}`);
                    }
                }
            }
            break;
        }
        case 'scroll': {
            // Scroll the page by the given offset; fall back to one viewport height when
            // no value is given or it does not parse as a number.
            const requested = step.value ? Number.parseInt(step.value, 10) : Number.NaN;
            const offset = Number.isNaN(requested)
                ? await page.evaluate(() => window.innerHeight)
                : requested;
            await page.evaluate((y) => window.scrollBy(0, y), offset);
            break;
        }
        case 'savePDF': {
            // Save PDF content from current page to file
            if (!step.value) {
                throw new Error(`savePDF step ${step.id} requires 'value' as target filepath`);
            }
            const collectorKey = step.key || step.id || 'file';
            let savedPath = null;
            try {
                console.log(` 📄 Waiting for PDF content to load...`);
                // Strategy 1: Wait for DOM content loaded first (default timeout 10 minutes).
                // A non-positive step.wait falls back to the default because 0 would
                // disable the timeout entirely.
                const loadTimeout = step.wait && step.wait > 0 ? step.wait : 600000;
                try {
                    await page.waitForLoadState('domcontentloaded', { timeout: loadTimeout });
                    console.log(` 📄 DOM content loaded`);
                }
                catch (domErr) {
                    console.log(` 📄 DOM content timeout, continuing anyway`);
                }
                // Strategy 2: Poll the page for signs that content has rendered
                let pdfReady = false;
                const maxAttempts = 15;
                let attempts = 0;
                while (!pdfReady && attempts < maxAttempts) {
                    attempts++;
                    console.log(` 📄 Checking PDF readiness (attempt ${attempts}/${maxAttempts})`);
                    try {
                        // Check if page has PDF content indicators
                        const hasPdfContent = await page.evaluate(() => {
                            // Check for PDF viewer elements
                            const pdfViewer = document.querySelector('embed[type="application/pdf"]') ||
                                document.querySelector('object[type="application/pdf"]') ||
                                document.querySelector('iframe[src*=".pdf"]') ||
                                document.querySelector('.pdf-viewer') ||
                                document.querySelector('[data-pdf]');
                            // Check if page content is substantial (not just loading screen)
                            const bodyText = document.body ? document.body.innerText : '';
                            const hasSubstantialContent = bodyText.length > 200;
                            // Check if page is visible
                            const isVisible = document.body &&
                                document.body.style.display !== 'none' &&
                                document.body.style.visibility !== 'hidden';
                            // Check for PDF-specific content
                            const hasPdfText = bodyText.includes('PDF') ||
                                bodyText.includes('Page') ||
                                bodyText.includes('Agenda') ||
                                bodyText.includes('Meeting');
                            return {
                                hasPdfViewer: !!pdfViewer,
                                hasSubstantialContent,
                                isVisible,
                                bodyTextLength: bodyText.length,
                                hasPdfText
                            };
                        });
                        console.log(` 📄 PDF check:`, hasPdfContent);
                        // Only consider ready if we have substantial content OR PDF text;
                        // the viewer/visibility flags are reported in the log only.
                        if (hasPdfContent.hasSubstantialContent || hasPdfContent.hasPdfText) {
                            pdfReady = true;
                            console.log(` 📄 PDF content appears ready (substantial content or PDF text found)`);
                            break;
                        }
                        else {
                            console.log(` 📄 PDF not ready yet - content: ${hasPdfContent.hasSubstantialContent}, text length: ${hasPdfContent.bodyTextLength}, hasPdfText: ${hasPdfContent.hasPdfText}`);
                        }
                        // Wait a bit before next check
                        await page.waitForTimeout(2000);
                    }
                    catch (checkErr) {
                        console.log(` 📄 PDF check failed: ${checkErr.message}`);
                        await page.waitForTimeout(2000);
                    }
                }
                // Strategy 3: Additional wait for any dynamic content
                if (step.wait && step.wait > 0) {
                    console.log(` 📄 Additional wait: ${step.wait}ms`);
                    await page.waitForTimeout(step.wait);
                }
                console.log(` 📄 Capturing PDF...`);
                // Get the PDF content as buffer
                const pdfBuffer = await page.pdf({ format: 'A4' });
                // Resolve placeholders in the target path, then ensure directory exists
                const savePath = (0, utils_1.replaceDataPlaceholders)(step.value, collector) || step.value || '';
                const dir = path_1.default.dirname(savePath);
                if (!fs_1.default.existsSync(dir)) {
                    fs_1.default.mkdirSync(dir, { recursive: true });
                }
                // Save the PDF
                fs_1.default.writeFileSync(savePath, pdfBuffer);
                savedPath = savePath;
                console.log(` 📄 PDF saved to ${savePath}`);
            }
            catch (err) {
                console.log(` 📄 PDF save failed: ${err.message}`);
                // Don't throw error, just continue
            }
            finally {
                // Record the file path (or null if not saved) in the collector
                collector[collectorKey] = savedPath;
            }
            break;
        }
        case 'printToPDF': {
            // Click button to open print dialog and save as PDF
            if (!step.value) {
                throw new Error(`printToPDF step ${step.id} requires 'value' as target filepath`);
            }
            const collectorKey = step.key || step.id || 'file';
            let savedPath = null;
            try {
                // Check if element exists first
                const locator = (0, utils_1.locatorFor)(page, step.object_type, step.object ?? '');
                const count = await locator.count();
                if (count === 0) {
                    console.log(` ⚠️ Element not found: ${step.object} - skipping printToPDF action`);
                    return;
                }
                console.log(` 🖨️ Attempting to print PDF from element: ${step.object}`);
                // Set up download listener first; a timeout resolves to null instead of rejecting
                const downloadPromise = page.waitForEvent('download', { timeout: 10000 }).catch(() => null);
                // Click the button that opens print dialog
                await locator.click();
                console.log(` 🖨️ Clicked print button`);
                // Wait a moment for print dialog to appear
                await page.waitForTimeout(2000);
                // Try multiple approaches to handle print dialog
                let download = null;
                try {
                    // Approach 1: Try keyboard shortcuts
                    console.log(` 🖨️ Trying keyboard shortcuts (Ctrl+P)`);
                    await page.keyboard.press('Control+P');
                    await page.waitForTimeout(2000);
                    await page.keyboard.press('Enter');
                    download = await downloadPromise;
                }
                catch (keyboardErr) {
                    console.log(` 🖨️ Keyboard shortcuts failed: ${keyboardErr.message}`);
                    // Approach 2: Try clicking the print button again if it's still there
                    try {
                        console.log(` 🖨️ Trying direct print button click`);
                        // Start listening before the click so a download that begins
                        // right away is not missed; the window covers the same 8s total.
                        const retryPromise = page.waitForEvent('download', { timeout: 8000 }).catch(() => null);
                        await locator.click();
                        download = await retryPromise;
                    }
                    catch (clickErr) {
                        console.log(` 🖨️ Direct click also failed: ${clickErr.message}`);
                    }
                }
                if (download) {
                    // Ensure directory exists
                    const savePath = step.value;
                    const dir = path_1.default.dirname(savePath);
                    if (!fs_1.default.existsSync(dir)) {
                        fs_1.default.mkdirSync(dir, { recursive: true });
                    }
                    // Save the downloaded file
                    await download.saveAs(savePath);
                    savedPath = savePath;
                    console.log(` 🖨️ Print PDF saved to ${savePath}`);
                }
                else {
                    console.log(` 🖨️ No download event detected - print dialog may not have worked`);
                }
            }
            catch (err) {
                console.log(` 🖨️ PrintToPDF failed: ${err.message}`);
                // Don't throw error, just continue
            }
            finally {
                // Record the file path (or null if not saved) in the collector
                collector[collectorKey] = savedPath;
            }
            break;
        }
        default:
            // Unhandled action – ignore to be future-proof
            break;
    }
    if (step.wait && step.wait > 0) {
        await page.waitForTimeout(step.wait);
    }
}
/**
 * Execute a step list.
 *
 * Steps run sequentially against the same page and collector. A failing step
 * aborts the run only when it has `terminateonerror` set; otherwise the error
 * is logged and execution continues with the next step.
 *
 * @param {object} page - The page object.
 * @param {object[]} steps - The steps to execute, in order.
 * @param {object} collected - The collected object shared by all steps.
 *
 * @since v1.0.0
 * @author Muhammad Umer Farooq <umer@lablnet.com>
 *
 * @returns {Promise<void>} - Nothing.
 * @throws {*} Re-throws whatever a step with `terminateonerror` threw.
 * @company Framework Island
 */
async function executeStepList(page, steps, collected) {
    console.log(`📝 Executing ${steps.length} step(s)`);
    for (const step of steps) {
        try {
            await executeStep(page, step, collected);
        }
        catch (err) {
            if (step.terminateonerror) {
                throw err;
            }
            // Leave a trace of the failure instead of dropping it silently.
            console.log(` ⚠️ Step '${step.id}' failed: ${err?.message ?? err} - continuing`);
        }
    }
}
//# sourceMappingURL=step-executor.js.map