UNPKG

donobu

Version:

Create browser automations with an LLM agent and replay them as Playwright scripts.

297 lines 12.7 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.DetectBrokenLinksTool = exports.DetectBrokenLinksGptSchema = exports.DetectBrokenLinksCoreSchema = void 0; const v4_1 = require("zod/v4"); const ToolSchema_1 = require("../models/ToolSchema"); const MiscUtils_1 = require("../utils/MiscUtils"); const PlaywrightUtils_1 = require("../utils/PlaywrightUtils"); const TargetUtils_1 = require("../utils/TargetUtils"); const Tool_1 = require("./Tool"); exports.DetectBrokenLinksCoreSchema = v4_1.z.object({ captureScreenshots: v4_1.z .boolean() .optional() .describe('Whether to capture screenshots for dead links for visual verification.'), }); exports.DetectBrokenLinksGptSchema = v4_1.z.object({ ...ToolSchema_1.BaseGptArgsSchema.shape, ...exports.DetectBrokenLinksCoreSchema.shape, }); class DetectBrokenLinksTool extends Tool_1.Tool { constructor() { super(DetectBrokenLinksTool.NAME, 'Detect broken links in the current page and return a report of them.', exports.DetectBrokenLinksCoreSchema, exports.DetectBrokenLinksGptSchema, false, undefined, ['web']); } async call(context, parameters) { const page = (0, TargetUtils_1.webPage)(context); // Get all links along with their texts. const links = await DetectBrokenLinksTool.getLinksForPage(page); const linksData = await DetectBrokenLinksTool.traverseLinks(page, links); // Optionally capture screenshots for dead links. if (parameters.captureScreenshots) { await Promise.all(Array.from(linksData.entries()).map(async ([originalUrl, result]) => { // Only try to capture screenshot for non-skipped (i.e. HTTP) links if (result.isDead && !result.skipped) { let newPage; try { const finalUrl = result.redirectChain[result.redirectChain.length - 1]; newPage = await page.context().newPage(); await newPage.goto(finalUrl, { timeout: 10000 }); const screenshotBuffer = await PlaywrightUtils_1.PlaywrightUtils.takeViewportScreenshot(newPage); const imageType = MiscUtils_1.MiscUtils.detectImageType(screenshotBuffer); // Generate a filename (encoded original URL plus timestamp) const screenshotFilename = `dead-link-screenshot-${encodeURIComponent(originalUrl)}-${Date.now()}.${imageType}`; await context.persistence.setFlowFile(context.metadata.id, screenshotFilename, screenshotBuffer); result.screenshotFilename = screenshotFilename; } catch (_error) { // Log error or ignore if screenshot capture fails. } finally { if (newPage) { await newPage.close().catch(() => { }); } } } })); } const linksReport = await DetectBrokenLinksTool.generateJsonReport(page, linksData); const linksReportAsJson = JSON.stringify(linksReport, null, 2); return { isSuccessful: true, forLlm: linksReportAsJson, metadata: linksReport, }; } async callFromGpt(context, parameters) { return this.call(context, parameters); } /** * Returns an array of LinkInfo objects containing a URL and an array of all associated link texts. * Filters out invalid URLs. */ static async getLinksForPage(page) { // Get all <a> tag links. const aTagLinks = await page.evaluate(() => { return Array.from(document.querySelectorAll('a')).map((a) => ({ url: a.href, text: (a.textContent || '').trim(), })); }); // Get links from onclick attributes. const onClickLinks = await page.evaluate(() => { return Array.from(document.querySelectorAll('[onclick]')) .map((el) => { const onclick = el.getAttribute('onclick') || ''; const match = onclick.match(/window\.location\.href\s*=\s*['"]([^'"]+)['"]/); return match ? { url: match[1], text: (el.textContent || '').trim() } : null; }) .filter((link) => link !== null); }); // Get links from data-href attributes. const dataHrefLinks = await page.evaluate(() => { return Array.from(document.querySelectorAll('[data-href]')).map((el) => ({ url: el.getAttribute('data-href') || '', text: (el.textContent || '').trim(), })); }); // Combine and deduplicate links by URL, accumulating all texts. const linksMap = new Map(); const allLinks = [...aTagLinks, ...onClickLinks, ...dataHrefLinks]; for (const link of allLinks) { try { const _parsed = new URL(link.url); if (!linksMap.has(link.url)) { linksMap.set(link.url, { url: link.url, texts: [link.text] }); } else { const entry = linksMap.get(link.url); if (link.text && !entry.texts.includes(link.text)) { entry.texts.push(link.text); } } } catch { // Ignore invalid URLs. // TODO: add better logic for handling different URL types like file://, etc. } } return Array.from(linksMap.values()); } /** * Checks all links concurrently and returns a map from the original URL to its traversal result. * If a link is a mailto, it is marked as skipped. */ static async traverseLinks(page, links) { const results = new Map(); const maxRedirects = 5; await Promise.all(links.map(async (link) => { // If it's a mailto link, record it as skipped. if (link.url.startsWith('mailto:')) { results.set(link.url, { redirectChain: [link.url], responseBody: 'Mailto link: not traversed', responseCode: 0, isDead: false, skipped: true, linkTexts: link.texts, }); return; } try { const result = await DetectBrokenLinksTool.traverseSingleLink(page, link.url, maxRedirects); results.set(link.url, { ...result, linkTexts: link.texts }); } catch (error) { results.set(link.url, { redirectChain: [link.url], responseBody: error instanceof Error ? error.toString() : String(error), responseCode: 0, isDead: true, linkTexts: link.texts, }); } })); return results; } /** * Traverses a single link, following redirects up to maxRedirects. * Returns a LinkTraversalResult that includes the full redirect chain. */ static async traverseSingleLink(page, link, maxRedirects) { let currentUrl = link; const redirectChain = [link]; let redirectCount = 0; while (redirectCount < maxRedirects) { const response = await page.context().request.get(currentUrl, { maxRedirects: 0, timeout: 10000, failOnStatusCode: false, }); const responseCode = response.status(); // Handle redirects. if (responseCode >= 300 && responseCode < 400) { const location = response.headers()['location']; if (!location) { // No location header — cannot follow redirect. return { redirectChain, responseBody: '', responseCode, isDead: true, }; } try { // Resolve relative URLs using currentUrl as base. currentUrl = new URL(location, currentUrl).toString(); redirectChain.push(currentUrl); redirectCount++; continue; } catch { return { redirectChain, responseBody: '', responseCode, isDead: true, }; } } else { // Instead of using the raw response body, load the page in a temporary context // and extract the visible (rendered) text only. let visibleText = ''; let tempPage; try { tempPage = await page.context().newPage(); await tempPage.goto(currentUrl, { timeout: 10000 }); visibleText = await tempPage.locator('body').innerText({ timeout: 5000, }); } catch (_error) { // In case of any error during page load, visibleText remains empty. } finally { if (tempPage) { await tempPage.close().catch(() => { }); } } const lowerVisibleText = visibleText.toLowerCase(); // Define the textual indicators which suggest a broken (dead) page. const deadLinkIndicators = [ 'page not found', '404 error', 'content not available', ]; // A link is marked as dead if the HTTP status is 404, 410, any 5xx, // or if any error-indicating phrase is present in the visible text. const isDead = responseCode === 404 || responseCode === 410 || responseCode >= 500 || deadLinkIndicators.some((indicator) => lowerVisibleText.includes(indicator)); return { redirectChain, responseBody: visibleText, // now contains only the visible text responseCode, isDead, }; } } // If max redirects reached without a final response, mark as dead. return { redirectChain, responseBody: '', responseCode: 0, isDead: true, }; } /** * Generates a JSON report of the link scan. * The report now includes a "linkTexts" field for each link. */ static async generateJsonReport(page, linksData) { const currentPageUrl = page.url(); const deadLinks = []; const workingLinks = []; linksData.forEach((result, originalUrl) => { const finalUrl = result.redirectChain[result.redirectChain.length - 1]; const linkData = { originalUrl, linkTexts: result.linkTexts, // All accumulated texts. finalUrl, responseCode: result.responseCode, isDead: result.isDead, redirectChain: result.redirectChain, error: result.responseCode === 0 ? result.responseBody : undefined, }; if (result.screenshotFilename) { linkData.screenshot = result.screenshotFilename; } if (result.skipped) { linkData.skipped = true; } if (result.isDead) { deadLinks.push(linkData); } else { workingLinks.push(linkData); } }); return { scanDateTime: new Date().toISOString(), sourcePageUrl: currentPageUrl, summary: { totalLinks: linksData.size, deadLinks: deadLinks.length, workingLinks: workingLinks.length, }, deadLinks, workingLinks, }; } } exports.DetectBrokenLinksTool = DetectBrokenLinksTool; DetectBrokenLinksTool.NAME = 'detectBrokenLinks'; //# sourceMappingURL=DetectBrokenLinksTool.js.map