UNPKG

donobu

Version:

Create browser automations with an LLM agent and replay them as Playwright scripts.

264 lines 11 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.DetectBrokenLinksTool = void 0;
const Tool_1 = require("./Tool");
const PlaywrightUtils_1 = require("../utils/PlaywrightUtils");

/** Timeout, in milliseconds, applied to each link request and each screenshot navigation. */
const REQUEST_TIMEOUT_MS = 10000;

/** Number of redirects followed per link before the link is reported as dead. */
const MAX_REDIRECTS = 5;

/**
 * Lower-case phrases that mark a response body as a dead page regardless of status code.
 * TODO: Define better and more specific textual indicators.
 */
const DEAD_LINK_INDICATORS = [
    'page not found',
    '404 error',
    'content not available',
];

/**
 * Extracts the lower-cased URL scheme (without the trailing colon).
 *
 * @param {string} url - URL string to inspect.
 * @returns {string} The scheme, or an empty string when the input has none.
 */
function getUrlScheme(url) {
    const match = /^([a-z][a-z0-9+.-]*):/i.exec(url);
    return match ? match[1].toLowerCase() : '';
}

/**
 * Turns a raw link value into an absolute URL string.
 * A value that already parses as an absolute URL is returned unchanged; otherwise it is
 * resolved against baseUrl.
 *
 * @param {string} rawUrl - Link value as found in the page.
 * @param {string} baseUrl - URL used to resolve relative values.
 * @returns {string|null} The absolute URL, or null when the value is empty or unparsable.
 */
function toAbsoluteUrl(rawUrl, baseUrl) {
    if (!rawUrl) {
        // An empty value would otherwise resolve to the base URL itself.
        return null;
    }
    try {
        new URL(rawUrl);
        return rawUrl;
    }
    catch {
        // Not absolute on its own; try resolving it against the base below.
    }
    try {
        return new URL(rawUrl, baseUrl).toString();
    }
    catch {
        // TODO: add better logic for handling different URL types like file://, etc.
        return null;
    }
}

/**
 * Best-effort screenshot of a dead link's final URL.
 * Opens a new page in the same browser context, navigates to the last URL of the redirect
 * chain, stores a PNG through context.persistence.setFlowFile and records the filename on
 * result.screenshotFilename. Links that are not dead, or that were skipped, are ignored.
 *
 * @param {object} context - Tool context providing page, persistence and metadata.id.
 * @param {string} originalUrl - URL of the link as it was collected.
 * @param {object} result - Traversal result for the link; mutated on success.
 * @returns {Promise<void>}
 */
async function captureDeadLinkScreenshot(context, originalUrl, result) {
    // Only try to capture a screenshot for non-skipped (i.e. HTTP) dead links.
    if (!result.isDead || result.skipped) {
        return;
    }
    let newPage;
    try {
        const finalUrl = result.redirectChain[result.redirectChain.length - 1];
        newPage = await context.page.context().newPage();
        await newPage.goto(finalUrl, { timeout: REQUEST_TIMEOUT_MS });
        const screenshotBuffer = await PlaywrightUtils_1.PlaywrightUtils.takePngScreenshot(newPage);
        // Filename is the encoded original URL plus a timestamp.
        const screenshotFilename = `dead-link-screenshot-${encodeURIComponent(originalUrl)}-${Date.now()}.png`;
        await context.persistence.setFlowFile(context.metadata.id, screenshotFilename, screenshotBuffer);
        result.screenshotFilename = screenshotFilename;
    }
    catch {
        // Deliberately ignored: a failed screenshot must not fail the link report.
    }
    finally {
        if (newPage) {
            await newPage.close().catch(() => { });
        }
    }
}

/**
 * Tool that collects the links of the current page, requests each one and returns a
 * report splitting them into dead and working links.
 */
class DetectBrokenLinksTool extends Tool_1.Tool {
    constructor() {
        super(DetectBrokenLinksTool.NAME, 'Detect broken links in the current page and return a report of them.', 'DetectBrokenLinksToolCoreParameters', 'DetectBrokenLinksToolGptParameters');
    }

    /**
     * Scans the current page for links, checks them and builds the report.
     *
     * @param {object} context - Tool context; context.page is the page to scan.
     * @param {object} [parameters] - When parameters.captureScreenshots is truthy, a
     *   screenshot is attempted for every dead link.
     * @returns {Promise<object>} Result with isSuccessful, forLlm (the report as indented
     *   JSON text) and metadata (the report object).
     */
    async call(context, parameters) {
        const page = context.page;
        // Get all links along with their texts.
        const links = await DetectBrokenLinksTool.getLinksForPage(page);
        const linksData = await DetectBrokenLinksTool.traverseLinks(page, links);
        // Optionally capture screenshots for dead links.
        if (parameters && parameters.captureScreenshots) {
            await Promise.all(Array.from(linksData.entries()).map(([originalUrl, result]) => captureDeadLinkScreenshot(context, originalUrl, result)));
        }
        const linksReport = await DetectBrokenLinksTool.generateJsonReport(page, linksData);
        const linksReportAsJson = JSON.stringify(linksReport, null, 2);
        return {
            isSuccessful: true,
            forLlm: linksReportAsJson,
            metadata: linksReport,
        };
    }

    /**
     * Same as call(); delegates directly to it.
     *
     * @param {object} context - Tool context.
     * @param {object} [parameters] - Tool parameters.
     * @returns {Promise<object>} The result of call().
     */
    async callFromGpt(context, parameters) {
        return this.call(context, parameters);
    }

    /**
     * Returns an array of LinkInfo objects ({ url, texts }) containing a URL and all the
     * distinct, non-empty link texts associated with it.
     * Links are gathered from <a> elements, from onclick attributes that assign
     * window.location.href, and from data-href attributes. Relative values are resolved
     * against the page URL; empty or unparsable values are dropped.
     *
     * @param {object} page - Playwright page to inspect.
     * @returns {Promise<Array<{url: string, texts: string[]}>>} One entry per distinct URL.
     */
    static async getLinksForPage(page) {
        // NOTE: the callbacks below run inside the browser page, so they must stay self-contained.
        // Get all <a> tag links.
        const aTagLinks = await page.evaluate(() => {
            return Array.from(document.querySelectorAll('a')).map((a) => ({
                url: a.href,
                text: (a.textContent || '').trim(),
            }));
        });
        // Get links from onclick attributes.
        const onClickLinks = await page.evaluate(() => {
            return Array.from(document.querySelectorAll('[onclick]'))
                .map((el) => {
                const onclick = el.getAttribute('onclick') || '';
                const match = onclick.match(/window\.location\.href\s*=\s*['"]([^'"]+)['"]/);
                return match
                    ? { url: match[1], text: (el.textContent || '').trim() }
                    : null;
            })
                .filter((link) => link !== null);
        });
        // Get links from data-href attributes.
        const dataHrefLinks = await page.evaluate(() => {
            return Array.from(document.querySelectorAll('[data-href]')).map((el) => ({
                url: el.getAttribute('data-href') || '',
                text: (el.textContent || '').trim(),
            }));
        });
        // Combine and deduplicate links by URL, accumulating all texts.
        const baseUrl = page.url();
        const linksMap = new Map();
        for (const link of [...aTagLinks, ...onClickLinks, ...dataHrefLinks]) {
            const url = toAbsoluteUrl(link.url, baseUrl);
            if (url === null) {
                // Ignore invalid URLs.
                continue;
            }
            let entry = linksMap.get(url);
            if (!entry) {
                entry = { url, texts: [] };
                linksMap.set(url, entry);
            }
            // Empty texts carry no information, so they are never recorded.
            if (link.text && !entry.texts.includes(link.text)) {
                entry.texts.push(link.text);
            }
        }
        return Array.from(linksMap.values());
    }

    /**
     * Checks all links concurrently and returns a map from the original URL to its
     * traversal result.
     * Links whose scheme is not http/https (mailto:, tel:, javascript:, ...) cannot be
     * requested, so they are recorded as skipped and not dead instead of being traversed.
     * A link whose traversal throws is recorded as dead with the error text as body.
     *
     * @param {object} page - Playwright page whose context issues the requests.
     * @param {Array<{url: string, texts: string[]}>} links - Links to check.
     * @returns {Promise<Map<string, object>>} Traversal results keyed by link URL.
     */
    static async traverseLinks(page, links) {
        const results = new Map();
        await Promise.all(links.map(async (link) => {
            const scheme = getUrlScheme(link.url);
            if (scheme && scheme !== 'http' && scheme !== 'https') {
                results.set(link.url, {
                    redirectChain: [link.url],
                    responseBody: scheme === 'mailto'
                        ? 'Mailto link: not traversed'
                        : `Non-HTTP link (${scheme}:): not traversed`,
                    responseCode: 0,
                    isDead: false,
                    skipped: true,
                    linkTexts: link.texts,
                });
                return;
            }
            try {
                const result = await this.traverseSingleLink(page, link.url, MAX_REDIRECTS);
                results.set(link.url, { ...result, linkTexts: link.texts });
            }
            catch (error) {
                results.set(link.url, {
                    redirectChain: [link.url],
                    responseBody: error instanceof Error ? error.toString() : String(error),
                    responseCode: 0,
                    isDead: true,
                    linkTexts: link.texts,
                });
            }
        }));
        return results;
    }

    /**
     * Traverses a single link, following redirects up to maxRedirects.
     * Returns a LinkTraversalResult that includes the full redirect chain.
     * A final response is dead when its status is 404, 410 or >= 500, or when its body
     * contains one of the dead-link phrases. A redirect without a usable location header,
     * or a chain longer than maxRedirects, is also dead.
     *
     * @param {object} page - Playwright page whose context issues the requests.
     * @param {string} link - URL to start from.
     * @param {number} maxRedirects - Maximum number of redirects to follow.
     * @returns {Promise<{redirectChain: string[], responseBody: string, responseCode: number, isDead: boolean}>}
     */
    static async traverseSingleLink(page, link, maxRedirects) {
        let currentUrl = link;
        const redirectChain = [link];
        // One initial request plus up to maxRedirects follow-up requests, so the target of
        // the last allowed redirect is actually fetched.
        for (let followed = 0; followed <= maxRedirects; followed++) {
            // Redirects are followed manually so that every hop lands in redirectChain.
            const response = await page.context().request.get(currentUrl, {
                maxRedirects: 0,
                timeout: REQUEST_TIMEOUT_MS,
                failOnStatusCode: false,
            });
            const responseCode = response.status();
            const headers = response.headers();
            let responseBody;
            try {
                responseBody = await response.text();
            }
            finally {
                // Release the body now; otherwise it stays in memory until the context closes.
                await response.dispose().catch(() => { });
            }
            const isRedirect = responseCode >= 300 && responseCode < 400;
            if (!isRedirect) {
                const lowerBody = responseBody.toLowerCase();
                const isDead = responseCode === 404 ||
                    responseCode === 410 ||
                    responseCode >= 500 ||
                    DEAD_LINK_INDICATORS.some((indicator) => lowerBody.includes(indicator));
                return { redirectChain, responseBody, responseCode, isDead };
            }
            const location = headers['location'];
            if (!location) {
                // No location header — cannot follow redirect.
                return { redirectChain, responseBody, responseCode, isDead: true };
            }
            try {
                // Using new URL with currentUrl as base handles relative URLs.
                currentUrl = new URL(location, currentUrl).toString();
            }
            catch {
                return { redirectChain, responseBody, responseCode, isDead: true };
            }
            redirectChain.push(currentUrl);
        }
        // Max redirects reached without a final response: mark as dead. The message ends up
        // in the report's "error" field because responseCode is 0.
        return {
            redirectChain,
            responseBody: `Exceeded maximum of ${maxRedirects} redirects`,
            responseCode: 0,
            isDead: true,
        };
    }

    /**
     * Generates a JSON-serializable report of the link scan.
     * Each link entry carries originalUrl, linkTexts, finalUrl (last URL of the redirect
     * chain), responseCode, isDead, redirectChain and, when responseCode is 0, error.
     * screenshot and skipped are added only when present on the traversal result.
     *
     * @param {object} page - Playwright page that was scanned; its URL is the source page.
     * @param {Map<string, object>} linksData - Traversal results keyed by original URL.
     * @returns {Promise<object>} Report with scanDateTime, sourcePageUrl, summary,
     *   deadLinks and workingLinks.
     */
    static async generateJsonReport(page, linksData) {
        const currentPageUrl = page.url();
        const deadLinks = [];
        const workingLinks = [];
        for (const [originalUrl, result] of linksData) {
            const linkData = {
                originalUrl,
                linkTexts: result.linkTexts, // All accumulated texts.
                finalUrl: result.redirectChain[result.redirectChain.length - 1],
                responseCode: result.responseCode,
                isDead: result.isDead,
                redirectChain: result.redirectChain,
                // A response code of 0 means no HTTP response was obtained; the body then holds the reason.
                error: result.responseCode === 0 ? result.responseBody : undefined,
            };
            if (result.screenshotFilename) {
                linkData.screenshot = result.screenshotFilename;
            }
            if (result.skipped) {
                linkData.skipped = true;
            }
            (result.isDead ? deadLinks : workingLinks).push(linkData);
        }
        return {
            scanDateTime: new Date().toISOString(),
            sourcePageUrl: currentPageUrl,
            summary: {
                totalLinks: linksData.size,
                deadLinks: deadLinks.length,
                workingLinks: workingLinks.length,
            },
            deadLinks,
            workingLinks,
        };
    }
}
exports.DetectBrokenLinksTool = DetectBrokenLinksTool;
DetectBrokenLinksTool.NAME = 'detectBrokenLinks';
//# sourceMappingURL=DetectBrokenLinksTool.js.map