donobu
Version:
Create browser automations with an LLM agent and replay them as Playwright scripts.
297 lines • 12.7 kB
JavaScript
;
// Flag the exports object with a non-enumerable `__esModule: true` property.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare every export as `undefined`; the real values are assigned
// further down in this module.
exports.DetectBrokenLinksCoreSchema = void 0;
exports.DetectBrokenLinksGptSchema = void 0;
exports.DetectBrokenLinksTool = void 0;
const v4_1 = require("zod/v4");
const ToolSchema_1 = require("../models/ToolSchema");
const MiscUtils_1 = require("../utils/MiscUtils");
const PlaywrightUtils_1 = require("../utils/PlaywrightUtils");
const TargetUtils_1 = require("../utils/TargetUtils");
const Tool_1 = require("./Tool");
exports.DetectBrokenLinksCoreSchema = v4_1.z.object({
captureScreenshots: v4_1.z
.boolean()
.optional()
.describe('Whether to capture screenshots for dead links for visual verification.'),
});
/**
 * Parameter schema for the GPT-facing entry point: the fields of
 * `BaseGptArgsSchema` merged with the core fields. The core shape is applied
 * last, so on a name clash the core field wins.
 */
exports.DetectBrokenLinksGptSchema = v4_1.z.object(Object.assign({}, ToolSchema_1.BaseGptArgsSchema.shape, exports.DetectBrokenLinksCoreSchema.shape));
/**
 * Tool that collects the links on the current web page, probes each one, and
 * returns a JSON report that splits them into dead and working links.
 *
 * Links are gathered from `<a>` elements, from `onclick` handlers that assign
 * `window.location.href`, and from `data-href` attributes. Only `http:` and
 * `https:` links are probed; every other scheme is recorded as skipped.
 */
class DetectBrokenLinksTool extends Tool_1.Tool {
  constructor() {
    super(DetectBrokenLinksTool.NAME, 'Detect broken links in the current page and return a report of them.', exports.DetectBrokenLinksCoreSchema, exports.DetectBrokenLinksGptSchema, false, undefined, ['web']);
  }

  /**
   * Scans the page resolved from `context`, optionally stores screenshots of
   * dead links, and builds the report.
   *
   * @param context - Tool call context. `webPage(context)` must yield the page
   *   to scan; `context.persistence` and `context.metadata.id` are used only
   *   when screenshots are requested.
   * @param parameters - Tool parameters; a truthy `captureScreenshots` enables
   *   screenshot capture for dead links.
   * @returns `{ isSuccessful: true, forLlm, metadata }` where `metadata` is the
   *   report object and `forLlm` is the same report as pretty-printed JSON.
   */
  async call(context, parameters) {
    const page = (0, TargetUtils_1.webPage)(context);
    const links = await DetectBrokenLinksTool.getLinksForPage(page);
    const linksData = await DetectBrokenLinksTool.traverseLinks(page, links);
    if (parameters.captureScreenshots) {
      await Promise.all(Array.from(linksData.entries(), ([originalUrl, result]) => DetectBrokenLinksTool.captureDeadLinkScreenshot(context, page, originalUrl, result)));
    }
    const linksReport = await DetectBrokenLinksTool.generateJsonReport(page, linksData);
    const linksReportAsJson = JSON.stringify(linksReport, null, 2);
    return {
      isSuccessful: true,
      forLlm: linksReportAsJson,
      metadata: linksReport,
    };
  }

  /** GPT-facing entry point; delegates to `call` with the same arguments. */
  async callFromGpt(context, parameters) {
    return this.call(context, parameters);
  }

  /**
   * Best-effort screenshot of a dead link's final URL. On success the image is
   * stored through `context.persistence.setFlowFile` and its filename is written
   * to `result.screenshotFilename`. Links that are not dead, or that were
   * skipped, are left untouched.
   */
  static async captureDeadLinkScreenshot(context, page, originalUrl, result) {
    if (!result.isDead || result.skipped) {
      return;
    }
    let newPage;
    try {
      const finalUrl = result.redirectChain[result.redirectChain.length - 1];
      newPage = await page.context().newPage();
      await newPage.goto(finalUrl, { timeout: 10000 });
      const screenshotBuffer = await PlaywrightUtils_1.PlaywrightUtils.takeViewportScreenshot(newPage);
      const imageType = MiscUtils_1.MiscUtils.detectImageType(screenshotBuffer);
      // Filename: encoded original URL plus a timestamp.
      const screenshotFilename = `dead-link-screenshot-${encodeURIComponent(originalUrl)}-${Date.now()}.${imageType}`;
      await context.persistence.setFlowFile(context.metadata.id, screenshotFilename, screenshotBuffer);
      result.screenshotFilename = screenshotFilename;
    }
    catch {
      // Deliberately best-effort: a failed screenshot must not fail the scan.
      // The link is still reported, just without a `screenshot` entry.
    }
    finally {
      if (newPage) {
        await newPage.close().catch(() => { });
      }
    }
  }

  /**
   * Collects the links present on `page`.
   *
   * Relative URLs found in `onclick` handlers and `data-href` attributes are
   * resolved against `document.baseURI` inside the page, so they are no longer
   * discarded as unparsable. Empty values stay empty and are ignored.
   *
   * @param page - Page to inspect.
   * @returns One `{ url, texts }` entry per distinct URL, in first-seen order;
   *   `texts` accumulates the distinct non-empty texts of elements sharing
   *   that URL (the first element's text is always kept, even if empty).
   */
  static async getLinksForPage(page) {
    // The three queries are independent, so they are issued together.
    // Each callback is serialized into the page and must be self-contained.
    const [aTagLinks, onClickLinks, dataHrefLinks] = await Promise.all([
      // <a> elements: `href` is already absolute.
      page.evaluate(() => {
        return Array.from(document.querySelectorAll('a')).map((a) => ({
          url: a.href,
          text: (a.textContent || '').trim(),
        }));
      }),
      // Elements whose inline onclick assigns window.location.href.
      page.evaluate(() => {
        const toAbsolute = (raw) => {
          if (raw.trim() === '') {
            return raw;
          }
          try {
            return new URL(raw, document.baseURI).href;
          }
          catch {
            return raw;
          }
        };
        return Array.from(document.querySelectorAll('[onclick]'))
          .map((el) => {
            const onclick = el.getAttribute('onclick') || '';
            const match = onclick.match(/window\.location\.href\s*=\s*['"]([^'"]+)['"]/);
            return match
              ? { url: toAbsolute(match[1]), text: (el.textContent || '').trim() }
              : null;
          })
          .filter((link) => link !== null);
      }),
      // Elements carrying a data-href attribute.
      page.evaluate(() => {
        const toAbsolute = (raw) => {
          // An empty value would otherwise resolve to the page URL itself.
          if (raw.trim() === '') {
            return raw;
          }
          try {
            return new URL(raw, document.baseURI).href;
          }
          catch {
            return raw;
          }
        };
        return Array.from(document.querySelectorAll('[data-href]')).map((el) => ({
          url: toAbsolute(el.getAttribute('data-href') || ''),
          text: (el.textContent || '').trim(),
        }));
      }),
    ]);
    // Deduplicate by URL, accumulating all texts.
    const linksMap = new Map();
    for (const link of [...aTagLinks, ...onClickLinks, ...dataHrefLinks]) {
      try {
        // Validation only: anything that is not an absolute URL is ignored.
        new URL(link.url);
      }
      catch {
        continue;
      }
      const entry = linksMap.get(link.url);
      if (!entry) {
        linksMap.set(link.url, { url: link.url, texts: [link.text] });
      }
      else if (link.text && !entry.texts.includes(link.text)) {
        entry.texts.push(link.text);
      }
    }
    return Array.from(linksMap.values());
  }

  /**
   * Explains why a URL is not probed, or returns `null` when it should be.
   *
   * Only `http:` and `https:` URLs are probed. Other schemes (`mailto:`,
   * `tel:`, `javascript:`, ...) cannot be fetched by the request API and would
   * otherwise be misreported as dead. A URL that cannot be parsed yields `null`
   * so that traversal records the resulting request error.
   */
  static getSkipReason(url) {
    let protocol;
    try {
      protocol = new URL(url).protocol;
    }
    catch {
      return null;
    }
    if (protocol === 'http:' || protocol === 'https:') {
      return null;
    }
    if (protocol === 'mailto:') {
      return 'Mailto link: not traversed';
    }
    return `Non-HTTP link (${protocol}): not traversed`;
  }

  /**
   * Probes all links concurrently.
   *
   * NOTE(review): every link is probed at once and each non-redirect response
   * opens its own page; consider bounding concurrency for link-heavy pages.
   *
   * @param page - Page whose browser context is used for requests.
   * @param links - `{ url, texts }` entries, e.g. from `getLinksForPage`.
   * @returns Map from original URL to its traversal result (plus `linkTexts`),
   *   in the same order as `links`. Non-HTTP links are marked `skipped`;
   *   links whose traversal throws are marked dead with the error text.
   */
  static async traverseLinks(page, links) {
    const maxRedirects = 5;
    const entries = await Promise.all(links.map(async (link) => {
      const skipReason = DetectBrokenLinksTool.getSkipReason(link.url);
      if (skipReason !== null) {
        return [link.url, {
            redirectChain: [link.url],
            responseBody: skipReason,
            responseCode: 0,
            isDead: false,
            skipped: true,
            linkTexts: link.texts,
          }];
      }
      try {
        const result = await DetectBrokenLinksTool.traverseSingleLink(page, link.url, maxRedirects);
        return [link.url, { ...result, linkTexts: link.texts }];
      }
      catch (error) {
        return [link.url, {
            redirectChain: [link.url],
            responseBody: error instanceof Error ? error.toString() : String(error),
            responseCode: 0,
            isDead: true,
            linkTexts: link.texts,
          }];
      }
    }));
    // Building the map from the ordered entries keeps the report order stable
    // instead of depending on which request finishes first.
    return new Map(entries);
  }

  /**
   * Requests `link`, manually following at most `maxRedirects` redirects.
   *
   * @param page - Page whose browser context issues the requests.
   * @param link - URL to probe.
   * @param maxRedirects - Maximum number of redirects to follow.
   * @returns `{ redirectChain, responseBody, responseCode, isDead }`. A redirect
   *   without a usable `location` header is dead with the 3xx status; exceeding
   *   `maxRedirects` is dead with `responseCode` 0.
   */
  static async traverseSingleLink(page, link, maxRedirects) {
    const redirectChain = [link];
    let currentUrl = link;
    // hop counts the redirects followed so far; the request made after the
    // last permitted redirect must still be issued, hence `<=`.
    for (let hop = 0; hop <= maxRedirects; hop++) {
      const response = await page.context().request.get(currentUrl, {
        maxRedirects: 0,
        timeout: 10000,
        failOnStatusCode: false,
      });
      let responseCode;
      let location;
      try {
        responseCode = response.status();
        location = response.headers()['location'];
      }
      finally {
        // Release the body; otherwise it is held until the context closes.
        try {
          await response.dispose();
        }
        catch {
          // Disposal is best-effort and must not change the outcome.
        }
      }
      if (responseCode < 300 || responseCode >= 400) {
        return DetectBrokenLinksTool.classifyFinalResponse(page, currentUrl, responseCode, redirectChain);
      }
      if (!location) {
        // No location header: the redirect cannot be followed.
        return { redirectChain, responseBody: '', responseCode, isDead: true };
      }
      if (hop === maxRedirects) {
        // Following this redirect would exceed the limit.
        break;
      }
      try {
        // Resolve relative redirect targets against the current URL.
        currentUrl = new URL(location, currentUrl).toString();
      }
      catch {
        return { redirectChain, responseBody: '', responseCode, isDead: true };
      }
      redirectChain.push(currentUrl);
    }
    return {
      redirectChain,
      responseBody: `Too many redirects (more than ${maxRedirects})`,
      responseCode: 0,
      isDead: true,
    };
  }

  /**
   * Builds the traversal result for a non-redirect response. The link is dead
   * when the status is 404, 410 or any 5xx, or when the page's text contains
   * one of the known error phrases (case-insensitive).
   */
  static async classifyFinalResponse(page, finalUrl, responseCode, redirectChain) {
    const visibleText = await DetectBrokenLinksTool.readVisibleText(page, finalUrl);
    const lowerVisibleText = visibleText.toLowerCase();
    // Phrases that indicate an error page even when the status looks fine.
    const deadLinkIndicators = [
      'page not found',
      '404 error',
      'content not available',
    ];
    const isDead = responseCode === 404 ||
      responseCode === 410 ||
      responseCode >= 500 ||
      deadLinkIndicators.some((indicator) => lowerVisibleText.includes(indicator));
    return {
      redirectChain,
      responseBody: visibleText,
      responseCode,
      isDead,
    };
  }

  /**
   * Loads `url` in a temporary page of the same browser context and returns the
   * `innerText` of its `<body>`. Returns an empty string if loading or reading
   * fails; the temporary page is always closed.
   */
  static async readVisibleText(page, url) {
    let tempPage;
    try {
      tempPage = await page.context().newPage();
      await tempPage.goto(url, { timeout: 10000 });
      return await tempPage.locator('body').innerText({
        timeout: 5000,
      });
    }
    catch {
      return '';
    }
    finally {
      if (tempPage) {
        await tempPage.close().catch(() => { });
      }
    }
  }

  /**
   * Generates the JSON-serializable report of a link scan.
   *
   * @param page - Scanned page; its URL becomes `sourcePageUrl`.
   * @param linksData - Map from original URL to traversal result.
   * @returns Report with `scanDateTime`, `sourcePageUrl`, a `summary` of counts,
   *   and `deadLinks` / `workingLinks` arrays. Each entry's `error` is the
   *   result's `responseBody` when `responseCode` is 0, otherwise `undefined`.
   */
  static async generateJsonReport(page, linksData) {
    const currentPageUrl = page.url();
    const deadLinks = [];
    const workingLinks = [];
    linksData.forEach((result, originalUrl) => {
      const finalUrl = result.redirectChain[result.redirectChain.length - 1];
      const linkData = {
        originalUrl,
        linkTexts: result.linkTexts,
        finalUrl,
        responseCode: result.responseCode,
        isDead: result.isDead,
        redirectChain: result.redirectChain,
        error: result.responseCode === 0 ? result.responseBody : undefined,
      };
      if (result.screenshotFilename) {
        linkData.screenshot = result.screenshotFilename;
      }
      if (result.skipped) {
        linkData.skipped = true;
      }
      if (result.isDead) {
        deadLinks.push(linkData);
      }
      else {
        workingLinks.push(linkData);
      }
    });
    return {
      scanDateTime: new Date().toISOString(),
      sourcePageUrl: currentPageUrl,
      summary: {
        totalLinks: linksData.size,
        deadLinks: deadLinks.length,
        workingLinks: workingLinks.length,
      },
      deadLinks,
      workingLinks,
    };
  }
}
// Static tool name; the constructor passes it to the base `Tool` class.
DetectBrokenLinksTool.NAME = 'detectBrokenLinks';
// Publish the class on the CommonJS exports object.
exports.DetectBrokenLinksTool = DetectBrokenLinksTool;
//# sourceMappingURL=DetectBrokenLinksTool.js.map