/*
 * donobu
 * Version:
 * Create browser automations with an LLM agent and replay them as Playwright scripts.
 * 264 lines • 11 kB
 * JavaScript
 */
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.DetectBrokenLinksTool = void 0;
const Tool_1 = require("./Tool");
const PlaywrightUtils_1 = require("../utils/PlaywrightUtils");
/**
 * Tool that collects the links on the current page, requests each one, and
 * returns a JSON report splitting them into dead and working links.
 *
 * Links are gathered from `<a>` elements, from `onclick` handlers that assign
 * `window.location.href`, and from `data-href` attributes. Only `http:` and
 * `https:` links are requested; every other scheme is reported as skipped.
 */
class DetectBrokenLinksTool extends Tool_1.Tool {
    constructor() {
        super(DetectBrokenLinksTool.NAME, 'Detect broken links in the current page and return a report of them.', 'DetectBrokenLinksToolCoreParameters', 'DetectBrokenLinksToolGptParameters');
    }
    /**
     * Scans `context.page` for links, checks them, optionally captures a
     * screenshot for each dead link, and returns the report.
     *
     * @param context Tool call context; this method reads `page`, `persistence`
     *   and `metadata.id` from it.
     * @param parameters Tool parameters; `captureScreenshots` enables the
     *   screenshot pass.
     * @returns An object with `isSuccessful: true`, the report serialized as
     *   pretty-printed JSON in `forLlm`, and the report object in `metadata`.
     */
    async call(context, parameters) {
        const page = context.page;
        const links = await DetectBrokenLinksTool.getLinksForPage(page);
        const linksData = await DetectBrokenLinksTool.traverseLinks(page, links);
        if (parameters.captureScreenshots) {
            await Promise.all(Array.from(linksData.entries(), ([originalUrl, result]) => DetectBrokenLinksTool.captureDeadLinkScreenshot(context, originalUrl, result)));
        }
        const linksReport = await DetectBrokenLinksTool.generateJsonReport(page, linksData);
        return {
            isSuccessful: true,
            forLlm: JSON.stringify(linksReport, null, 2),
            metadata: linksReport,
        };
    }
    /**
     * GPT entry point; behaves exactly like {@link DetectBrokenLinksTool.call}.
     */
    async callFromGpt(context, parameters) {
        return this.call(context, parameters);
    }
    /**
     * Best-effort screenshot of a dead link's final URL. On success the
     * generated filename is written to `result.screenshotFilename`; on any
     * failure the result is left untouched.
     *
     * @param context Tool call context (uses `page`, `persistence`, `metadata.id`).
     * @param originalUrl URL as it appeared on the scanned page.
     * @param result Traversal result for `originalUrl`; mutated on success.
     */
    static async captureDeadLinkScreenshot(context, originalUrl, result) {
        // Skipped links were never requested, so there is nothing to render.
        if (!result.isDead || result.skipped) {
            return;
        }
        let newPage;
        try {
            const finalUrl = result.redirectChain[result.redirectChain.length - 1];
            newPage = await context.page.context().newPage();
            await newPage.goto(finalUrl, { timeout: 10000 });
            const screenshotBuffer = await PlaywrightUtils_1.PlaywrightUtils.takePngScreenshot(newPage);
            // Filename: encoded original URL plus a timestamp.
            const screenshotFilename = `dead-link-screenshot-${encodeURIComponent(originalUrl)}-${Date.now()}.png`;
            await context.persistence.setFlowFile(context.metadata.id, screenshotFilename, screenshotBuffer);
            result.screenshotFilename = screenshotFilename;
        }
        catch {
            // Deliberately ignored: a dead link often cannot be navigated to at
            // all, and a missing screenshot must not fail the whole scan.
        }
        finally {
            if (newPage) {
                await newPage.close().catch(() => { });
            }
        }
    }
    /**
     * Returns one `{ url, texts }` entry per distinct link URL found on the
     * page, where `texts` holds every distinct non-empty text seen for that URL.
     *
     * Targets taken from `onclick` handlers and `data-href` attributes are
     * resolved against the document's base URL so that relative targets are
     * checked rather than discarded. Entries that still do not parse as an
     * absolute URL are dropped.
     *
     * @param page Playwright page to inspect.
     * @returns Array of `{ url: string, texts: string[] }`.
     */
    static async getLinksForPage(page) {
        // The callback runs inside the browser, so it must be self-contained.
        const rawLinks = await page.evaluate(() => {
            const textOf = (el) => (el.textContent || '').trim();
            // Resolve a possibly-relative target; '' means "nothing usable".
            const resolve = (raw) => {
                const trimmed = (raw || '').trim();
                if (!trimmed) {
                    return '';
                }
                try {
                    return new URL(trimmed, document.baseURI).href;
                }
                catch {
                    return '';
                }
            };
            const found = [];
            // <a> elements: `href` is already resolved by the browser.
            for (const a of document.querySelectorAll('a')) {
                found.push({ url: a.href, text: textOf(a) });
            }
            // onclick handlers that assign window.location.href.
            for (const el of document.querySelectorAll('[onclick]')) {
                const onclick = el.getAttribute('onclick') || '';
                const match = onclick.match(/window\.location\.href\s*=\s*['"]([^'"]+)['"]/);
                if (match) {
                    found.push({ url: resolve(match[1]), text: textOf(el) });
                }
            }
            // data-href attributes.
            for (const el of document.querySelectorAll('[data-href]')) {
                found.push({ url: resolve(el.getAttribute('data-href')), text: textOf(el) });
            }
            return found;
        });
        // Deduplicate by URL, accumulating the distinct non-empty texts.
        const linksByUrl = new Map();
        for (const { url, text } of rawLinks) {
            try {
                // Validation only: throws for anything that is not an absolute URL.
                void new URL(url);
            }
            catch {
                // Ignore invalid URLs.
                // TODO: add better logic for handling different URL types like file://, etc.
                continue;
            }
            let entry = linksByUrl.get(url);
            if (!entry) {
                entry = { url, texts: [] };
                linksByUrl.set(url, entry);
            }
            if (text && !entry.texts.includes(text)) {
                entry.texts.push(text);
            }
        }
        return Array.from(linksByUrl.values());
    }
    /**
     * Checks all links concurrently and returns a map from each original URL to
     * its traversal result (with the link's texts attached as `linkTexts`).
     *
     * Links whose scheme is not `http:` or `https:` (mailto, tel, javascript,
     * ...) cannot be requested and are recorded as skipped instead of dead.
     * A link whose request throws is recorded as dead with the error text in
     * `responseBody` and `responseCode` 0.
     *
     * @param page Playwright page whose browser context issues the requests.
     * @param links Entries as returned by `getLinksForPage`.
     * @returns Map keyed by original URL.
     */
    static async traverseLinks(page, links) {
        const results = new Map();
        const maxRedirects = 5;
        await Promise.all(links.map(async (link) => {
            let protocol = null;
            try {
                protocol = new URL(link.url).protocol;
            }
            catch {
                // Unparseable URL: fall through so the request fails and the
                // link is reported as dead.
            }
            if (protocol !== null && protocol !== 'http:' && protocol !== 'https:') {
                results.set(link.url, {
                    redirectChain: [link.url],
                    responseBody: protocol === 'mailto:'
                        ? 'Mailto link: not traversed'
                        : `Non-HTTP link (${protocol}): not traversed`,
                    responseCode: 0,
                    isDead: false,
                    skipped: true,
                    linkTexts: link.texts,
                });
                return;
            }
            try {
                const result = await DetectBrokenLinksTool.traverseSingleLink(page, link.url, maxRedirects);
                results.set(link.url, { ...result, linkTexts: link.texts });
            }
            catch (error) {
                results.set(link.url, {
                    redirectChain: [link.url],
                    responseBody: error instanceof Error ? error.toString() : String(error),
                    responseCode: 0,
                    isDead: true,
                    linkTexts: link.texts,
                });
            }
        }));
        return results;
    }
    /**
     * Decides whether a final (non-redirect) response indicates a dead link:
     * status 404, 410 or >= 500, or a body containing a known "not found" phrase.
     *
     * @param responseCode HTTP status code.
     * @param responseBody Response body text.
     * @returns `true` when the response looks dead.
     */
    static isDeadResponse(responseCode, responseBody) {
        if (responseCode === 404 || responseCode === 410 || responseCode >= 500) {
            return true;
        }
        // TODO: Define better and more specific textual indicators.
        const deadLinkIndicators = [
            'page not found',
            '404 error',
            'content not available',
        ];
        const lowerBody = responseBody.toLowerCase();
        return deadLinkIndicators.some((indicator) => lowerBody.includes(indicator));
    }
    /**
     * Requests a single link, following redirects manually so the full chain
     * can be reported.
     *
     * At most `maxRedirects` redirects are followed, i.e. up to
     * `maxRedirects + 1` requests are made. If the last permitted request still
     * answers with a redirect, the link is reported as dead; in that case the
     * final entry of `redirectChain` is the redirect target that was not
     * requested.
     *
     * @param page Playwright page whose browser context issues the requests.
     * @param link URL to check.
     * @param maxRedirects Maximum number of redirects to follow.
     * @returns `{ redirectChain, responseBody, responseCode, isDead }`.
     */
    static async traverseSingleLink(page, link, maxRedirects) {
        let currentUrl = link;
        const redirectChain = [link];
        for (let redirectCount = 0; redirectCount <= maxRedirects; redirectCount++) {
            const response = await page.context().request.get(currentUrl, {
                maxRedirects: 0,
                timeout: 10000,
                failOnStatusCode: false,
            });
            let responseBody;
            let responseCode;
            let location;
            try {
                responseBody = await response.text();
                responseCode = response.status();
                location = response.headers()['location'];
            }
            finally {
                // Release the buffered body now rather than when the context closes.
                await response.dispose().catch(() => { });
            }
            const isRedirect = responseCode >= 300 && responseCode < 400;
            if (!isRedirect) {
                return {
                    redirectChain,
                    responseBody,
                    responseCode,
                    isDead: DetectBrokenLinksTool.isDeadResponse(responseCode, responseBody),
                };
            }
            if (!location) {
                // No location header — cannot follow redirect.
                return { redirectChain, responseBody, responseCode, isDead: true };
            }
            try {
                // Using new URL with currentUrl as base handles relative URLs.
                currentUrl = new URL(location, currentUrl).toString();
            }
            catch {
                return { redirectChain, responseBody, responseCode, isDead: true };
            }
            redirectChain.push(currentUrl);
        }
        // Redirect budget exhausted without reaching a final response.
        return {
            redirectChain,
            responseBody: `Exceeded maximum of ${maxRedirects} redirects`,
            responseCode: 0,
            isDead: true,
        };
    }
    /**
     * Builds the scan report from the traversal results.
     *
     * Each link entry carries its original URL, accumulated link texts, final
     * URL (last element of the redirect chain), status code, dead flag and
     * redirect chain. `error` is populated from `responseBody` only when the
     * status code is 0. `screenshot` and `skipped` are added when present.
     * Skipped links are not dead and therefore appear under `workingLinks`.
     *
     * @param page Playwright page that was scanned (provides `sourcePageUrl`).
     * @param linksData Map as returned by `traverseLinks`.
     * @returns Report with `scanDateTime`, `sourcePageUrl`, `summary`,
     *   `deadLinks` and `workingLinks`.
     */
    static async generateJsonReport(page, linksData) {
        const currentPageUrl = page.url();
        const deadLinks = [];
        const workingLinks = [];
        for (const [originalUrl, result] of linksData) {
            const finalUrl = result.redirectChain[result.redirectChain.length - 1];
            const linkData = {
                originalUrl,
                linkTexts: result.linkTexts,
                finalUrl,
                responseCode: result.responseCode,
                isDead: result.isDead,
                redirectChain: result.redirectChain,
                error: result.responseCode === 0 ? result.responseBody : undefined,
            };
            if (result.screenshotFilename) {
                linkData.screenshot = result.screenshotFilename;
            }
            if (result.skipped) {
                linkData.skipped = true;
            }
            (result.isDead ? deadLinks : workingLinks).push(linkData);
        }
        return {
            scanDateTime: new Date().toISOString(),
            sourcePageUrl: currentPageUrl,
            summary: {
                totalLinks: linksData.size,
                deadLinks: deadLinks.length,
                workingLinks: workingLinks.length,
            },
            deadLinks,
            workingLinks,
        };
    }
}
// Publish the class on the CommonJS exports object (declared as `void 0` earlier in the file).
exports.DetectBrokenLinksTool = DetectBrokenLinksTool;
// Static tool identifier; the constructor passes it as the first argument to the
// Tool base class. Assigned after the class body rather than as a static field —
// presumably compiler output for a `static` property declaration.
DetectBrokenLinksTool.NAME = 'detectBrokenLinks';
//# sourceMappingURL=DetectBrokenLinksTool.js.map