dtamind-components
Version:
Apps integration for Dtamind. Contain Nodes and Credentials.
402 lines • 17.4 kB
JavaScript
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const utils_1 = require("../../../src/utils");
const tools_1 = require("@langchain/core/tools");
const node_fetch_1 = __importDefault(require("node-fetch"));
const cheerio = __importStar(require("cheerio"));
const url_1 = require("url");
const utils_2 = require("../../../src/utils");
class WebScraperRecursiveTool extends tools_1.Tool {
constructor(maxDepth = 1, maxPages = 10, timeoutMs = 60000, useSitemap = false) {
super();
this.name = 'web_scraper_tool';
this.description = `Scrapes web pages recursively or via default sitemap. Extracts title, description, and paragraph text. Input should be a single URL string. Returns a JSON string array of scraped page data objects.`;
this.maxDepth = Math.max(1, maxDepth);
this.maxPages = maxPages !== null && maxPages > 0 ? maxPages : null;
this.timeoutMs = timeoutMs > 0 ? timeoutMs : 60000;
this.useSitemap = useSitemap;
this.visitedUrls = new Set();
this.scrapedPagesCount = 0;
let desc = '';
if (this.useSitemap) {
desc = `Scrapes URLs listed in the detected default sitemap (/sitemap.xml)`;
if (this.maxPages !== null) {
desc += ` up to ${this.maxPages} pages`;
}
desc += `, with a ${this.timeoutMs / 1000}-second timeout per page. Falls back to Recursive Link Following if sitemap is not found or empty.`;
}
else {
desc = `Recursively scrapes web pages starting from a given URL`;
if (this.maxDepth > 0) {
desc += ` up to ${this.maxDepth} level(s) deep`;
}
if (this.maxPages !== null) {
desc += ` or until ${this.maxPages} pages are scraped`;
}
desc += `, with a ${this.timeoutMs / 1000}-second timeout per page, whichever comes first.`;
}
desc += ` Extracts title, description, and paragraph text. Input should be a single URL string. Returns a JSON string array of scraped page data.`;
this.description = desc;
}
async scrapeSingleUrl(url) {
try {
const response = await (0, node_fetch_1.default)(url, { timeout: this.timeoutMs, redirect: 'follow', follow: 5 });
if (!response.ok) {
const errorText = await response.text();
return {
title: '',
description: '',
body_text: '',
foundLinks: [],
error: `HTTP Error: ${response.status} ${response.statusText}. ${errorText}`
};
}
const contentType = response.headers.get('content-type');
if (contentType === null) {
return {
title: '',
description: '',
body_text: '',
foundLinks: [],
error: `Skipped content due to missing Content-Type header`
};
}
if (!contentType.includes('text/html') && url !== this.visitedUrls.values().next().value) {
if (!contentType.includes('text/xml') && !contentType.includes('application/xml')) {
return {
title: '',
description: '',
body_text: '',
foundLinks: [],
error: `Skipped non-HTML/XML content (Content-Type: ${contentType})`
};
}
if (!contentType.includes('text/html')) {
return {
title: '',
description: '',
body_text: '',
foundLinks: [],
error: `Skipped non-HTML content (Content-Type: ${contentType})`
};
}
}
const html = await response.text();
const $ = cheerio.load(html);
const title = $('title').first().text() || 'No title found';
let description = $('meta[name="description"]').attr('content') ||
$('meta[property="og:description"]').attr('content') ||
$('meta[name="twitter:description"]').attr('content') ||
'No description found';
const paragraphs = [];
$('p').each((_i, elem) => {
const paragraphText = $(elem).text();
if (paragraphText) {
paragraphs.push(paragraphText.trim());
}
});
const body_text = paragraphs.join(' ').replace(/\s\s+/g, ' ').trim();
const foundLinks = [];
$('a').each((_i, elem) => {
const href = $(elem).attr('href');
if (href) {
try {
const absoluteUrl = new url_1.URL(href, url).toString();
if (absoluteUrl.startsWith('http') && !absoluteUrl.includes('#')) {
foundLinks.push(absoluteUrl);
}
}
catch (e) {
// Ignore invalid URLs
}
}
});
return {
title: title.trim(),
description: description.trim(),
body_text: body_text,
foundLinks: [...new Set(foundLinks)]
};
}
catch (error) {
if (error.type === 'request-timeout') {
return {
title: '',
description: '',
body_text: '',
foundLinks: [],
error: `Scraping Error: Request Timeout after ${this.timeoutMs}ms`
};
}
return {
title: '',
description: '',
body_text: '',
foundLinks: [],
error: `Scraping Error: ${error?.message || 'Unknown error'}`
};
}
}
async scrapeRecursive(url, currentDepth) {
if (this.maxPages !== null && this.scrapedPagesCount >= this.maxPages) {
return [];
}
if (currentDepth > this.maxDepth) {
return [];
}
if (this.visitedUrls.has(url)) {
return [];
}
try {
new url_1.URL(url);
if (!url.startsWith('http'))
throw new Error('Invalid protocol');
}
catch (e) {
if (this.maxPages !== null) {
this.scrapedPagesCount++;
}
return [{ url, title: '', description: '', body_text: '', error: `Invalid URL format or protocol` }];
}
this.visitedUrls.add(url);
if (this.maxPages !== null) {
this.scrapedPagesCount++;
}
const { foundLinks, ...scrapedContent } = await this.scrapeSingleUrl(url);
const currentPageData = { url, ...scrapedContent };
let results = [currentPageData];
if (!currentPageData.error && currentDepth < this.maxDepth && (this.maxPages === null || this.scrapedPagesCount < this.maxPages)) {
const recursivePromises = [];
for (const link of foundLinks) {
if (this.maxPages !== null && this.scrapedPagesCount >= this.maxPages) {
break;
}
if (!this.visitedUrls.has(link)) {
recursivePromises.push(this.scrapeRecursive(link, currentDepth + 1));
}
}
if (recursivePromises.length > 0) {
const nestedResults = await Promise.all(recursivePromises);
results = results.concat(...nestedResults);
}
}
else if (currentPageData.error) {
// Do nothing if there was an error scraping the current page
}
return results;
}
async scrapeUrlsFromList(urlList) {
const results = [];
const scrapePromises = [];
for (const url of urlList) {
if (this.maxPages !== null && this.scrapedPagesCount >= this.maxPages) {
break;
}
if (this.visitedUrls.has(url)) {
continue;
}
this.visitedUrls.add(url);
this.scrapedPagesCount++;
const promise = (async () => {
const { foundLinks: _ignoreLinks, ...scrapedContent } = await this.scrapeSingleUrl(url);
results.push({ url, ...scrapedContent });
})();
scrapePromises.push(promise);
}
await Promise.all(scrapePromises);
return results.slice(0, this.maxPages ?? results.length);
}
async _call(initialInput) {
this.visitedUrls = new Set();
this.scrapedPagesCount = 0;
let performedFallback = false;
let sitemapAttempted = false;
if (!initialInput || typeof initialInput !== 'string') {
return JSON.stringify({ error: 'Input must be a single URL string.' });
}
try {
let allScrapedData = [];
let urlsFromSitemap = [];
if (this.useSitemap) {
sitemapAttempted = true;
let sitemapUrlToFetch = undefined;
try {
const baseUrl = new url_1.URL(initialInput);
sitemapUrlToFetch = new url_1.URL('/sitemap.xml', baseUrl.origin).toString();
}
catch (e) {
return JSON.stringify({ error: 'Invalid initial URL provided for sitemap detection.' });
}
if (!sitemapUrlToFetch) {
return JSON.stringify({ error: 'Could not determine sitemap URL.' });
}
try {
const limitParam = this.maxPages === null ? Infinity : this.maxPages;
urlsFromSitemap = await (0, utils_2.xmlScrape)(sitemapUrlToFetch, limitParam);
}
catch (sitemapError) {
urlsFromSitemap = [];
}
if (urlsFromSitemap.length > 0) {
allScrapedData = await this.scrapeUrlsFromList(urlsFromSitemap);
}
else {
performedFallback = true;
}
}
if (!sitemapAttempted || performedFallback) {
allScrapedData = await this.scrapeRecursive(initialInput, 1);
}
if (this.maxPages !== null && this.scrapedPagesCount >= this.maxPages) {
// Log or indicate that the max page limit was reached during scraping
}
if (performedFallback) {
const warningResult = {
warning: 'Sitemap not found or empty; fell back to recursive scraping.',
scrapedData: allScrapedData
};
return JSON.stringify(warningResult);
}
else {
return JSON.stringify(allScrapedData);
}
}
catch (error) {
return JSON.stringify({ error: `Failed scrape operation: ${error?.message || 'Unknown error'}` });
}
}
}
class WebScraperRecursive_Tools {
constructor() {
this.label = 'Web Scraper Tool';
this.name = 'webScraperTool';
this.version = 1.1;
this.type = 'Tool';
this.icon = 'webScraperTool.svg';
this.category = 'Tools';
this.description = 'Scrapes web pages recursively by following links OR by fetching URLs from the default sitemap.';
this.baseClasses = [this.type, ...(0, utils_1.getBaseClasses)(WebScraperRecursiveTool)];
this.inputs = [
{
label: 'Scraping Mode',
name: 'scrapeMode',
type: 'options',
options: [
{ label: 'Recursive Link Following', name: 'recursive' },
{ label: 'Sitemap', name: 'sitemap' }
],
default: 'recursive',
description: "Select discovery method: 'Recursive' follows links found on pages (uses Max Depth). 'Sitemap' tries sitemap.xml first, but falls back to 'Recursive' if the sitemap is not found or empty.",
additionalParams: true
},
{
label: 'Max Depth',
name: 'maxDepth',
type: 'number',
description: 'Maximum levels of links to follow (e.g., 1 = only the initial URL, 2 = initial URL + links found on it). Default 1.',
placeholder: '1',
default: 1,
optional: true,
additionalParams: true
},
{
label: 'Max Pages',
name: 'maxPages',
type: 'number',
description: 'Maximum total number of pages to scrape, regardless of mode or depth. Stops when this limit is reached. Leave empty for no page limit. Default: 10.',
placeholder: '10',
default: 10,
optional: true,
additionalParams: true
},
{
label: 'Timeout (s)',
name: 'timeoutS',
type: 'number',
description: 'Maximum time in seconds to wait for each page request to complete. Accepts decimals (e.g., 0.5). Default 60.',
placeholder: '60',
default: 60,
optional: true,
additionalParams: true
},
{
label: 'Tool Description',
name: 'description',
type: 'string',
description: 'Custom description of what the tool does. This is for LLM to determine when to use this tool. Overrides the default description.',
rows: 4,
additionalParams: true,
optional: true,
placeholder: `Scrapes web pages recursively or via default sitemap. Extracts title, description, and paragraph text. Input should be a single URL string. Returns a JSON string array of scraped page data objects.`
}
];
}
async init(nodeData, _, _options) {
const scrapeMode = nodeData.inputs?.scrapeMode ?? 'recursive';
const useSitemap = scrapeMode === 'sitemap';
const maxDepthInput = nodeData.inputs?.maxDepth;
let maxDepth = 1;
if (maxDepthInput !== undefined && maxDepthInput !== '') {
const parsedDepth = parseInt(String(maxDepthInput), 10);
if (!isNaN(parsedDepth) && parsedDepth > 0) {
maxDepth = parsedDepth;
}
}
const maxPagesInput = nodeData.inputs?.maxPages;
let maxPages = 10;
if (maxPagesInput === undefined || maxPagesInput === '') {
maxPages = null;
}
else {
const parsedPages = parseInt(String(maxPagesInput), 10);
if (!isNaN(parsedPages) && parsedPages > 0) {
maxPages = parsedPages;
}
else if (parsedPages <= 0) {
maxPages = null;
}
}
const timeoutInputS = nodeData.inputs?.timeoutS;
let timeoutMs = 60000;
if (timeoutInputS !== undefined && timeoutInputS !== '') {
const parsedTimeoutS = parseFloat(String(timeoutInputS));
if (!isNaN(parsedTimeoutS) && parsedTimeoutS > 0) {
timeoutMs = Math.round(parsedTimeoutS * 1000);
}
}
const customDescription = nodeData.inputs?.description;
const tool = new WebScraperRecursiveTool(maxDepth, maxPages, timeoutMs, useSitemap);
if (customDescription) {
tool.description = customDescription;
}
return tool;
}
}
module.exports = { nodeClass: WebScraperRecursive_Tools };
//# sourceMappingURL=WebScraperTool.js.map