UNPKG

@langchain/community

Version:
154 lines (153 loc) 5.2 kB
import { __exportAll } from "../../_virtual/_rolldown/runtime.js";
import { AsyncCaller } from "@langchain/core/utils/async_caller";
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
import { JSDOM, VirtualConsole } from "jsdom";
import { isSameOrigin, validateSafeUrl } from "@langchain/core/utils/ssrf";

//#region src/document_loaders/web/recursive_url.ts
// `var` declarations are kept exactly as emitted by the bundler.
var recursive_url_exports = /* @__PURE__ */ __exportAll({ RecursiveUrlLoader: () => RecursiveUrlLoader });

// Shared jsdom console whose "error" events are handled by a no-op listener.
const virtualConsole = new VirtualConsole();
virtualConsole.on("error", () => {});

/** Maximum number of redirects followed for a single logical request. */
const MAX_REDIRECTS = 10;

/** HTTP status codes that are followed manually as redirects. */
const REDIRECT_CODES = new Set([301, 302, 303, 307, 308]);

/**
 * Document loader that fetches a root URL and then follows the `<a>` links
 * found on each fetched page, down to `maxDepth` levels, returning one
 * document (`{ pageContent, metadata }`) per successfully fetched page.
 */
var RecursiveUrlLoader = class extends BaseDocumentLoader {
  caller;
  url;
  excludeDirs;
  extractor;
  maxDepth;
  timeout;
  preventOutside;

  /**
   * @param {string} url - Root URL where the crawl starts.
   * @param {object} options - Loader options (required; it is read unconditionally).
   * @param {object} [options.callerOptions] - Spread over the default
   *   `AsyncCaller` settings (`maxConcurrency: 64`, `maxRetries: 0`).
   * @param {string[]} [options.excludeDirs=[]] - URL prefixes that are never followed.
   * @param {(html: string) => string} [options.extractor] - Maps raw HTML to
   *   `pageContent`; defaults to the identity function.
   * @param {number} [options.maxDepth=2] - Maximum recursion depth.
   * @param {number} [options.timeout=10000] - Per-request timeout in milliseconds.
   * @param {boolean} [options.preventOutside=true] - When true, links whose
   *   origin differs from the page they were found on are not followed.
   */
  constructor(url, options) {
    super();
    this.caller = new AsyncCaller({
      maxConcurrency: 64,
      maxRetries: 0,
      ...options.callerOptions
    });
    this.url = url;
    this.excludeDirs = options.excludeDirs ?? [];
    this.extractor = options.extractor ?? ((s) => s);
    this.maxDepth = options.maxDepth ?? 2;
    this.timeout = options.timeout ?? 1e4;
    this.preventOutside = options.preventOutside ?? true;
  }

  /**
   * Fetches `resource`, following redirects by hand so that every hop is
   * passed through `validateSafeUrl` before it is requested.
   *
   * @param {string} resource - URL to fetch.
   * @param {object} options - `fetch` init options plus `timeout` (milliseconds).
   * @returns {Promise<Response>} The first response whose status is not a redirect code.
   * @throws {Error} If a redirect has no `Location` header, or if more than
   *   `MAX_REDIRECTS` redirects are encountered. Errors thrown by
   *   `validateSafeUrl`, `new URL` or `fetch` propagate as well.
   */
  async fetchWithTimeout(resource, options) {
    const { timeout, ...rest } = options;
    let currentUrl = resource;
    // `<=` allows the initial request plus up to MAX_REDIRECTS redirect hops.
    for (let i = 0; i <= MAX_REDIRECTS; i++) {
      // NOTE(review): called without `await`, so this only guards the request
      // if validateSafeUrl is synchronous and throws on rejection - confirm.
      validateSafeUrl(currentUrl, { allowHttp: true });
      const response = await this.caller.call(() =>
        fetch(currentUrl, {
          ...rest,
          redirect: "manual",
          // A fresh timeout signal is created for every hop.
          signal: AbortSignal.timeout(timeout)
        })
      );
      if (REDIRECT_CODES.has(response.status)) {
        const location = response.headers.get("location");
        if (!location) throw new Error("Redirect response missing Location header");
        // Location may be relative; resolve it against the URL just requested.
        currentUrl = new URL(location, currentUrl).href;
        continue;
      }
      return response;
    }
    throw new Error(`Too many redirects (max ${MAX_REDIRECTS})`);
  }

  /**
   * Collects the links of all `<a>` elements in `html` that should be crawled.
   *
   * Links with an ignored prefix/suffix, links matching `excludeDirs`, and
   * (when `preventOutside` is set) links whose origin differs from `baseUrl`
   * are dropped. The origin check is applied to every resolved link, including
   * protocol-relative (`//host/...`) ones. Malformed links are skipped instead
   * of throwing.
   *
   * @param {string} html - Raw HTML of the page.
   * @param {string} baseUrl - URL of the page, used to resolve relative links.
   * @returns {string[]} De-duplicated links in order of first appearance.
   */
  getChildLinks(html, baseUrl) {
    const anchors = new JSDOM(html, { virtualConsole }).window.document.querySelectorAll("a");
    const allLinks = Array.from(anchors).map((a) => a.href);
    const invalidPrefixes = ["javascript:", "mailto:", "#"];
    const invalidSuffixes = [".css", ".js", ".ico", ".png", ".jpg", ".jpeg", ".gif", ".svg"];
    // A Set de-duplicates while preserving first-insertion order.
    const childLinks = new Set();
    for (const link of allLinks) {
      if (
        invalidPrefixes.some((prefix) => link.startsWith(prefix)) ||
        invalidSuffixes.some((suffix) => link.endsWith(suffix))
      ) continue;

      let standardizedLink;
      try {
        if (link.startsWith("http")) standardizedLink = link;
        else if (link.startsWith("//")) standardizedLink = new URL(baseUrl).protocol + link;
        else standardizedLink = new URL(link, baseUrl).href;
      } catch {
        // `new URL` throws on malformed input; one bad anchor must not abort
        // link extraction for the whole page.
        continue;
      }

      if (this.excludeDirs.some((exDir) => standardizedLink.startsWith(exDir))) continue;

      // Enforce preventOutside on the resolved link, whatever form the href
      // took. Assumes isSameOrigin accepts two URL strings (as it was already
      // called here) and returns false for unparsable input - TODO confirm.
      if (this.preventOutside && !isSameOrigin(standardizedLink, baseUrl)) continue;

      childLinks.add(standardizedLink);
    }
    return Array.from(childLinks);
  }

  /**
   * Builds the metadata object for a fetched page.
   *
   * @param {string} rawHtml - Raw HTML of the page.
   * @param {string} url - URL the page was requested from; stored as `source`.
   * @returns {object} `{ source }` plus `title`, `description` and `language`
   *   when the corresponding `<title>`, `<meta name=description>` and `<html>`
   *   elements exist (`description`/`language` hold the attribute value, which
   *   is `null` when the attribute is absent).
   */
  extractMetadata(rawHtml, url) {
    const metadata = { source: url };
    const { document } = new JSDOM(rawHtml, { virtualConsole }).window;
    const title = document.getElementsByTagName("title")[0];
    if (title) metadata.title = title.textContent;
    const description = document.querySelector("meta[name=description]");
    if (description) metadata.description = description.getAttribute("content");
    const html = document.getElementsByTagName("html")[0];
    if (html) metadata.language = html.getAttribute("lang");
    return metadata;
  }

  /**
   * Fetches `url` and converts the response body into a document.
   *
   * @param {string} url - Page to fetch.
   * @returns {Promise<{pageContent: string, metadata: object} | null>} The
   *   document, or `null` when fetching or reading the body fails.
   */
  async getUrlAsDoc(url) {
    let html;
    try {
      const response = await this.fetchWithTimeout(url, { timeout: this.timeout });
      html = await response.text();
    } catch {
      // Best effort: a page that cannot be fetched is simply left out.
      return null;
    }
    return {
      pageContent: this.extractor(html),
      metadata: this.extractMetadata(html, url)
    };
  }

  /**
   * Fetches `inputUrl`, loads every not-yet-visited link found on it and
   * recurses into links that end with `/`.
   *
   * @param {string} inputUrl - Page whose links are followed.
   * @param {Set<string>} [visited] - Links already handled; mutated in place
   *   and shared across the whole recursion.
   * @param {number} [depth=0] - Current recursion depth.
   * @returns {Promise<object[]>} Documents for all pages reached below
   *   `inputUrl` (not including `inputUrl` itself).
   */
  async getChildUrlsRecursive(inputUrl, visited = /* @__PURE__ */ new Set(), depth = 0) {
    if (depth >= this.maxDepth) return [];

    // The page is requested, and its links resolved, with a trailing slash.
    let url = inputUrl;
    if (!inputUrl.endsWith("/")) url += "/";
    if (this.excludeDirs.some((exDir) => url.startsWith(exDir))) return [];

    let html;
    try {
      const response = await this.fetchWithTimeout(url, { timeout: this.timeout });
      html = await response.text();
    } catch {
      return [];
    }

    const childUrls = this.getChildLinks(html, url);
    const results = await Promise.all(
      childUrls.map(async (childUrl) => {
        if (visited.has(childUrl)) return null;
        // Mark before the first await so concurrent siblings skip this link.
        visited.add(childUrl);
        const childDoc = await this.getUrlAsDoc(childUrl);
        if (!childDoc) return null;
        // Only links ending with "/" are crawled further.
        if (childUrl.endsWith("/")) {
          return [childDoc, ...await this.getChildUrlsRecursive(childUrl, visited, depth + 1)];
        }
        return [childDoc];
      })
    );
    // flat() unwraps the per-link arrays; the null placeholders are then dropped.
    return results.flat().filter((docs) => docs !== null);
  }

  /**
   * Loads the root URL and everything reachable from it.
   *
   * @returns {Promise<object[]>} The root document followed by all child
   *   documents, or an empty array when the root page cannot be fetched.
   */
  async load() {
    const rootDoc = await this.getUrlAsDoc(this.url);
    if (!rootDoc) return [];
    const docs = [rootDoc];
    docs.push(...await this.getChildUrlsRecursive(this.url, new Set([this.url])));
    return docs;
  }
};

//#endregion
export { RecursiveUrlLoader, recursive_url_exports };
//# sourceMappingURL=recursive_url.js.map