@langchain/community
Version:
Third-party integrations for LangChain.js
1 lines • 10.5 kB
Source Map (JSON)
{"version":3,"file":"recursive_url.cjs","names":["VirtualConsole","BaseDocumentLoader","AsyncCaller","JSDOM"],"sources":["../../../src/document_loaders/web/recursive_url.ts"],"sourcesContent":["import { JSDOM, VirtualConsole } from \"jsdom\";\nimport { Document } from \"@langchain/core/documents\";\nimport { AsyncCaller } from \"@langchain/core/utils/async_caller\";\nimport { isSameOrigin, validateSafeUrl } from \"@langchain/core/utils/ssrf\";\nimport {\n BaseDocumentLoader,\n DocumentLoader,\n} from \"@langchain/core/document_loaders/base\";\n\nconst virtualConsole = new VirtualConsole();\nvirtualConsole.on(\"error\", () => {});\n\nconst MAX_REDIRECTS = 10;\nconst REDIRECT_CODES = new Set([301, 302, 303, 307, 308]);\n\nexport interface RecursiveUrlLoaderOptions {\n excludeDirs?: string[];\n extractor?: (text: string) => string;\n maxDepth?: number;\n timeout?: number;\n preventOutside?: boolean;\n callerOptions?: ConstructorParameters<typeof AsyncCaller>[0];\n}\n\nexport class RecursiveUrlLoader\n extends BaseDocumentLoader\n implements DocumentLoader\n{\n private caller: AsyncCaller;\n\n private url: string;\n\n private excludeDirs: string[];\n\n private extractor: (text: string) => string;\n\n private maxDepth: number;\n\n private timeout: number;\n\n private preventOutside: boolean;\n\n constructor(url: string, options: RecursiveUrlLoaderOptions) {\n super();\n\n this.caller = new AsyncCaller({\n maxConcurrency: 64,\n maxRetries: 0,\n ...options.callerOptions,\n });\n\n this.url = url;\n this.excludeDirs = options.excludeDirs ?? [];\n this.extractor = options.extractor ?? ((s: string) => s);\n this.maxDepth = options.maxDepth ?? 2;\n this.timeout = options.timeout ?? 10000;\n this.preventOutside = options.preventOutside ?? true;\n }\n\n private async fetchWithTimeout(\n resource: string,\n options: { timeout: number } & RequestInit\n ): Promise<Response> {\n const { timeout, ...rest } = options;\n let currentUrl = resource;\n\n for (let i = 0; i <= MAX_REDIRECTS; i++) {\n validateSafeUrl(currentUrl, { allowHttp: true });\n\n const response = await this.caller.call(() =>\n fetch(currentUrl, {\n ...rest,\n redirect: \"manual\",\n signal: AbortSignal.timeout(timeout),\n })\n );\n\n if (REDIRECT_CODES.has(response.status)) {\n const location = response.headers.get(\"location\");\n if (!location) {\n throw new Error(\"Redirect response missing Location header\");\n }\n currentUrl = new URL(location, currentUrl).href;\n continue;\n }\n\n return response;\n }\n\n throw new Error(`Too many redirects (max ${MAX_REDIRECTS})`);\n }\n\n private getChildLinks(html: string, baseUrl: string): Array<string> {\n const allLinks = Array.from(\n new JSDOM(html, { virtualConsole }).window.document.querySelectorAll(\"a\")\n ).map((a) => a.href);\n const absolutePaths = [];\n const invalidPrefixes = [\"javascript:\", \"mailto:\", \"#\"];\n const invalidSuffixes = [\n \".css\",\n \".js\",\n \".ico\",\n \".png\",\n \".jpg\",\n \".jpeg\",\n \".gif\",\n \".svg\",\n ];\n\n for (const link of allLinks) {\n if (\n invalidPrefixes.some((prefix) => link.startsWith(prefix)) ||\n invalidSuffixes.some((suffix) => link.endsWith(suffix))\n )\n continue;\n\n let standardizedLink: string;\n\n if (link.startsWith(\"http\")) {\n standardizedLink = link;\n } else if (link.startsWith(\"//\")) {\n const base = new URL(baseUrl);\n standardizedLink = base.protocol + link;\n } else {\n standardizedLink = new URL(link, baseUrl).href;\n }\n\n if (this.excludeDirs.some((exDir) => standardizedLink.startsWith(exDir)))\n continue;\n\n if (link.startsWith(\"http\")) {\n const isAllowed = !this.preventOutside || isSameOrigin(link, baseUrl);\n if (isAllowed) absolutePaths.push(link);\n } else if (link.startsWith(\"//\")) {\n const base = new URL(baseUrl);\n absolutePaths.push(base.protocol + link);\n } else {\n const newLink = new URL(link, baseUrl).href;\n absolutePaths.push(newLink);\n }\n }\n\n return Array.from(new Set(absolutePaths));\n }\n\n private extractMetadata(rawHtml: string, url: string) {\n // oxlint-disable-next-line typescript/no-explicit-any\n const metadata: Record<string, any> = { source: url };\n const { document } = new JSDOM(rawHtml, { virtualConsole }).window;\n\n const title = document.getElementsByTagName(\"title\")[0];\n if (title) {\n metadata.title = title.textContent;\n }\n\n const description = document.querySelector(\"meta[name=description]\");\n if (description) {\n metadata.description = description.getAttribute(\"content\");\n }\n\n const html = document.getElementsByTagName(\"html\")[0];\n if (html) {\n metadata.language = html.getAttribute(\"lang\");\n }\n\n return metadata;\n }\n\n private async getUrlAsDoc(url: string): Promise<Document | null> {\n let res;\n try {\n res = await this.fetchWithTimeout(url, { timeout: this.timeout });\n res = await res.text();\n } catch {\n return null;\n }\n\n return {\n pageContent: this.extractor(res),\n metadata: this.extractMetadata(res, url),\n };\n }\n\n private async getChildUrlsRecursive(\n inputUrl: string,\n visited: Set<string> = new Set<string>(),\n depth = 0\n ): Promise<Document[]> {\n if (depth >= this.maxDepth) return [];\n\n let url = inputUrl;\n if (!inputUrl.endsWith(\"/\")) url += \"/\";\n\n const isExcluded = this.excludeDirs.some((exDir) => url.startsWith(exDir));\n if (isExcluded) return [];\n\n let res;\n try {\n res = await this.fetchWithTimeout(url, { timeout: this.timeout });\n res = await res.text();\n } catch {\n return [];\n }\n\n const childUrls: string[] = this.getChildLinks(res, url);\n\n const results = await Promise.all(\n childUrls.map((childUrl) =>\n (async () => {\n if (visited.has(childUrl)) return null;\n visited.add(childUrl);\n\n const childDoc = await this.getUrlAsDoc(childUrl);\n if (!childDoc) return null;\n\n if (childUrl.endsWith(\"/\")) {\n const childUrlResponses = await this.getChildUrlsRecursive(\n childUrl,\n visited,\n depth + 1\n );\n return [childDoc, ...childUrlResponses];\n }\n\n return [childDoc];\n })()\n )\n );\n\n return results.flat().filter((docs) => docs !== null) as Document[];\n }\n\n async load(): Promise<Document[]> {\n const rootDoc = await this.getUrlAsDoc(this.url);\n if (!rootDoc) return [];\n\n const docs = [rootDoc];\n docs.push(\n ...(await this.getChildUrlsRecursive(this.url, new Set([this.url])))\n );\n return docs;\n }\n}\n"],"mappings":";;;;;;;;AASA,MAAM,iBAAiB,IAAIA,MAAAA,gBAAgB;AAC3C,eAAe,GAAG,eAAe,GAAG;AAEpC,MAAM,gBAAgB;AACtB,MAAM,iBAAiB,IAAI,IAAI;CAAC;CAAK;CAAK;CAAK;CAAK;CAAI,CAAC;AAWzD,IAAa,qBAAb,cACUC,sCAAAA,mBAEV;CACE;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA,YAAY,KAAa,SAAoC;AAC3D,SAAO;AAEP,OAAK,SAAS,IAAIC,mCAAAA,YAAY;GAC5B,gBAAgB;GAChB,YAAY;GACZ,GAAG,QAAQ;GACZ,CAAC;AAEF,OAAK,MAAM;AACX,OAAK,cAAc,QAAQ,eAAe,EAAE;AAC5C,OAAK,YAAY,QAAQ,eAAe,MAAc;AACtD,OAAK,WAAW,QAAQ,YAAY;AACpC,OAAK,UAAU,QAAQ,WAAW;AAClC,OAAK,iBAAiB,QAAQ,kBAAkB;;CAGlD,MAAc,iBACZ,UACA,SACmB;EACnB,MAAM,EAAE,SAAS,GAAG,SAAS;EAC7B,IAAI,aAAa;AAEjB,OAAK,IAAI,IAAI,GAAG,KAAK,eAAe,KAAK;AACvC,IAAA,GAAA,2BAAA,iBAAgB,YAAY,EAAE,WAAW,MAAM,CAAC;GAEhD,MAAM,WAAW,MAAM,KAAK,OAAO,WACjC,MAAM,YAAY;IAChB,GAAG;IACH,UAAU;IACV,QAAQ,YAAY,QAAQ,QAAQ;IACrC,CAAC,CACH;AAED,OAAI,eAAe,IAAI,SAAS,OAAO,EAAE;IACvC,MAAM,WAAW,SAAS,QAAQ,IAAI,WAAW;AACjD,QAAI,CAAC,SACH,OAAM,IAAI,MAAM,4CAA4C;AAE9D,iBAAa,IAAI,IAAI,UAAU,WAAW,CAAC;AAC3C;;AAGF,UAAO;;AAGT,QAAM,IAAI,MAAM,2BAA2B,cAAc,GAAG;;CAG9D,cAAsB,MAAc,SAAgC;EAClE,MAAM,WAAW,MAAM,KACrB,IAAIC,MAAAA,MAAM,MAAM,EAAE,gBAAgB,CAAC,CAAC,OAAO,SAAS,iBAAiB,IAAI,CAC1E,CAAC,KAAK,MAAM,EAAE,KAAK;EACpB,MAAM,gBAAgB,EAAE;EACxB,MAAM,kBAAkB;GAAC;GAAe;GAAW;GAAI;EACvD,MAAM,kBAAkB;GACtB;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACD;AAED,OAAK,MAAM,QAAQ,UAAU;AAC3B,OACE,gBAAgB,MAAM,WAAW,KAAK,WAAW,OAAO,CAAC,IACzD,gBAAgB,MAAM,WAAW,KAAK,SAAS,OAAO,CAAC,CAEvD;GAEF,IAAI;AAEJ,OAAI,KAAK,WAAW,OAAO,CACzB,oBAAmB;YACV,KAAK,WAAW,KAAK,CAE9B,oBADa,IAAI,IAAI,QAAQ,CACL,WAAW;OAEnC,oBAAmB,IAAI,IAAI,MAAM,QAAQ,CAAC;AAG5C,OAAI,KAAK,YAAY,MAAM,UAAU,iBAAiB,WAAW,MAAM,CAAC,CACtE;AAEF,OAAI,KAAK,WAAW,OAAO;QACP,CAAC,KAAK,mBAAA,GAAA,2BAAA,cAA+B,MAAM,QAAQ,CACtD,eAAc,KAAK,KAAK;cAC9B,KAAK,WAAW,KAAK,EAAE;IAChC,MAAM,OAAO,IAAI,IAAI,QAAQ;AAC7B,kBAAc,KAAK,KAAK,WAAW,KAAK;UACnC;IACL,MAAM,UAAU,IAAI,IAAI,MAAM,QAAQ,CAAC;AACvC,kBAAc,KAAK,QAAQ;;;AAI/B,SAAO,MAAM,KAAK,IAAI,IAAI,cAAc,CAAC;;CAG3C,gBAAwB,SAAiB,KAAa;EAEpD,MAAM,WAAgC,EAAE,QAAQ,KAAK;EACrD,MAAM,EAAE,aAAa,IAAIA,MAAAA,MAAM,SAAS,EAAE,gBAAgB,CAAC,CAAC;EAE5D,MAAM,QAAQ,SAAS,qBAAqB,QAAQ,CAAC;AACrD,MAAI,MACF,UAAS,QAAQ,MAAM;EAGzB,MAAM,cAAc,SAAS,cAAc,yBAAyB;AACpE,MAAI,YACF,UAAS,cAAc,YAAY,aAAa,UAAU;EAG5D,MAAM,OAAO,SAAS,qBAAqB,OAAO,CAAC;AACnD,MAAI,KACF,UAAS,WAAW,KAAK,aAAa,OAAO;AAG/C,SAAO;;CAGT,MAAc,YAAY,KAAuC;EAC/D,IAAI;AACJ,MAAI;AACF,SAAM,MAAM,KAAK,iBAAiB,KAAK,EAAE,SAAS,KAAK,SAAS,CAAC;AACjE,SAAM,MAAM,IAAI,MAAM;UAChB;AACN,UAAO;;AAGT,SAAO;GACL,aAAa,KAAK,UAAU,IAAI;GAChC,UAAU,KAAK,gBAAgB,KAAK,IAAI;GACzC;;CAGH,MAAc,sBACZ,UACA,0BAAuB,IAAI,KAAa,EACxC,QAAQ,GACa;AACrB,MAAI,SAAS,KAAK,SAAU,QAAO,EAAE;EAErC,IAAI,MAAM;AACV,MAAI,CAAC,SAAS,SAAS,IAAI,CAAE,QAAO;AAGpC,MADmB,KAAK,YAAY,MAAM,UAAU,IAAI,WAAW,MAAM,CAAC,CAC1D,QAAO,EAAE;EAEzB,IAAI;AACJ,MAAI;AACF,SAAM,MAAM,KAAK,iBAAiB,KAAK,EAAE,SAAS,KAAK,SAAS,CAAC;AACjE,SAAM,MAAM,IAAI,MAAM;UAChB;AACN,UAAO,EAAE;;EAGX,MAAM,YAAsB,KAAK,cAAc,KAAK,IAAI;AAyBxD,UAvBgB,MAAM,QAAQ,IAC5B,UAAU,KAAK,cACZ,YAAY;AACX,OAAI,QAAQ,IAAI,SAAS,CAAE,QAAO;AAClC,WAAQ,IAAI,SAAS;GAErB,MAAM,WAAW,MAAM,KAAK,YAAY,SAAS;AACjD,OAAI,CAAC,SAAU,QAAO;AAEtB,OAAI,SAAS,SAAS,IAAI,CAMxB,QAAO,CAAC,UAAU,GALQ,MAAM,KAAK,sBACnC,UACA,SACA,QAAQ,EACT,CACsC;AAGzC,UAAO,CAAC,SAAS;MACf,CACL,CACF,EAEc,MAAM,CAAC,QAAQ,SAAS,SAAS,KAAK;;CAGvD,MAAM,OAA4B;EAChC,MAAM,UAAU,MAAM,KAAK,YAAY,KAAK,IAAI;AAChD,MAAI,CAAC,QAAS,QAAO,EAAE;EAEvB,MAAM,OAAO,CAAC,QAAQ;AACtB,OAAK,KACH,GAAI,MAAM,KAAK,sBAAsB,KAAK,KAAK,IAAI,IAAI,CAAC,KAAK,IAAI,CAAC,CAAC,CACpE;AACD,SAAO"}