UNPKG

@langchain/community

Version:
96 lines (95 loc) 3.16 kB
import { __exportAll } from "../../_virtual/_rolldown/runtime.js";
import { CheerioWebBaseLoader } from "./cheerio.js";
import { Document } from "@langchain/core/documents";
import { chunkArray } from "@langchain/core/utils/chunk_array";

//#region src/document_loaders/web/sitemap.ts
// Bundler-emitted namespace object; the getter resolves `SitemapLoader` lazily,
// so the `var` declarations below are kept exactly as generated.
var sitemap_exports = /* @__PURE__ */ __exportAll({ SitemapLoader: () => SitemapLoader });

/** Batch size used by `load()` when the caller does not provide `chunkSize`. */
const DEFAULT_CHUNK_SIZE = 300;

/**
 * Loader that reads a sitemap XML document and turns every listed location
 * into a `Document` by scraping it through `CheerioWebBaseLoader`.
 */
var SitemapLoader = class extends CheerioWebBaseLoader {
	/** Patterns taken from `params.filterUrls`; see `_checkUrlPatterns`. */
	allowUrlPatterns;

	/** Number of sitemap entries scraped per batch in `load()`. */
	chunkSize;

	/**
	 * @param {string} webPath - Site root or sitemap URL. A single trailing "/"
	 *   is dropped, and "/sitemap.xml" is appended unless the path already
	 *   ends with ".xml".
	 * @param {object} [params] - Options forwarded to the base loader. This
	 *   class itself reads `filterUrls` and `chunkSize`.
	 */
	constructor(webPath, params = {}) {
		// `??` (rather than spreading params over the default) keeps the default
		// when the caller passes `chunkSize: undefined` or `null` explicitly.
		const paramsWithDefaults = {
			...params,
			chunkSize: params?.chunkSize ?? DEFAULT_CHUNK_SIZE
		};
		const withoutTrailingSlash = webPath.endsWith("/") ? webPath.slice(0, -1) : webPath;
		const path = withoutTrailingSlash.endsWith(".xml")
			? withoutTrailingSlash
			: `${withoutTrailingSlash}/sitemap.xml`;
		super(path, paramsWithDefaults);
		// Store the normalized sitemap URL; `parseSitemap` fetches from it.
		this.webPath = path;
		this.allowUrlPatterns = paramsWithDefaults.filterUrls;
		this.chunkSize = paramsWithDefaults.chunkSize;
	}

	/**
	 * Tests a URL against the configured `filterUrls` patterns.
	 *
	 * `parseSitemap` SKIPS a `<url>` entry when this method returns `true`.
	 *
	 * NOTE(review): the field is named `allowUrlPatterns`, yet URLs matching
	 * every pattern are the ones dropped. Behavior is preserved as-is here;
	 * confirm the intended semantics before changing it.
	 *
	 * @param {string} url - Location text read from the sitemap.
	 * @returns {boolean} `false` when no patterns are configured; otherwise
	 *   `true` only if every pattern matches `url`.
	 */
	_checkUrlPatterns(url) {
		if (!this.allowUrlPatterns) return false;
		// A fresh RegExp per test avoids carrying `lastIndex` state between
		// calls when a caller supplies a global or sticky pattern.
		return this.allowUrlPatterns.every((pattern) => new RegExp(pattern).test(url));
	}

	/**
	 * Fetches the sitemap at `this.webPath` in XML mode and collects its entries.
	 *
	 * `<url>` entries are passed through `_checkUrlPatterns`; `<sitemap>`
	 * entries are collected without filtering. Entries with empty `<loc>` text
	 * are ignored.
	 *
	 * @returns {Promise<Array<{loc: string, changefreq: string, lastmod: string, priority: string}>>}
	 *   All `<url>` entries first, followed by all `<sitemap>` entries.
	 */
	async parseSitemap() {
		const $ = await CheerioWebBaseLoader._scrape(this.webPath, this.caller, this.timeout, this.textDecoder, {
			xmlMode: true,
			xml: true
		});
		const elements = [];
		// Shared extraction for both element kinds. The callback must not return
		// a value: cheerio's `.each` stops iterating when it receives `false`.
		const collect = (selector, shouldSkip) => {
			$(selector).each((_, element) => {
				const node = $(element);
				const loc = node.find("loc").text();
				if (!loc) return;
				if (shouldSkip?.(loc)) return;
				elements.push({
					loc,
					changefreq: node.find("changefreq").text(),
					lastmod: node.find("lastmod").text(),
					priority: node.find("priority").text()
				});
			});
		};
		collect("url", (loc) => this._checkUrlPatterns(loc));
		collect("sitemap");
		return elements;
	}

	/**
	 * Scrapes every location in `elements` and builds one `Document` per page.
	 *
	 * Page content is the text of `this.selector` (set outside this class,
	 * presumably by the base loader). Metadata carries the sitemap fields
	 * other than `loc`, the page's `description` / `og:title` / `og:locale`
	 * meta values, and the trimmed `loc` as `source`.
	 *
	 * @param {Array<{loc: string}>} elements - Entries as returned by `parseSitemap`.
	 * @returns {Promise<Document[]>} Documents in the same order as `elements`.
	 * @throws {Error} If a scraped page has no sitemap entry at the same index.
	 */
	async _loadSitemapUrls(elements) {
		const urls = elements.map((ele) => ele.loc);
		const pages = await CheerioWebBaseLoader.scrapeAll(urls, this.caller, this.timeout, this.textDecoder);
		return pages.map(($, i) => {
			const entry = elements[i];
			if (!entry) throw new Error("Scraped docs and elements not in sync");
			const text = $(this.selector).text();
			const { loc: source, ...metadata } = entry;
			const description = $("meta[name='description']").attr("content");
			const title = $("meta[property='og:title']").attr("content");
			const lang = $("meta[property='og:locale']").attr("content");
			return new Document({
				pageContent: text,
				metadata: {
					...metadata,
					description,
					title,
					lang,
					source: source.trim()
				}
			});
		});
	}

	/**
	 * Parses the sitemap, then scrapes its entries batch by batch
	 * (`this.chunkSize` entries per batch, batches processed sequentially).
	 *
	 * @returns {Promise<Document[]>} Documents for every collected sitemap entry.
	 */
	async load() {
		const elements = await this.parseSitemap();
		const documents = [];
		// The chunks are a plain array, so a synchronous `for...of` is enough.
		for (const chunk of chunkArray(elements, this.chunkSize)) {
			const chunkedDocuments = await this._loadSitemapUrls(chunk);
			documents.push(...chunkedDocuments);
		}
		return documents;
	}
};

//#endregion
export { SitemapLoader, sitemap_exports };
//# sourceMappingURL=sitemap.js.map