@langchain/community
Version:
Third-party integrations for LangChain.js
1 lines • 5.34 kB
Source Map (JSON)
{"version":3,"file":"gitbook.cjs","names":["CheerioWebBaseLoader","Document"],"sources":["../../../src/document_loaders/web/gitbook.ts"],"sourcesContent":["import type { CheerioAPI } from \"cheerio\";\nimport { Document } from \"@langchain/core/documents\";\nimport { CheerioWebBaseLoader } from \"./cheerio.js\";\n\n/**\n * Interface representing the parameters for configuring the\n * GitbookLoader. It has an optional property shouldLoadAllPaths, which\n * indicates whether all paths should be loaded.\n */\ninterface GitbookLoaderParams {\n shouldLoadAllPaths?: boolean;\n}\n\n/**\n * Class representing a document loader specifically designed for loading\n * documents from Gitbook. It extends the CheerioWebBaseLoader.\n */\nexport class GitbookLoader extends CheerioWebBaseLoader {\n shouldLoadAllPaths = false;\n\n private readonly baseUrl: string;\n\n constructor(\n public webPath: string,\n params: GitbookLoaderParams = {}\n ) {\n const path =\n params.shouldLoadAllPaths === true ? `${webPath}/sitemap.xml` : webPath;\n super(path);\n\n this.baseUrl = webPath;\n this.webPath = path;\n\n this.shouldLoadAllPaths =\n params.shouldLoadAllPaths ?? this.shouldLoadAllPaths;\n }\n\n /**\n * Method that scrapes the web document using Cheerio and loads the\n * content based on the value of shouldLoadAllPaths. If shouldLoadAllPaths\n * is true, it calls the loadAllPaths() method to load all paths.\n * Otherwise, it calls the loadPath() method to load a single path.\n * @returns Promise resolving to an array of Document instances.\n */\n public async load(): Promise<Document[]> {\n const $ = await this.scrape();\n\n if (this.shouldLoadAllPaths === true) {\n return this.loadAllPaths($);\n }\n return this.loadPath($);\n }\n\n /**\n * Private method that loads the content of a single path from the Gitbook\n * web document. It extracts the page content by selecting all elements\n * inside the \"main\" element, filters out empty text nodes, and joins the\n * remaining text nodes with line breaks. It extracts the title by\n * selecting the first \"h1\" element inside the \"main\" element. It creates\n * a Document instance with the extracted page content and metadata\n * containing the source URL and title.\n * @param $ CheerioAPI instance representing the loaded web document.\n * @param url Optional string representing the URL of the web document.\n * @returns Array of Document instances.\n */\n private loadPath($: CheerioAPI, url?: string): Document[] {\n const pageContent = $(\"main *\")\n .contents()\n .toArray()\n .map((element) =>\n element.type === \"text\" ? $(element).text().trim() : null\n )\n .filter((text) => text)\n .join(\"\\n\");\n\n const title = $(\"main h1\").first().text().trim();\n\n return [\n new Document({\n pageContent,\n metadata: { source: url ?? this.webPath, title },\n }),\n ];\n }\n\n /**\n * Private method that loads the content of all paths from the Gitbook web\n * document. It extracts the URLs of all paths from the \"loc\" elements in\n * the sitemap.xml. It iterates over each URL, scrapes the web document\n * using the _scrape() method, and calls the loadPath() method to load the\n * content of each path. It collects all the loaded documents and returns\n * them as an array.\n * @param $ CheerioAPI instance representing the loaded web document.\n * @returns Promise resolving to an array of Document instances.\n */\n private async loadAllPaths($: CheerioAPI): Promise<Document[]> {\n const urls = $(\"loc\")\n .toArray()\n .map((element) => $(element).text());\n\n const documents: Document[] = [];\n for (const url of urls) {\n const buildUrl = url.includes(this.baseUrl) ? url : this.baseUrl + url;\n console.log(`Fetching text from ${buildUrl}`);\n const html = await GitbookLoader._scrape(\n buildUrl,\n this.caller,\n this.timeout\n );\n documents.push(...this.loadPath(html, buildUrl));\n }\n console.log(`Fetched ${documents.length} documents.`);\n return documents;\n }\n}\n"],"mappings":";;;;;;;;;;AAiBA,IAAa,gBAAb,MAAa,sBAAsBA,qCAAAA,qBAAqB;CACtD,qBAAqB;CAErB;CAEA,YACE,SACA,SAA8B,EAAE,EAChC;EACA,MAAM,OACJ,OAAO,uBAAuB,OAAO,GAAG,QAAQ,gBAAgB;AAClE,QAAM,KAAK;AALJ,OAAA,UAAA;AAOP,OAAK,UAAU;AACf,OAAK,UAAU;AAEf,OAAK,qBACH,OAAO,sBAAsB,KAAK;;;;;;;;;CAUtC,MAAa,OAA4B;EACvC,MAAM,IAAI,MAAM,KAAK,QAAQ;AAE7B,MAAI,KAAK,uBAAuB,KAC9B,QAAO,KAAK,aAAa,EAAE;AAE7B,SAAO,KAAK,SAAS,EAAE;;;;;;;;;;;;;;CAezB,SAAiB,GAAe,KAA0B;EACxD,MAAM,cAAc,EAAE,SAAS,CAC5B,UAAU,CACV,SAAS,CACT,KAAK,YACJ,QAAQ,SAAS,SAAS,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,GAAG,KACtD,CACA,QAAQ,SAAS,KAAK,CACtB,KAAK,KAAK;EAEb,MAAM,QAAQ,EAAE,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM;AAEhD,SAAO,CACL,IAAIC,0BAAAA,SAAS;GACX;GACA,UAAU;IAAE,QAAQ,OAAO,KAAK;IAAS;IAAO;GACjD,CAAC,CACH;;;;;;;;;;;;CAaH,MAAc,aAAa,GAAoC;EAC7D,MAAM,OAAO,EAAE,MAAM,CAClB,SAAS,CACT,KAAK,YAAY,EAAE,QAAQ,CAAC,MAAM,CAAC;EAEtC,MAAM,YAAwB,EAAE;AAChC,OAAK,MAAM,OAAO,MAAM;GACtB,MAAM,WAAW,IAAI,SAAS,KAAK,QAAQ,GAAG,MAAM,KAAK,UAAU;AACnE,WAAQ,IAAI,sBAAsB,WAAW;GAC7C,MAAM,OAAO,MAAM,cAAc,QAC/B,UACA,KAAK,QACL,KAAK,QACN;AACD,aAAU,KAAK,GAAG,KAAK,SAAS,MAAM,SAAS,CAAC;;AAElD,UAAQ,IAAI,WAAW,UAAU,OAAO,aAAa;AACrD,SAAO"}