@langchain/community
Version:
Third-party integrations for LangChain.js
1 lines • 4.33 kB
Source Map (JSON)
{"version":3,"file":"spider.cjs","names":["BaseDocumentLoader","Spider","Document"],"sources":["../../../src/document_loaders/web/spider.ts"],"sourcesContent":["import { Spider } from \"@spider-cloud/spider-client\";\nimport { Document, type DocumentInterface } from \"@langchain/core/documents\";\nimport { getEnvironmentVariable } from \"@langchain/core/utils/env\";\nimport { BaseDocumentLoader } from \"@langchain/core/document_loaders/base\";\n\n/**\n * Interface representing the parameters for the Spider loader. It\n * includes properties such as the URL to scrape or crawl and the API key.\n */\ninterface SpiderLoaderParameters {\n /**\n * URL to scrape or crawl\n */\n url: string;\n\n /**\n * API key for Spider. If not provided, the default value is the value of the SPIDER_API_KEY environment variable.\n */\n apiKey?: string;\n\n /**\n * Mode of operation. Can be either \"crawl\" or \"scrape\". If not provided, the default value is \"scrape\".\n */\n mode?: \"crawl\" | \"scrape\";\n params?: Record<string, unknown>;\n}\ninterface SpiderDocument {\n content: string;\n metadata: Record<string, unknown>;\n}\n\n/**\n * Class representing a document loader for loading data from\n * Spider (spider.cloud). It extends the BaseDocumentLoader class.\n * @example\n * ```typescript\n * const loader = new SpiderLoader({\n * url: \"{url}\",\n * apiKey: \"{apiKey}\",\n * mode: \"crawl\"\n * });\n * const docs = await loader.load();\n * ```\n */\nexport class SpiderLoader extends BaseDocumentLoader {\n private apiKey: string;\n\n private url: string;\n\n private mode: \"crawl\" | \"scrape\";\n\n private params?: Record<string, unknown>;\n\n constructor(loaderParams: SpiderLoaderParameters) {\n super();\n const {\n apiKey = getEnvironmentVariable(\"SPIDER_API_KEY\"),\n url,\n mode = \"scrape\",\n params,\n } = loaderParams;\n if (!apiKey) {\n throw new Error(\n \"Spider API key not set. You can set it as SPIDER_API_KEY in your .env file, or pass it to Spider.\"\n );\n }\n\n this.apiKey = apiKey;\n this.url = url;\n this.mode = mode;\n this.params = params || { metadata: true, return_format: \"markdown\" };\n }\n\n /**\n * Loads the data from the Spider.\n * @returns An array of Documents representing the retrieved data.\n * @throws An error if the data could not be loaded.\n */\n public async load(): Promise<DocumentInterface[]> {\n const app = new Spider({ apiKey: this.apiKey });\n let spiderDocs: SpiderDocument[];\n\n if (this.mode === \"scrape\") {\n const response = await app.scrapeUrl(this.url, this.params);\n if (response.error) {\n throw new Error(\n `Spider: Failed to scrape URL. Error: ${response.error}`\n );\n }\n spiderDocs = response as SpiderDocument[];\n } else if (this.mode === \"crawl\") {\n const response = await app.crawlUrl(this.url, this.params);\n if (response.error) {\n throw new Error(\n `Spider: Failed to crawl URL. Error: ${response.error}`\n );\n }\n spiderDocs = response as SpiderDocument[];\n } else {\n throw new Error(\n `Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`\n );\n }\n\n return spiderDocs.map(\n (doc) =>\n new Document({\n pageContent: doc.content || \"\",\n metadata: doc.metadata || {},\n })\n );\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;AA4CA,IAAa,eAAb,cAAkCA,sCAAAA,mBAAmB;CACnD;CAEA;CAEA;CAEA;CAEA,YAAY,cAAsC;AAChD,SAAO;EACP,MAAM,EACJ,UAAA,GAAA,0BAAA,wBAAgC,iBAAiB,EACjD,KACA,OAAO,UACP,WACE;AACJ,MAAI,CAAC,OACH,OAAM,IAAI,MACR,oGACD;AAGH,OAAK,SAAS;AACd,OAAK,MAAM;AACX,OAAK,OAAO;AACZ,OAAK,SAAS,UAAU;GAAE,UAAU;GAAM,eAAe;GAAY;;;;;;;CAQvE,MAAa,OAAqC;EAChD,MAAM,MAAM,IAAIC,4BAAAA,OAAO,EAAE,QAAQ,KAAK,QAAQ,CAAC;EAC/C,IAAI;AAEJ,MAAI,KAAK,SAAS,UAAU;GAC1B,MAAM,WAAW,MAAM,IAAI,UAAU,KAAK,KAAK,KAAK,OAAO;AAC3D,OAAI,SAAS,MACX,OAAM,IAAI,MACR,wCAAwC,SAAS,QAClD;AAEH,gBAAa;aACJ,KAAK,SAAS,SAAS;GAChC,MAAM,WAAW,MAAM,IAAI,SAAS,KAAK,KAAK,KAAK,OAAO;AAC1D,OAAI,SAAS,MACX,OAAM,IAAI,MACR,uCAAuC,SAAS,QACjD;AAEH,gBAAa;QAEb,OAAM,IAAI,MACR,sBAAsB,KAAK,KAAK,uCACjC;AAGH,SAAO,WAAW,KACf,QACC,IAAIC,0BAAAA,SAAS;GACX,aAAa,IAAI,WAAW;GAC5B,UAAU,IAAI,YAAY,EAAE;GAC7B,CAAC,CACL"}