UNPKG

@langchain/community

Version:
1 lines 6.12 kB
{"version":3,"file":"cheerio.cjs","names":["BaseDocumentLoader","AsyncCaller","Document"],"sources":["../../../src/document_loaders/web/cheerio.ts"],"sourcesContent":["import type {\n CheerioAPI,\n CheerioOptions,\n load as LoadT,\n SelectorType,\n} from \"cheerio\";\nimport { Document } from \"@langchain/core/documents\";\nimport { AsyncCaller } from \"@langchain/core/utils/async_caller\";\nimport { BaseDocumentLoader } from \"@langchain/core/document_loaders/base\";\nimport type { WebBaseLoaderParams, WebBaseLoader } from \"./html.js\";\n\n/**\n * Represents the parameters for configuring the CheerioWebBaseLoader. It\n * extends the WebBaseLoaderParams interface and adds additional parameters\n * specific to loading with Cheerio.\n */\nexport interface CheerioWebBaseLoaderParams extends WebBaseLoaderParams {\n /**\n * The selector to use to extract the text from the document. Defaults to\n * \"body\".\n */\n selector?: SelectorType;\n}\n\n/**\n * A class that extends the BaseDocumentLoader and implements the\n * DocumentLoader interface. It represents a document loader for loading\n * web-based documents using Cheerio.\n * @example\n * ```typescript\n * const loader = new CheerioWebBaseLoader(\"https://exampleurl.com\");\n * const docs = await loader.load();\n * console.log({ docs });\n * ```\n */\nexport class CheerioWebBaseLoader\n extends BaseDocumentLoader\n implements WebBaseLoader\n{\n timeout: number;\n\n caller: AsyncCaller;\n\n selector?: SelectorType;\n\n textDecoder?: TextDecoder;\n\n headers?: HeadersInit;\n\n constructor(\n public webPath: string,\n fields?: CheerioWebBaseLoaderParams\n ) {\n super();\n const { timeout, selector, textDecoder, headers, ...rest } = fields ?? {};\n this.timeout = timeout ?? 10000;\n this.caller = new AsyncCaller(rest);\n this.selector = selector ?? \"body\";\n this.textDecoder = textDecoder;\n this.headers = headers;\n }\n\n /**\n * Fetches web documents from the given array of URLs and loads them using Cheerio.\n * It returns an array of CheerioAPI instances.\n * @param urls An array of URLs to fetch and load.\n * @returns A Promise that resolves to an array of CheerioAPI instances.\n */\n static async scrapeAll(\n urls: string[],\n caller: AsyncCaller,\n timeout: number | undefined,\n textDecoder?: TextDecoder,\n options?: CheerioOptions & {\n headers?: HeadersInit;\n }\n ): Promise<CheerioAPI[]> {\n return Promise.all(\n urls.map((url) =>\n CheerioWebBaseLoader._scrape(url, caller, timeout, textDecoder, options)\n )\n );\n }\n\n static async _scrape(\n url: string,\n caller: AsyncCaller,\n timeout: number | undefined,\n textDecoder?: TextDecoder,\n options?: CheerioOptions & {\n headers?: HeadersInit;\n }\n ): Promise<CheerioAPI> {\n const { headers, ...cheerioOptions } = options ?? {};\n const { load } = await CheerioWebBaseLoader.imports();\n const response = await caller.call(fetch, url, {\n signal: timeout ? AbortSignal.timeout(timeout) : undefined,\n headers,\n });\n const html =\n textDecoder?.decode(await response.arrayBuffer()) ??\n (await response.text());\n return load(html, cheerioOptions);\n }\n\n /**\n * Fetches the web document from the webPath and loads it using Cheerio.\n * It returns a CheerioAPI instance.\n * @returns A Promise that resolves to a CheerioAPI instance.\n */\n async scrape(): Promise<CheerioAPI> {\n const options = { headers: this.headers };\n return CheerioWebBaseLoader._scrape(\n this.webPath,\n this.caller,\n this.timeout,\n this.textDecoder,\n options\n );\n }\n\n /**\n * Extracts the text content from the loaded document using the selector\n * and creates a Document instance with the extracted text and metadata.\n * It returns an array of Document instances.\n * @returns A Promise that resolves to an array of Document instances.\n */\n async load(): Promise<Document[]> {\n const $ = await this.scrape();\n const title = $(\"title\").text();\n const text = $(this.selector).text();\n const metadata = { source: this.webPath, title };\n return [new Document({ pageContent: text, metadata })];\n }\n\n /**\n * A static method that dynamically imports the Cheerio library and\n * returns the load function. If the import fails, it throws an error.\n * @returns A Promise that resolves to an object containing the load function from the Cheerio library.\n */\n static async imports(): Promise<{\n load: typeof LoadT;\n }> {\n try {\n const { load } = await import(\"cheerio\");\n return { load };\n } catch (e) {\n console.error(e);\n throw new Error(\n \"Please install cheerio as a dependency with, e.g. `pnpm install cheerio`\"\n );\n }\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAmCA,IAAa,uBAAb,MAAa,6BACHA,sCAAAA,mBAEV;CACE;CAEA;CAEA;CAEA;CAEA;CAEA,YACE,SACA,QACA;AACA,SAAO;AAHA,OAAA,UAAA;EAIP,MAAM,EAAE,SAAS,UAAU,aAAa,SAAS,GAAG,SAAS,UAAU,EAAE;AACzE,OAAK,UAAU,WAAW;AAC1B,OAAK,SAAS,IAAIC,mCAAAA,YAAY,KAAK;AACnC,OAAK,WAAW,YAAY;AAC5B,OAAK,cAAc;AACnB,OAAK,UAAU;;;;;;;;CASjB,aAAa,UACX,MACA,QACA,SACA,aACA,SAGuB;AACvB,SAAO,QAAQ,IACb,KAAK,KAAK,QACR,qBAAqB,QAAQ,KAAK,QAAQ,SAAS,aAAa,QAAQ,CACzE,CACF;;CAGH,aAAa,QACX,KACA,QACA,SACA,aACA,SAGqB;EACrB,MAAM,EAAE,SAAS,GAAG,mBAAmB,WAAW,EAAE;EACpD,MAAM,EAAE,SAAS,MAAM,qBAAqB,SAAS;EACrD,MAAM,WAAW,MAAM,OAAO,KAAK,OAAO,KAAK;GAC7C,QAAQ,UAAU,YAAY,QAAQ,QAAQ,GAAG,KAAA;GACjD;GACD,CAAC;AAIF,SAAO,KAFL,aAAa,OAAO,MAAM,SAAS,aAAa,CAAC,IAChD,MAAM,SAAS,MAAM,EACN,eAAe;;;;;;;CAQnC,MAAM,SAA8B;EAClC,MAAM,UAAU,EAAE,SAAS,KAAK,SAAS;AACzC,SAAO,qBAAqB,QAC1B,KAAK,SACL,KAAK,QACL,KAAK,SACL,KAAK,aACL,QACD;;;;;;;;CASH,MAAM,OAA4B;EAChC,MAAM,IAAI,MAAM,KAAK,QAAQ;EAC7B,MAAM,QAAQ,EAAE,QAAQ,CAAC,MAAM;AAG/B,SAAO,CAAC,IAAIC,0BAAAA,SAAS;GAAE,aAFV,EAAE,KAAK,SAAS,CAAC,MAAM;GAEM,UADzB;IAAE,QAAQ,KAAK;IAAS;IAAO;GACI,CAAC,CAAC;;;;;;;CAQxD,aAAa,UAEV;AACD,MAAI;GACF,MAAM,EAAE,SAAS,MAAM,OAAO;AAC9B,UAAO,EAAE,MAAM;WACR,GAAG;AACV,WAAQ,MAAM,EAAE;AAChB,SAAM,IAAI,MACR,2EACD"}