@langchain/community
Version:
Third-party integrations for LangChain.js
103 lines (102 loc) • 3.9 kB
JavaScript
// Tag the CommonJS `exports` object so that Object.prototype.toString reports
// it as "[object Module]".
Object.defineProperty(exports, Symbol.toStringTag, { value: "Module" });
// Bundler runtime helpers; `__exportAll` from this module is used below.
const require_runtime = require("../../_virtual/_rolldown/runtime.cjs");
// AsyncCaller is what the loader uses to invoke `fetch`.
let _langchain_core_utils_async_caller = require("@langchain/core/utils/async_caller");
// Provides the `Document` class returned by `load()`.
let _langchain_core_documents = require("@langchain/core/documents");
// Provides `BaseDocumentLoader`, the loader's base class.
let _langchain_core_document_loaders_base = require("@langchain/core/document_loaders/base");
//#region src/document_loaders/web/cheerio.ts
// Namespace-style object for this module. The accessor is an arrow function,
// so `CheerioWebBaseLoader` is only read when the accessor is invoked; this is
// what allows the object to be created before the class is assigned further
// down in the file.
// NOTE(review): `__exportAll` presumably installs one getter per key — confirm
// against the bundler runtime.
var cheerio_exports = /* @__PURE__ */ require_runtime.__exportAll({ CheerioWebBaseLoader: () => CheerioWebBaseLoader });
/**
 * Document loader that fetches a single web page and parses it with Cheerio.
 * It extends BaseDocumentLoader and turns the text matched by a CSS selector
 * into one Document.
 *
 * Supported `fields`:
 * - `timeout`: milliseconds before the request is aborted (default 10000);
 *   a falsy value such as 0 disables the abort signal.
 * - `selector`: selector whose text becomes the page content (default "body").
 * - `textDecoder`: optional decoder applied to the raw response bytes; when
 *   omitted the response is read with `response.text()`.
 * - `headers`: request headers forwarded to `fetch`.
 * - any remaining fields are passed to the AsyncCaller constructor.
 *
 * @example
 * ```typescript
 * const loader = new CheerioWebBaseLoader("https://exampleurl.com");
 * const docs = await loader.load();
 * console.log({ docs });
 * ```
 */
var CheerioWebBaseLoader = class CheerioWebBaseLoader extends _langchain_core_document_loaders_base.BaseDocumentLoader {
  timeout;
  caller;
  selector;
  textDecoder;
  headers;

  /**
   * @param webPath URL of the page to load.
   * @param fields Optional settings (see the class description).
   */
  constructor(webPath, fields) {
    super();
    this.webPath = webPath;
    const { timeout, selector, textDecoder, headers, ...rest } = fields ?? {};
    this.timeout = timeout ?? 1e4;
    // Everything that is not a loader-specific option configures the caller.
    this.caller = new _langchain_core_utils_async_caller.AsyncCaller(rest);
    this.selector = selector ?? "body";
    this.textDecoder = textDecoder;
    this.headers = headers;
  }

  /**
   * Fetches web documents from the given array of URLs and loads them using Cheerio.
   * All URLs are requested concurrently; the returned promise rejects as soon
   * as any single request or parse fails.
   * @param urls An array of URLs to fetch and load.
   * @param caller Object whose `call(fn, ...args)` method is used to invoke `fetch`.
   * @param timeout Milliseconds before each request is aborted; falsy disables the abort signal.
   * @param textDecoder Optional decoder applied to the raw response bytes.
   * @param options Optional object; `headers` is sent with the request, the rest is passed to Cheerio's `load`.
   * @returns A Promise that resolves to an array of CheerioAPI instances, in the order of `urls`.
   */
  static async scrapeAll(urls, caller, timeout, textDecoder, options) {
    return Promise.all(urls.map((url) => CheerioWebBaseLoader._scrape(url, caller, timeout, textDecoder, options)));
  }

  /**
   * Fetches one URL and parses the body with Cheerio.
   * @param url URL to fetch.
   * @param caller Object whose `call(fn, ...args)` method is used to invoke `fetch`.
   * @param timeout Milliseconds before the request is aborted; falsy disables the abort signal.
   * @param textDecoder Optional decoder applied to the raw response bytes.
   * @param options Optional object; `headers` is sent with the request, the rest is passed to Cheerio's `load`.
   * @returns A Promise that resolves to the value returned by Cheerio's `load`.
   */
  static async _scrape(url, caller, timeout, textDecoder, options) {
    const { headers, ...cheerioOptions } = options ?? {};
    const { load } = await CheerioWebBaseLoader.imports();
    const response = await caller.call(fetch, url, {
      signal: timeout ? AbortSignal.timeout(timeout) : void 0,
      headers
    });
    // With a decoder the body is read as bytes and decoded explicitly;
    // without one the optional chain short-circuits and `response.text()` is used.
    return load(textDecoder?.decode(await response.arrayBuffer()) ?? await response.text(), cheerioOptions);
  }

  /**
   * Fetches the web document from the webPath and loads it using Cheerio,
   * using this instance's caller, timeout, text decoder and headers.
   * @returns A Promise that resolves to a CheerioAPI instance.
   */
  async scrape() {
    const options = { headers: this.headers };
    return CheerioWebBaseLoader._scrape(this.webPath, this.caller, this.timeout, this.textDecoder, options);
  }

  /**
   * Extracts the text content from the loaded document using the selector
   * and creates a Document instance with the extracted text and metadata
   * (`source` is the webPath, `title` is the text of the `<title>` element).
   * @returns A Promise that resolves to an array containing that single Document.
   */
  async load() {
    const $ = await this.scrape();
    const title = $("title").text();
    return [new _langchain_core_documents.Document({
      pageContent: $(this.selector).text(),
      metadata: {
        source: this.webPath,
        title
      }
    })];
  }

  /**
   * A static method that dynamically imports the Cheerio library and
   * returns the load function.
   * @returns A Promise that resolves to an object containing the load function from the Cheerio library.
   * @throws {Error} If the import fails; the underlying failure is logged and attached as `cause`.
   */
  static async imports() {
    try {
      const { load } = await import("cheerio");
      return { load };
    } catch (e) {
      console.error(e);
      // Keep the original failure reachable for callers instead of only logging it.
      throw new Error("Please install cheerio as a dependency with, e.g. `pnpm install cheerio`", { cause: e });
    }
  }
};
//#endregion
// Public named export of the loader class.
exports.CheerioWebBaseLoader = CheerioWebBaseLoader;
// Also expose the module's namespace object as an enumerable, getter-backed
// property named "cheerio_exports".
Object.defineProperty(exports, "cheerio_exports", {
enumerable: true,
get: function() {
return cheerio_exports;
}
});
//# sourceMappingURL=cheerio.cjs.map