UNPKG

@langchain/community

Version:
136 lines (135 loc) 4.75 kB
Object.defineProperty(exports, Symbol.toStringTag, { value: "Module" }); const require_runtime = require("../../_virtual/_rolldown/runtime.cjs"); let _langchain_core_documents = require("@langchain/core/documents"); let _langchain_core_document_loaders_base = require("@langchain/core/document_loaders/base"); //#region src/document_loaders/web/puppeteer.ts var puppeteer_exports = /* @__PURE__ */ require_runtime.__exportAll({ PuppeteerWebBaseLoader: () => PuppeteerWebBaseLoader }); /** * Class that extends the BaseDocumentLoader class and implements the * DocumentLoader interface. It represents a document loader for scraping * web pages using Puppeteer. * @example * ```typescript * const loader = new PuppeteerWebBaseLoader("https:exampleurl.com", { * launchOptions: { * headless: true, * }, * gotoOptions: { * waitUntil: "domcontentloaded", * }, * }); * const screenshot = await loader.screenshot(); * ``` */ var PuppeteerWebBaseLoader = class PuppeteerWebBaseLoader extends _langchain_core_document_loaders_base.BaseDocumentLoader { options; constructor(webPath, options) { super(); this.webPath = webPath; this.options = options ?? void 0; } static async _scrape(url, options) { const { launch, connect } = await PuppeteerWebBaseLoader.imports(); let browser; if (options?.launchOptions?.browserWSEndpoint) browser = await connect({ browserWSEndpoint: options?.launchOptions?.browserWSEndpoint }); else browser = await launch({ headless: true, defaultViewport: null, ignoreDefaultArgs: ["--disable-extensions"], ...options?.launchOptions }); const page = await browser.newPage(); await page.goto(url, { timeout: 18e4, waitUntil: "domcontentloaded", ...options?.gotoOptions }); const bodyHTML = options?.evaluate ? await options?.evaluate(page, browser) : await page.evaluate(() => document.body.innerHTML); await browser.close(); return bodyHTML; } /** * Method that calls the _scrape method to perform the scraping of the web * page specified by the webPath property. * @returns Promise that resolves to the scraped HTML content of the web page. */ async scrape() { return PuppeteerWebBaseLoader._scrape(this.webPath, this.options); } /** * Method that calls the scrape method and returns the scraped HTML * content as a Document object. * @returns Promise that resolves to an array of Document objects. */ async load() { return [new _langchain_core_documents.Document({ pageContent: await this.scrape(), metadata: { source: this.webPath } })]; } /** * Static class method used to screenshot a web page and return * it as a {@link Document} object where the pageContent property * is the screenshot encoded in base64. * * @param {string} url * @param {PuppeteerWebBaseLoaderOptions} options * @returns {Document} A document object containing the screenshot of the page encoded in base64. */ static async _screenshot(url, options) { const { launch, connect } = await PuppeteerWebBaseLoader.imports(); let browser; if (options?.launchOptions?.browserWSEndpoint) browser = await connect({ browserWSEndpoint: options?.launchOptions?.browserWSEndpoint }); else browser = await launch({ headless: true, defaultViewport: null, ignoreDefaultArgs: ["--disable-extensions"], ...options?.launchOptions }); const page = await browser.newPage(); await page.goto(url, { timeout: 18e4, waitUntil: "domcontentloaded", ...options?.gotoOptions }); return new _langchain_core_documents.Document({ pageContent: (await page.screenshot()).toString("base64"), metadata: { source: url } }); } /** * Screenshot a web page and return it as a {@link Document} object where * the pageContent property is the screenshot encoded in base64. * * @returns {Promise<Document>} A document object containing the screenshot of the page encoded in base64. */ async screenshot() { return PuppeteerWebBaseLoader._screenshot(this.webPath, this.options); } /** * Static method that imports the necessary Puppeteer modules. It returns * a Promise that resolves to an object containing the imported modules. * @returns Promise that resolves to an object containing the imported Puppeteer modules. */ static async imports() { try { const { launch, connect } = await import("puppeteer"); return { launch, connect }; } catch (e) { console.error(e); throw new Error("Please install puppeteer as a dependency with, e.g. `pnpm install puppeteer`"); } } }; //#endregion exports.PuppeteerWebBaseLoader = PuppeteerWebBaseLoader; Object.defineProperty(exports, "puppeteer_exports", { enumerable: true, get: function() { return puppeteer_exports; } }); //# sourceMappingURL=puppeteer.cjs.map