UNPKG

@langchain/community

Version:
1 lines 7.6 kB
{"version":3,"file":"puppeteer.cjs","names":["BaseDocumentLoader","Document"],"sources":["../../../src/document_loaders/web/puppeteer.ts"],"sourcesContent":["import type {\n launch,\n WaitForOptions,\n Page,\n Browser,\n PuppeteerLaunchOptions,\n connect,\n ConnectOptions,\n} from \"puppeteer\";\n\nimport { Document } from \"@langchain/core/documents\";\nimport { BaseDocumentLoader } from \"@langchain/core/document_loaders/base\";\nimport type { DocumentLoader } from \"@langchain/core/document_loaders/base\";\n\nexport { Page, Browser };\n\nexport type PuppeteerGotoOptions = WaitForOptions & {\n referer?: string;\n referrerPolicy?: string;\n};\n\n/**\n * Type representing a function for evaluating JavaScript code on a web\n * page using Puppeteer. It takes a Page and Browser object as parameters\n * and returns a Promise that resolves to a string.\n */\nexport type PuppeteerEvaluate = (\n page: Page,\n browser: Browser\n) => Promise<string>;\n\nexport type PuppeteerWebBaseLoaderOptions = {\n launchOptions?: PuppeteerLaunchOptions & ConnectOptions;\n gotoOptions?: PuppeteerGotoOptions;\n evaluate?: PuppeteerEvaluate;\n};\n\n/**\n * Class that extends the BaseDocumentLoader class and implements the\n * DocumentLoader interface. It represents a document loader for scraping\n * web pages using Puppeteer.\n * @example\n * ```typescript\n * const loader = new PuppeteerWebBaseLoader(\"https:exampleurl.com\", {\n * launchOptions: {\n * headless: true,\n * },\n * gotoOptions: {\n * waitUntil: \"domcontentloaded\",\n * },\n * });\n * const screenshot = await loader.screenshot();\n * ```\n */\nexport class PuppeteerWebBaseLoader\n extends BaseDocumentLoader\n implements DocumentLoader\n{\n options: PuppeteerWebBaseLoaderOptions | undefined;\n\n constructor(\n public webPath: string,\n options?: PuppeteerWebBaseLoaderOptions\n ) {\n super();\n this.options = options ?? undefined;\n }\n\n static async _scrape(\n url: string,\n options?: PuppeteerWebBaseLoaderOptions\n ): Promise<string> {\n const { launch, connect } = await PuppeteerWebBaseLoader.imports();\n\n let browser: Browser;\n\n if (options?.launchOptions?.browserWSEndpoint) {\n browser = await connect({\n browserWSEndpoint: options?.launchOptions?.browserWSEndpoint,\n });\n } else {\n browser = await launch({\n headless: true,\n defaultViewport: null,\n ignoreDefaultArgs: [\"--disable-extensions\"],\n ...options?.launchOptions,\n });\n }\n const page = await browser.newPage();\n\n await page.goto(url, {\n timeout: 180000,\n waitUntil: \"domcontentloaded\",\n ...options?.gotoOptions,\n });\n const bodyHTML = options?.evaluate\n ? await options?.evaluate(page, browser)\n : await page.evaluate(() => document.body.innerHTML);\n\n await browser.close();\n\n return bodyHTML;\n }\n\n /**\n * Method that calls the _scrape method to perform the scraping of the web\n * page specified by the webPath property.\n * @returns Promise that resolves to the scraped HTML content of the web page.\n */\n async scrape(): Promise<string> {\n return PuppeteerWebBaseLoader._scrape(this.webPath, this.options);\n }\n\n /**\n * Method that calls the scrape method and returns the scraped HTML\n * content as a Document object.\n * @returns Promise that resolves to an array of Document objects.\n */\n async load(): Promise<Document[]> {\n const text = await this.scrape();\n\n const metadata = { source: this.webPath };\n return [new Document({ pageContent: text, metadata })];\n }\n\n /**\n * Static class method used to screenshot a web page and return\n * it as a {@link Document} object where the pageContent property\n * is the screenshot encoded in base64.\n *\n * @param {string} url\n * @param {PuppeteerWebBaseLoaderOptions} options\n * @returns {Document} A document object containing the screenshot of the page encoded in base64.\n */\n static async _screenshot(\n url: string,\n options?: PuppeteerWebBaseLoaderOptions\n ): Promise<Document> {\n const { launch, connect } = await PuppeteerWebBaseLoader.imports();\n\n let browser: Browser;\n if (options?.launchOptions?.browserWSEndpoint) {\n browser = await connect({\n browserWSEndpoint: options?.launchOptions?.browserWSEndpoint,\n });\n } else {\n browser = await launch({\n headless: true,\n defaultViewport: null,\n ignoreDefaultArgs: [\"--disable-extensions\"],\n ...options?.launchOptions,\n });\n }\n const page = await browser.newPage();\n\n await page.goto(url, {\n timeout: 180000,\n waitUntil: \"domcontentloaded\",\n ...options?.gotoOptions,\n });\n const screenshot = await page.screenshot();\n const base64 = screenshot.toString(\"base64\");\n const metadata = { source: url };\n return new Document({ pageContent: base64, metadata });\n }\n\n /**\n * Screenshot a web page and return it as a {@link Document} object where\n * the pageContent property is the screenshot encoded in base64.\n *\n * @returns {Promise<Document>} A document object containing the screenshot of the page encoded in base64.\n */\n async screenshot(): Promise<Document> {\n return PuppeteerWebBaseLoader._screenshot(this.webPath, this.options);\n }\n\n /**\n * Static method that imports the necessary Puppeteer modules. It returns\n * a Promise that resolves to an object containing the imported modules.\n * @returns Promise that resolves to an object containing the imported Puppeteer modules.\n */\n static async imports(): Promise<{\n launch: typeof launch;\n connect: typeof connect;\n }> {\n try {\n const { launch, connect } = await import(\"puppeteer\");\n\n return { launch, connect };\n } catch (e) {\n console.error(e);\n throw new Error(\n \"Please install puppeteer as a dependency with, e.g. `pnpm install puppeteer`\"\n );\n }\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;AAsDA,IAAa,yBAAb,MAAa,+BACHA,sCAAAA,mBAEV;CACE;CAEA,YACE,SACA,SACA;AACA,SAAO;AAHA,OAAA,UAAA;AAIP,OAAK,UAAU,WAAW,KAAA;;CAG5B,aAAa,QACX,KACA,SACiB;EACjB,MAAM,EAAE,QAAQ,YAAY,MAAM,uBAAuB,SAAS;EAElE,IAAI;AAEJ,MAAI,SAAS,eAAe,kBAC1B,WAAU,MAAM,QAAQ,EACtB,mBAAmB,SAAS,eAAe,mBAC5C,CAAC;MAEF,WAAU,MAAM,OAAO;GACrB,UAAU;GACV,iBAAiB;GACjB,mBAAmB,CAAC,uBAAuB;GAC3C,GAAG,SAAS;GACb,CAAC;EAEJ,MAAM,OAAO,MAAM,QAAQ,SAAS;AAEpC,QAAM,KAAK,KAAK,KAAK;GACnB,SAAS;GACT,WAAW;GACX,GAAG,SAAS;GACb,CAAC;EACF,MAAM,WAAW,SAAS,WACtB,MAAM,SAAS,SAAS,MAAM,QAAQ,GACtC,MAAM,KAAK,eAAe,SAAS,KAAK,UAAU;AAEtD,QAAM,QAAQ,OAAO;AAErB,SAAO;;;;;;;CAQT,MAAM,SAA0B;AAC9B,SAAO,uBAAuB,QAAQ,KAAK,SAAS,KAAK,QAAQ;;;;;;;CAQnE,MAAM,OAA4B;AAIhC,SAAO,CAAC,IAAIC,0BAAAA,SAAS;GAAE,aAHV,MAAM,KAAK,QAAQ;GAGU,UADzB,EAAE,QAAQ,KAAK,SAAS;GACW,CAAC,CAAC;;;;;;;;;;;CAYxD,aAAa,YACX,KACA,SACmB;EACnB,MAAM,EAAE,QAAQ,YAAY,MAAM,uBAAuB,SAAS;EAElE,IAAI;AACJ,MAAI,SAAS,eAAe,kBAC1B,WAAU,MAAM,QAAQ,EACtB,mBAAmB,SAAS,eAAe,mBAC5C,CAAC;MAEF,WAAU,MAAM,OAAO;GACrB,UAAU;GACV,iBAAiB;GACjB,mBAAmB,CAAC,uBAAuB;GAC3C,GAAG,SAAS;GACb,CAAC;EAEJ,MAAM,OAAO,MAAM,QAAQ,SAAS;AAEpC,QAAM,KAAK,KAAK,KAAK;GACnB,SAAS;GACT,WAAW;GACX,GAAG,SAAS;GACb,CAAC;AAIF,SAAO,IAAIA,0BAAAA,SAAS;GAAE,cAHH,MAAM,KAAK,YAAY,EAChB,SAAS,SAAS;GAED,UAD1B,EAAE,QAAQ,KAAK;GACqB,CAAC;;;;;;;;CASxD,MAAM,aAAgC;AACpC,SAAO,uBAAuB,YAAY,KAAK,SAAS,KAAK,QAAQ;;;;;;;CAQvE,aAAa,UAGV;AACD,MAAI;GACF,MAAM,EAAE,QAAQ,YAAY,MAAM,OAAO;AAEzC,UAAO;IAAE;IAAQ;IAAS;WACnB,GAAG;AACV,WAAQ,MAAM,EAAE;AAChB,SAAM,IAAI,MACR,+EACD"}