UNPKG

@langchain/community

Version:
1 lines 4.69 kB
{"version":3,"file":"playwright.cjs","names":["BaseDocumentLoader","Document"],"sources":["../../../src/document_loaders/web/playwright.ts"],"sourcesContent":["import type { LaunchOptions, Page, Browser, Response } from \"playwright\";\n\nimport { Document } from \"@langchain/core/documents\";\nimport { BaseDocumentLoader } from \"@langchain/core/document_loaders/base\";\nimport type { DocumentLoader } from \"@langchain/core/document_loaders/base\";\n\nexport { Page, Browser, Response };\n\nexport type PlaywrightGotoOptions = {\n referer?: string;\n timeout?: number;\n waitUntil?: \"load\" | \"domcontentloaded\" | \"networkidle\" | \"commit\";\n};\n\n/**\n * Type representing a function for evaluating JavaScript code on a web\n * page using Playwright. Takes a Page, Browser, and Response object as\n * parameters and returns a Promise that resolves to a string.\n */\nexport type PlaywrightEvaluate = (\n page: Page,\n browser: Browser,\n response: Response | null\n) => Promise<string>;\n\nexport type PlaywrightWebBaseLoaderOptions = {\n launchOptions?: LaunchOptions;\n gotoOptions?: PlaywrightGotoOptions;\n evaluate?: PlaywrightEvaluate;\n};\n\n/**\n * Class representing a document loader for scraping web pages using\n * Playwright. Extends the BaseDocumentLoader class and implements the\n * DocumentLoader interface.\n */\nexport class PlaywrightWebBaseLoader\n extends BaseDocumentLoader\n implements DocumentLoader\n{\n options: PlaywrightWebBaseLoaderOptions | undefined;\n\n constructor(\n public webPath: string,\n options?: PlaywrightWebBaseLoaderOptions\n ) {\n super();\n this.options = options ?? undefined;\n }\n\n static async _scrape(\n url: string,\n options?: PlaywrightWebBaseLoaderOptions\n ): Promise<string> {\n const { chromium } = await PlaywrightWebBaseLoader.imports();\n\n const browser = await chromium.launch({\n headless: true,\n ...options?.launchOptions,\n });\n const page = await browser.newPage();\n\n const response = await page.goto(url, {\n timeout: 180000,\n waitUntil: \"domcontentloaded\",\n ...options?.gotoOptions,\n });\n const bodyHTML = options?.evaluate\n ? await options?.evaluate(page, browser, response)\n : await page.content();\n\n await browser.close();\n\n return bodyHTML;\n }\n\n /**\n * Method that calls the _scrape method to perform the scraping of the web\n * page specified by the webPath property. Returns a Promise that resolves\n * to the scraped HTML content of the web page.\n * @returns Promise that resolves to the scraped HTML content of the web page.\n */\n async scrape(): Promise<string> {\n return PlaywrightWebBaseLoader._scrape(this.webPath, this.options);\n }\n\n /**\n * Method that calls the scrape method and returns the scraped HTML\n * content as a Document object. Returns a Promise that resolves to an\n * array of Document objects.\n * @returns Promise that resolves to an array of Document objects.\n */\n async load(): Promise<Document[]> {\n const text = await this.scrape();\n\n const metadata = { source: this.webPath };\n return [new Document({ pageContent: text, metadata })];\n }\n\n /**\n * Static method that imports the necessary Playwright modules. Returns a\n * Promise that resolves to an object containing the imported modules.\n * @returns Promise that resolves to an object containing the imported modules.\n */\n static async imports(): Promise<{\n chromium: typeof import(\"playwright\").chromium;\n }> {\n try {\n const { chromium } = await import(\"playwright\");\n\n return { chromium };\n } catch (e) {\n console.error(e);\n throw new Error(\n \"Please install playwright as a dependency with, e.g. `pnpm install playwright`\"\n );\n }\n }\n}\n"],"mappings":";;;;;;;;;;;AAoCA,IAAa,0BAAb,MAAa,gCACHA,sCAAAA,mBAEV;CACE;CAEA,YACE,SACA,SACA;AACA,SAAO;AAHA,OAAA,UAAA;AAIP,OAAK,UAAU,WAAW,KAAA;;CAG5B,aAAa,QACX,KACA,SACiB;EACjB,MAAM,EAAE,aAAa,MAAM,wBAAwB,SAAS;EAE5D,MAAM,UAAU,MAAM,SAAS,OAAO;GACpC,UAAU;GACV,GAAG,SAAS;GACb,CAAC;EACF,MAAM,OAAO,MAAM,QAAQ,SAAS;EAEpC,MAAM,WAAW,MAAM,KAAK,KAAK,KAAK;GACpC,SAAS;GACT,WAAW;GACX,GAAG,SAAS;GACb,CAAC;EACF,MAAM,WAAW,SAAS,WACtB,MAAM,SAAS,SAAS,MAAM,SAAS,SAAS,GAChD,MAAM,KAAK,SAAS;AAExB,QAAM,QAAQ,OAAO;AAErB,SAAO;;;;;;;;CAST,MAAM,SAA0B;AAC9B,SAAO,wBAAwB,QAAQ,KAAK,SAAS,KAAK,QAAQ;;;;;;;;CASpE,MAAM,OAA4B;AAIhC,SAAO,CAAC,IAAIC,0BAAAA,SAAS;GAAE,aAHV,MAAM,KAAK,QAAQ;GAGU,UADzB,EAAE,QAAQ,KAAK,SAAS;GACW,CAAC,CAAC;;;;;;;CAQxD,aAAa,UAEV;AACD,MAAI;GACF,MAAM,EAAE,aAAa,MAAM,OAAO;AAElC,UAAO,EAAE,UAAU;WACZ,GAAG;AACV,WAAQ,MAAM,EAAE;AAChB,SAAM,IAAI,MACR,iFACD"}