UNPKG

@lobehub/chat

Version:

Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal, and extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.

68 lines (58 loc) 1.9 kB
import qs from 'query-string';
import urlJoin from 'url-join';

import { CrawlImpl, CrawlSuccessResult } from '../type';
import { htmlToMarkdown } from '../utils/htmlToMarkdown';

/** Base URL of the Browserless service; falls back to the hosted endpoint when `BROWSERLESS_URL` is unset. */
const BASE_URL = process.env.BROWSERLESS_URL ?? 'https://chrome.browserless.io';

/**
 * Regex source sent to Browserless as `rejectRequestPattern`.
 * It matches URLs that end in a file extension other than the allowed ones.
 */
// Allowed file types: html, css, js, json, xml, webmanifest, txt, md
const REJECT_REQUEST_PATTERN =
  '.*\\.(?!(html|css|js|json|xml|webmanifest|txt|md)(\\?|#|$))[\\w-]+(?:[\\?#].*)?$';

/** API token appended to the request as the `token` query parameter (omitted when unset). */
const BROWSERLESS_TOKEN = process.env.BROWSERLESS_TOKEN;

/** Thrown when neither `BROWSERLESS_URL` nor `BROWSERLESS_TOKEN` is configured. */
class BrowserlessInitError extends Error {
  constructor() {
    super('`BROWSERLESS_URL` or `BROWSERLESS_TOKEN` are required');
    this.name = 'BrowserlessInitError';
  }
}

/**
 * Crawl a page through the Browserless `/content` endpoint and convert the
 * returned HTML to markdown.
 *
 * @param url - The page to crawl.
 * @param options - Crawl options; `filterOptions` is forwarded to `htmlToMarkdown`.
 * @returns The crawl result, or `undefined` when the request fails, Browserless
 *   answers with a non-2xx status, the page has no content or title, or the page
 *   is a Cloudflare interstitial.
 * @throws BrowserlessInitError when neither `BROWSERLESS_URL` nor
 *   `BROWSERLESS_TOKEN` is set in the environment.
 */
export const browserless: CrawlImpl = async (url, { filterOptions }) => {
  if (!process.env.BROWSERLESS_URL && !process.env.BROWSERLESS_TOKEN) {
    throw new BrowserlessInitError();
  }

  const input = {
    gotoOptions: { waitUntil: 'networkidle2' },
    rejectRequestPattern: [REJECT_REQUEST_PATTERN],
    url,
  };

  try {
    const res = await fetch(
      qs.stringifyUrl({ query: { token: BROWSERLESS_TOKEN }, url: urlJoin(BASE_URL, '/content') }),
      {
        body: JSON.stringify(input),
        headers: {
          'Content-Type': 'application/json',
        },
        method: 'POST',
      },
    );

    // A non-2xx response carries a Browserless error payload rather than the
    // target page; never feed that into the markdown converter. Only the status
    // is logged because the request URL contains the token.
    if (!res.ok) {
      console.error(`Browserless request failed: ${res.status} ${res.statusText}`);
      return;
    }

    const html = await res.text();

    const result = htmlToMarkdown(html, { filterOptions, url });

    if (
      result.content &&
      result.title &&
      // A "Just a moment..." title means the request was intercepted by Cloudflare (CF)
      result.title.trim() !== 'Just a moment...'
    ) {
      return {
        content: result.content,
        contentType: 'text',
        description: result.description,
        length: result.length,
        siteName: result.siteName,
        title: result.title,
        url,
      } satisfies CrawlSuccessResult;
    }
  } catch (error) {
    // Best-effort crawler: log the failure and fall through to `undefined`.
    console.error(error);
  }

  return;
};