@lobehub/chat
Version:
Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal, and extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.
68 lines (58 loc) • 1.9 kB
text/typescript
import qs from 'query-string';
import urlJoin from 'url-join';
import { CrawlImpl, CrawlSuccessResult } from '../type';
import { htmlToMarkdown } from '../utils/htmlToMarkdown';
// Browserless connection settings, both read from the environment when this module loads.
// The endpoint falls back to the hosted Browserless service when `BROWSERLESS_URL` is unset.
const BROWSERLESS_TOKEN = process.env.BROWSERLESS_TOKEN;
const BASE_URL = process.env.BROWSERLESS_URL ?? 'https://chrome.browserless.io';

// Regex source matching any URL whose file extension is NOT one of the allowed types:
// html, css, js, json, xml, webmanifest, txt, md
// (an optional `?query` or `#fragment` after the extension is tolerated).
const REJECT_REQUEST_PATTERN =
  '.*\\.(?!(html|css|js|json|xml|webmanifest|txt|md)(\\?|#|$))[\\w-]+(?:[\\?#].*)?$';
/**
 * Error raised when the Browserless crawler is used without any configuration,
 * i.e. neither `BROWSERLESS_URL` nor `BROWSERLESS_TOKEN` is available.
 */
class BrowserlessInitError extends Error {
  constructor() {
    const message = '`BROWSERLESS_URL` or `BROWSERLESS_TOKEN` are required';
    super(message);

    // Give the error a recognizable name instead of the generic "Error".
    this.name = 'BrowserlessInitError';
  }
}
/**
 * Crawls a page through a Browserless instance and converts the returned HTML to Markdown.
 *
 * The request is a JSON `POST` to the `/content` endpoint under `BASE_URL`, with
 * `BROWSERLESS_TOKEN` sent as the `token` query parameter. The response body is handed to
 * `htmlToMarkdown` together with `filterOptions` and the page URL.
 *
 * @param url - Address of the page to crawl.
 * @param options - Crawl options; only `filterOptions` is read, and it is forwarded to
 *   `htmlToMarkdown`.
 * @returns The extracted page on success. Resolves to `undefined` when the request throws,
 *   when Browserless answers with a non-2xx status, when no content or title could be
 *   extracted, or when the page title is the Cloudflare "Just a moment..." interstitial.
 * @throws BrowserlessInitError when neither `BROWSERLESS_URL` nor `BROWSERLESS_TOKEN` is set.
 */
export const browserless: CrawlImpl = async (url, { filterOptions }) => {
  if (!process.env.BROWSERLESS_URL && !process.env.BROWSERLESS_TOKEN) {
    throw new BrowserlessInitError();
  }

  const input = {
    gotoOptions: { waitUntil: 'networkidle2' },
    rejectRequestPattern: [REJECT_REQUEST_PATTERN],
    url,
  };

  try {
    const res = await fetch(
      qs.stringifyUrl({ query: { token: BROWSERLESS_TOKEN }, url: urlJoin(BASE_URL, '/content') }),
      {
        body: JSON.stringify(input),
        headers: {
          'Content-Type': 'application/json',
        },
        method: 'POST',
      },
    );

    // A non-2xx reply carries an error payload rather than the target page; parsing it
    // as HTML could surface the error page as if it were the crawled content.
    if (!res.ok) {
      console.error(`Browserless request failed with status ${res.status} ${res.statusText}`);
      return;
    }

    const html = await res.text();
    const result = htmlToMarkdown(html, { filterOptions, url });

    if (
      !!result.content &&
      result.title &&
      // A title of "Just a moment..." means the request was intercepted by Cloudflare (CF)
      result.title.trim() !== 'Just a moment...'
    ) {
      return {
        content: result.content,
        contentType: 'text',
        description: result.description,
        length: result.length,
        siteName: result.siteName,
        title: result.title,
        url,
      } satisfies CrawlSuccessResult;
    }
  } catch (error) {
    // Best-effort crawler: failures are logged and reported to the caller as `undefined`.
    console.error(error);
  }

  return;
};