@lobehub/chat
Version:
Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal, and extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.
276 lines (221 loc) • 9.48 kB
text/typescript
import { describe, expect, it } from 'vitest';
import { crawUrlRules } from '../urlRules';
describe('urlRules', () => {
it('should export an array of CrawlUrlRule objects', () => {
expect(Array.isArray(crawUrlRules)).toBe(true);
expect(crawUrlRules.length).toBeGreaterThan(0);
});
it('should have valid rule structure for each rule', () => {
crawUrlRules.forEach((rule, index) => {
expect(rule).toHaveProperty('urlPattern');
expect(typeof rule.urlPattern).toBe('string');
expect(rule.urlPattern.length).toBeGreaterThan(0);
if (rule.impls) {
expect(Array.isArray(rule.impls)).toBe(true);
expect(rule.impls.length).toBeGreaterThan(0);
}
if (rule.filterOptions) {
expect(typeof rule.filterOptions).toBe('object');
}
if (rule.urlTransform) {
expect(typeof rule.urlTransform).toBe('string');
expect(rule.urlTransform.length).toBeGreaterThan(0);
}
});
});
describe('specific URL patterns', () => {
it('should match WeChat Sogou links', () => {
const wechatRule = crawUrlRules.find(
(rule) => rule.urlPattern === 'https://weixin.sogou.com/link(.*)',
);
expect(wechatRule).toBeDefined();
expect(wechatRule?.impls).toEqual(['search1api']);
});
it('should match Sogou links', () => {
const sogouRule = crawUrlRules.find(
(rule) => rule.urlPattern === 'https://sogou.com/link(.*)',
);
expect(sogouRule).toBeDefined();
expect(sogouRule?.impls).toEqual(['search1api']);
});
it('should match YouTube links', () => {
const youtubeRule = crawUrlRules.find(
(rule) => rule.urlPattern === 'https://www.youtube.com/watch(.*)',
);
expect(youtubeRule).toBeDefined();
expect(youtubeRule?.impls).toEqual(['search1api']);
});
it('should match Reddit links', () => {
const redditRule = crawUrlRules.find(
(rule) => rule.urlPattern === 'https://www.reddit.com/r/(.*)/comments/(.*)',
);
expect(redditRule).toBeDefined();
expect(redditRule?.impls).toEqual(['search1api']);
});
it('should match WeChat public account links', () => {
const wechatPublicRule = crawUrlRules.find(
(rule) => rule.urlPattern === 'https://mp.weixin.qq.com(.*)',
);
expect(wechatPublicRule).toBeDefined();
expect(wechatPublicRule?.impls).toEqual(['search1api', 'jina']);
});
it('should match GitHub blob links with URL transformation', () => {
const githubBlobRule = crawUrlRules.find(
(rule) => rule.urlPattern === 'https://github.com/([^/]+)/([^/]+)/blob/([^/]+)/(.*)',
);
expect(githubBlobRule).toBeDefined();
expect(githubBlobRule?.impls).toEqual(['naive', 'jina']);
expect(githubBlobRule?.urlTransform).toBe('https://github.com/$1/$2/raw/refs/heads/$3/$4');
expect(githubBlobRule?.filterOptions?.enableReadability).toBe(false);
});
it('should match GitHub discussion links', () => {
const githubDiscussionRule = crawUrlRules.find(
(rule) => rule.urlPattern === 'https://github.com/(.*)/discussions/(.*)',
);
expect(githubDiscussionRule).toBeDefined();
expect(githubDiscussionRule?.impls).toEqual(['naive', 'jina']);
expect(githubDiscussionRule?.filterOptions?.enableReadability).toBe(false);
});
it('should match PDF files', () => {
const pdfRule = crawUrlRules.find((rule) => rule.urlPattern === 'https://(.*).pdf');
expect(pdfRule).toBeDefined();
expect(pdfRule?.impls).toEqual(['jina']);
});
it('should match arXiv PDF links', () => {
const arxivRule = crawUrlRules.find(
(rule) => rule.urlPattern === 'https://arxiv.org/pdf/(.*)',
);
expect(arxivRule).toBeDefined();
expect(arxivRule?.impls).toEqual(['jina']);
});
it('should match Zhihu Zhuanlan links', () => {
const zhihuZhuanlanRule = crawUrlRules.find(
(rule) => rule.urlPattern === 'https://zhuanlan.zhihu.com(.*)',
);
expect(zhihuZhuanlanRule).toBeDefined();
expect(zhihuZhuanlanRule?.impls).toEqual(['jina']);
});
it('should match Zhihu links', () => {
const zhihuRule = crawUrlRules.find((rule) => rule.urlPattern === 'https://zhihu.com(.*)');
expect(zhihuRule).toBeDefined();
expect(zhihuRule?.impls).toEqual(['jina']);
});
it('should match Medium links with URL transformation', () => {
const mediumRule = crawUrlRules.find((rule) => rule.urlPattern === 'https://medium.com/(.*)');
expect(mediumRule).toBeDefined();
expect(mediumRule?.urlTransform).toBe('https://scribe.rip/$1');
});
it('should match Twitter/X links', () => {
const twitterRule = crawUrlRules.find(
(rule) => rule.urlPattern === 'https://(twitter.com|x.com)/(.*)',
);
expect(twitterRule).toBeDefined();
expect(twitterRule?.impls).toEqual(['jina', 'browserless']);
expect(twitterRule?.filterOptions?.enableReadability).toBe(false);
});
it('should match sports data website with specific filter options', () => {
const sportsRule = crawUrlRules.find(
(rule) => rule.urlPattern === 'https://www.qiumiwu.com/standings/(.*)',
);
expect(sportsRule).toBeDefined();
expect(sportsRule?.impls).toEqual(['naive']);
expect(sportsRule?.filterOptions?.enableReadability).toBe(false);
expect(sportsRule?.filterOptions?.pureText).toBe(true);
});
it('should match Mozilla Developer docs', () => {
const mozillaRule = crawUrlRules.find(
(rule) => rule.urlPattern === 'https://developer.mozilla.org(.*)',
);
expect(mozillaRule).toBeDefined();
expect(mozillaRule?.impls).toEqual(['jina']);
});
it('should match CVPR links', () => {
const cvprRule = crawUrlRules.find(
(rule) => rule.urlPattern === 'https://cvpr.thecvf.com(.*)',
);
expect(cvprRule).toBeDefined();
expect(cvprRule?.impls).toEqual(['jina']);
});
it('should match Feishu links', () => {
const feishuRule = crawUrlRules.find(
(rule) => rule.urlPattern === 'https://(.*).feishu.cn/(.*)',
);
expect(feishuRule).toBeDefined();
expect(feishuRule?.impls).toEqual(['jina']);
});
it('should match Xiaohongshu (Little Red Book) links', () => {
const xiaohongshuRule = crawUrlRules.find(
(rule) => rule.urlPattern === 'https://(.*).xiaohongshu.com/(.*)',
);
expect(xiaohongshuRule).toBeDefined();
expect(xiaohongshuRule?.impls).toEqual(['search1api', 'jina']);
});
});
describe('URL pattern validation', () => {
it('should have valid regex patterns that can be compiled', () => {
crawUrlRules.forEach((rule, index) => {
expect(() => {
new RegExp(rule.urlPattern);
}).not.toThrow(`Rule at index ${index} should have valid regex pattern`);
});
});
});
describe('impl validation', () => {
const validImpls = ['naive', 'jina', 'browserless', 'search1api', 'exa', 'firecrawl', 'tavily'];
it('should only use valid crawler implementations', () => {
crawUrlRules.forEach((rule, index) => {
if (rule.impls) {
rule.impls.forEach((impl) => {
expect(validImpls).toContain(impl);
});
}
});
});
});
describe('filter options validation', () => {
it('should have valid filter options structure', () => {
crawUrlRules.forEach((rule, index) => {
if (rule.filterOptions) {
const { filterOptions } = rule;
if ('enableReadability' in filterOptions) {
expect(typeof filterOptions.enableReadability).toBe('boolean');
}
if ('pureText' in filterOptions) {
expect(typeof filterOptions.pureText).toBe('boolean');
}
}
});
});
});
describe('URL transformation validation', () => {
it('should have valid URL transformation templates', () => {
crawUrlRules.forEach((rule, index) => {
if (rule.urlTransform) {
// Check if URL transform contains placeholder groups
expect(rule.urlTransform).toMatch(/\$\d+/);
// Check if it's a valid URL format
expect(rule.urlTransform).toMatch(/^https?:\/\//);
}
});
});
});
describe('rule completeness', () => {
it('should cover major social media platforms', () => {
const patterns = crawUrlRules.map((rule) => rule.urlPattern);
// Check for major platforms
expect(patterns.some((p) => p.includes('youtube.com'))).toBe(true);
expect(patterns.some((p) => p.includes('reddit.com'))).toBe(true);
expect(patterns.some((p) => p.includes('twitter.com') || p.includes('x.com'))).toBe(true);
expect(patterns.some((p) => p.includes('medium.com'))).toBe(true);
expect(patterns.some((p) => p.includes('github.com'))).toBe(true);
});
it('should cover Chinese platforms', () => {
const patterns = crawUrlRules.map((rule) => rule.urlPattern);
// Check for Chinese platforms
expect(patterns.some((p) => p.includes('weixin.sogou.com'))).toBe(true);
expect(patterns.some((p) => p.includes('zhihu.com'))).toBe(true);
expect(patterns.some((p) => p.includes('xiaohongshu.com'))).toBe(true);
expect(patterns.some((p) => p.includes('feishu.cn'))).toBe(true);
});
});
});