@lobehub/chat
Version:
Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal, and extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.
208 lines (172 loc) • 6.01 kB
text/typescript
import { describe, expect, it, vi } from 'vitest';
import { Crawler } from '../crawler';
// Move mocks outside of test cases to avoid hoisting issues
vi.mock('../crawImpl', () => ({
crawlImpls: {
naive: vi.fn(),
jina: vi.fn(),
browserless: vi.fn(),
},
}));
vi.mock('../utils/appUrlRules', () => ({
applyUrlRules: vi.fn().mockReturnValue({
transformedUrl: 'https://example.com',
filterOptions: {},
}),
}));
describe('Crawler', () => {
const crawler = new Crawler();
it('should crawl successfully with default impls', async () => {
const mockResult = {
content: 'test content'.padEnd(101, ' '), // Ensure content length > 100
contentType: 'text' as const,
url: 'https://example.com',
};
const { crawlImpls } = await import('../crawImpl');
vi.mocked(crawlImpls.naive).mockResolvedValue(mockResult);
const result = await crawler.crawl({
url: 'https://example.com',
});
expect(result).toEqual({
crawler: 'naive',
data: mockResult,
originalUrl: 'https://example.com',
transformedUrl: undefined,
});
});
it('should use user provided impls', async () => {
const mockResult = {
content: 'test content'.padEnd(101, ' '), // Ensure content length > 100
contentType: 'text' as const,
url: 'https://example.com',
};
const { crawlImpls } = await import('../crawImpl');
vi.mocked(crawlImpls.jina).mockResolvedValue(mockResult);
const result = await crawler.crawl({
impls: ['jina'],
url: 'https://example.com',
});
expect(result).toEqual({
crawler: 'jina',
data: mockResult,
originalUrl: 'https://example.com',
transformedUrl: undefined,
});
});
it('should handle crawl errors', async () => {
const mockError = new Error('Crawl failed');
mockError.name = 'CrawlError';
const { crawlImpls } = await import('../crawImpl');
vi.mocked(crawlImpls.naive).mockRejectedValue(mockError);
vi.mocked(crawlImpls.jina).mockRejectedValue(mockError);
vi.mocked(crawlImpls.browserless).mockRejectedValue(mockError);
const result = await crawler.crawl({
url: 'https://example.com',
});
expect(result).toEqual({
crawler: 'browserless',
data: {
content: 'Fail to crawl the page. Error type: CrawlError, error message: Crawl failed',
errorMessage: 'Crawl failed',
errorType: 'CrawlError',
},
originalUrl: 'https://example.com',
transformedUrl: undefined,
});
});
it('should handle transformed urls', async () => {
const mockResult = {
content: 'test content'.padEnd(101, ' '), // Ensure content length > 100
contentType: 'text' as const,
url: 'https://transformed.example.com',
};
const { crawlImpls } = await import('../crawImpl');
vi.mocked(crawlImpls.naive).mockResolvedValue(mockResult);
const { applyUrlRules } = await import('../utils/appUrlRules');
vi.mocked(applyUrlRules).mockReturnValue({
transformedUrl: 'https://transformed.example.com',
filterOptions: {},
});
const result = await crawler.crawl({
url: 'https://example.com',
});
expect(result).toEqual({
crawler: 'naive',
data: mockResult,
originalUrl: 'https://example.com',
transformedUrl: 'https://transformed.example.com',
});
});
it('should merge filter options correctly', async () => {
const mockResult = {
content: 'test content'.padEnd(101, ' '), // Ensure content length > 100
contentType: 'text' as const,
url: 'https://example.com',
};
const { crawlImpls } = await import('../crawImpl');
const mockCrawlImpl = vi.mocked(crawlImpls.naive).mockResolvedValue(mockResult);
const { applyUrlRules } = await import('../utils/appUrlRules');
vi.mocked(applyUrlRules).mockReturnValue({
transformedUrl: 'https://example.com',
filterOptions: { pureText: true },
});
await crawler.crawl({
url: 'https://example.com',
filterOptions: { enableReadability: true },
});
expect(mockCrawlImpl).toHaveBeenCalledWith('https://example.com', {
filterOptions: {
pureText: true,
enableReadability: true,
},
});
});
it('should use rule impls when provided', async () => {
const mockResult = {
content: 'test content'.padEnd(101, ' '), // Ensure content length > 100
contentType: 'text' as const,
url: 'https://example.com',
};
const { crawlImpls } = await import('../crawImpl');
vi.mocked(crawlImpls.jina).mockResolvedValue(mockResult);
const { applyUrlRules } = await import('../utils/appUrlRules');
vi.mocked(applyUrlRules).mockReturnValue({
transformedUrl: 'https://example.com',
filterOptions: {},
impls: ['jina'],
});
const result = await crawler.crawl({
url: 'https://example.com',
});
expect(result).toEqual({
crawler: 'jina',
data: mockResult,
originalUrl: 'https://example.com',
transformedUrl: undefined,
});
});
it('should skip results with content length <= 100', async () => {
const mockResult = {
content: 'short content', // Content length <= 100
contentType: 'text' as const,
url: 'https://example.com',
};
const { crawlImpls } = await import('../crawImpl');
vi.mocked(crawlImpls.naive).mockResolvedValue(mockResult);
vi.mocked(crawlImpls.jina).mockResolvedValue(mockResult);
vi.mocked(crawlImpls.browserless).mockResolvedValue(mockResult);
const result = await crawler.crawl({
url: 'https://example.com',
});
expect(result).toEqual({
crawler: undefined,
data: {
content: 'Fail to crawl the page. Error type: UnknownError, error message: undefined',
errorMessage: undefined,
errorType: 'UnknownError',
},
originalUrl: 'https://example.com',
transformedUrl: undefined,
});
});
});