UNPKG

@lobehub/chat

Version:

Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal, and extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.

208 lines (172 loc) • 6.01 kB
import { describe, expect, it, vi } from 'vitest'; import { Crawler } from '../crawler'; // Move mocks outside of test cases to avoid hoisting issues vi.mock('../crawImpl', () => ({ crawlImpls: { naive: vi.fn(), jina: vi.fn(), browserless: vi.fn(), }, })); vi.mock('../utils/appUrlRules', () => ({ applyUrlRules: vi.fn().mockReturnValue({ transformedUrl: 'https://example.com', filterOptions: {}, }), })); describe('Crawler', () => { const crawler = new Crawler(); it('should crawl successfully with default impls', async () => { const mockResult = { content: 'test content'.padEnd(101, ' '), // Ensure content length > 100 contentType: 'text' as const, url: 'https://example.com', }; const { crawlImpls } = await import('../crawImpl'); vi.mocked(crawlImpls.naive).mockResolvedValue(mockResult); const result = await crawler.crawl({ url: 'https://example.com', }); expect(result).toEqual({ crawler: 'naive', data: mockResult, originalUrl: 'https://example.com', transformedUrl: undefined, }); }); it('should use user provided impls', async () => { const mockResult = { content: 'test content'.padEnd(101, ' '), // Ensure content length > 100 contentType: 'text' as const, url: 'https://example.com', }; const { crawlImpls } = await import('../crawImpl'); vi.mocked(crawlImpls.jina).mockResolvedValue(mockResult); const result = await crawler.crawl({ impls: ['jina'], url: 'https://example.com', }); expect(result).toEqual({ crawler: 'jina', data: mockResult, originalUrl: 'https://example.com', transformedUrl: undefined, }); }); it('should handle crawl errors', async () => { const mockError = new Error('Crawl failed'); mockError.name = 'CrawlError'; const { crawlImpls } = await import('../crawImpl'); vi.mocked(crawlImpls.naive).mockRejectedValue(mockError); vi.mocked(crawlImpls.jina).mockRejectedValue(mockError); vi.mocked(crawlImpls.browserless).mockRejectedValue(mockError); const result = await crawler.crawl({ url: 'https://example.com', }); expect(result).toEqual({ crawler: 'browserless', data: { content: 'Fail to crawl the page. Error type: CrawlError, error message: Crawl failed', errorMessage: 'Crawl failed', errorType: 'CrawlError', }, originalUrl: 'https://example.com', transformedUrl: undefined, }); }); it('should handle transformed urls', async () => { const mockResult = { content: 'test content'.padEnd(101, ' '), // Ensure content length > 100 contentType: 'text' as const, url: 'https://transformed.example.com', }; const { crawlImpls } = await import('../crawImpl'); vi.mocked(crawlImpls.naive).mockResolvedValue(mockResult); const { applyUrlRules } = await import('../utils/appUrlRules'); vi.mocked(applyUrlRules).mockReturnValue({ transformedUrl: 'https://transformed.example.com', filterOptions: {}, }); const result = await crawler.crawl({ url: 'https://example.com', }); expect(result).toEqual({ crawler: 'naive', data: mockResult, originalUrl: 'https://example.com', transformedUrl: 'https://transformed.example.com', }); }); it('should merge filter options correctly', async () => { const mockResult = { content: 'test content'.padEnd(101, ' '), // Ensure content length > 100 contentType: 'text' as const, url: 'https://example.com', }; const { crawlImpls } = await import('../crawImpl'); const mockCrawlImpl = vi.mocked(crawlImpls.naive).mockResolvedValue(mockResult); const { applyUrlRules } = await import('../utils/appUrlRules'); vi.mocked(applyUrlRules).mockReturnValue({ transformedUrl: 'https://example.com', filterOptions: { pureText: true }, }); await crawler.crawl({ url: 'https://example.com', filterOptions: { enableReadability: true }, }); expect(mockCrawlImpl).toHaveBeenCalledWith('https://example.com', { filterOptions: { pureText: true, enableReadability: true, }, }); }); it('should use rule impls when provided', async () => { const mockResult = { content: 'test content'.padEnd(101, ' '), // Ensure content length > 100 contentType: 'text' as const, url: 'https://example.com', }; const { crawlImpls } = await import('../crawImpl'); vi.mocked(crawlImpls.jina).mockResolvedValue(mockResult); const { applyUrlRules } = await import('../utils/appUrlRules'); vi.mocked(applyUrlRules).mockReturnValue({ transformedUrl: 'https://example.com', filterOptions: {}, impls: ['jina'], }); const result = await crawler.crawl({ url: 'https://example.com', }); expect(result).toEqual({ crawler: 'jina', data: mockResult, originalUrl: 'https://example.com', transformedUrl: undefined, }); }); it('should skip results with content length <= 100', async () => { const mockResult = { content: 'short content', // Content length <= 100 contentType: 'text' as const, url: 'https://example.com', }; const { crawlImpls } = await import('../crawImpl'); vi.mocked(crawlImpls.naive).mockResolvedValue(mockResult); vi.mocked(crawlImpls.jina).mockResolvedValue(mockResult); vi.mocked(crawlImpls.browserless).mockResolvedValue(mockResult); const result = await crawler.crawl({ url: 'https://example.com', }); expect(result).toEqual({ crawler: undefined, data: { content: 'Fail to crawl the page. Error type: UnknownError, error message: undefined', errorMessage: undefined, errorType: 'UnknownError', }, originalUrl: 'https://example.com', transformedUrl: undefined, }); }); });