@boundless-oss/atlas

import { describe, it, expect, beforeEach } from 'vitest';
import { MarkdownDocumentProcessor } from '../document-processor.js';
import type { RAGDocument, RAGChunk, DocumentMetadata } from '../types.js';

/**
 * Builds a multi-line markdown fixture from individual lines.
 *
 * Line breaks are spelled out explicitly (one argument per line, `''` for a
 * blank line) so the fixtures cannot silently collapse into a single line if
 * the file's whitespace is reflowed.
 *
 * NOTE(review): the line breaks in the fixtures below were reconstructed from
 * what each test asserts (titles, chunk counts, header levels, frontmatter
 * keys) — confirm against the upstream file if it is available.
 *
 * @param lines - The lines of the document, in order, without trailing newlines.
 * @returns The lines joined with `\n`.
 */
const md = (...lines: string[]): string => lines.join('\n');

/**
 * Test suite for MarkdownDocumentProcessor: parsing (`parse`), chunking
 * (`chunk`), metadata extraction (`extractMetadata`) and a few edge cases.
 */
describe('MarkdownDocumentProcessor', () => {
  let processor: MarkdownDocumentProcessor;

  // A fresh processor per test keeps the tests independent of each other.
  beforeEach(() => {
    processor = new MarkdownDocumentProcessor();
  });

  describe('parse', () => {
    it('should parse a simple markdown document', async () => {
      const content = md(
        '# Test Document',
        '',
        'This is a test document with some content.',
        '',
        '## Section 1',
        '',
        'This is section 1 content.',
      );

      const doc = await processor.parse(content, '/test/doc.md');

      expect(doc.id).toMatch(/^doc-/);
      expect(doc.path).toBe('/test/doc.md');
      expect(doc.content).toBe(content);
      expect(doc.metadata.title).toBe('Test Document');
      expect(doc.metadata.size).toBe(content.length);
    });

    it('should extract frontmatter metadata', async () => {
      const content = md(
        '---',
        'title: Custom Title',
        'author: John Doe',
        'tags: [tutorial, guide]',
        'version: 1.0.0',
        '---',
        '',
        '# Test Document',
        '',
        'Content here.',
      );

      const doc = await processor.parse(content, '/test/doc.md');

      expect(doc.metadata.title).toBe('Custom Title');
      expect(doc.metadata.author).toBe('John Doe');
      expect(doc.metadata.tags).toEqual(['tutorial', 'guide']);
      expect(doc.metadata.version).toBe('1.0.0');
    });

    it('should handle documents without headers', async () => {
      const content = md('This is just a paragraph.', '', 'Another paragraph here.');

      const doc = await processor.parse(content, '/test/doc.md');

      // With no header and no frontmatter the expected title is the file's base name.
      expect(doc.metadata.title).toBe('doc');
      expect(doc.content).toBe(content);
    });

    it('should extract code blocks', async () => {
      const content = md(
        '# Code Example',
        '',
        "Here's some code:",
        '',
        '```typescript',
        'function hello() {',
        "  console.log('Hello, world!');",
        '}',
        '```',
        '',
        'More text here.',
      );

      const doc = await processor.parse(content, '/test/doc.md');

      expect(doc.content).toContain('```typescript');
      expect(doc.metadata.title).toBe('Code Example');
    });
  });

  describe('chunk', () => {
    it('should chunk by paragraphs', async () => {
      const doc: RAGDocument = {
        id: 'doc-123',
        path: '/test/doc.md',
        content: md(
          '# Title',
          '',
          'First paragraph here.',
          '',
          'Second paragraph here.',
          '',
          'Third paragraph here.',
        ),
        metadata: {
          title: 'Title',
          lastModified: new Date().toISOString(),
          size: 100,
        },
      };

      const chunks = await processor.chunk(doc, 100, 10);

      expect(chunks).toHaveLength(4); // Header + 3 paragraphs
      expect(chunks[0].content).toBe('# Title');
      expect(chunks[0].metadata.type).toBe('header');
      expect(chunks[1].content).toBe('First paragraph here.');
      expect(chunks[1].metadata.type).toBe('paragraph');
    });

    it('should respect chunk size limits', async () => {
      const longParagraph = 'This is a very long paragraph. '.repeat(20);
      const doc: RAGDocument = {
        id: 'doc-456',
        path: '/test/doc.md',
        content: md('# Title', '', longParagraph, '', 'Short paragraph.'),
        metadata: {
          title: 'Title',
          lastModified: new Date().toISOString(),
          size: 1000,
        },
      };

      const chunks = await processor.chunk(doc, 100, 20);

      // Long paragraph should be split
      const longChunks = chunks.filter(c => c.content.includes('very long paragraph'));
      expect(longChunks.length).toBeGreaterThan(1);

      // Each chunk should respect size limit (with some tolerance for word boundaries)
      chunks.forEach(chunk => {
        expect(chunk.content.length).toBeLessThanOrEqual(120); // 100 + 20 overlap tolerance
      });
    });

    it('should handle chunk overlap', async () => {
      const doc: RAGDocument = {
        id: 'doc-789',
        path: '/test/doc.md',
        content: 'First sentence. Second sentence. Third sentence. Fourth sentence.',
        metadata: {
          title: 'Test',
          lastModified: new Date().toISOString(),
          size: 100,
        },
      };

      const chunks = await processor.chunk(doc, 30, 10);

      // Check that chunks overlap
      expect(chunks.length).toBeGreaterThan(1);

      // Find overlapping content between consecutive chunks
      for (let i = 0; i < chunks.length - 1; i++) {
        // There should be some overlap (not exact due to word boundaries):
        // the first word of the next chunk must already appear in the current one.
        const hasOverlap = chunks[i].content.includes(chunks[i + 1].content.split(' ')[0]);
        // The final pair (i === chunks.length - 2) is exempt from the overlap requirement.
        expect(hasOverlap || i === chunks.length - 2).toBe(true);
      }
    });

    it('should preserve code blocks as single chunks', async () => {
      const doc: RAGDocument = {
        id: 'doc-code',
        path: '/test/code.md',
        content: md(
          '# Code Example',
          '',
          'Some text before.',
          '',
          '```typescript',
          'function longFunction() {',
          '  // This is a long code block',
          '  // that should not be split',
          '  const a = 1;',
          '  const b = 2;',
          '  return a + b;',
          '}',
          '```',
          '',
          'Some text after.',
        ),
        metadata: {
          title: 'Code Example',
          lastModified: new Date().toISOString(),
          size: 300,
        },
      };

      // The chunk size (50) is deliberately smaller than the code block.
      const chunks = await processor.chunk(doc, 50, 10);

      const codeChunk = chunks.find(c => c.metadata.type === 'code');
      expect(codeChunk).toBeDefined();
      expect(codeChunk?.content).toContain('function longFunction()');
      expect(codeChunk?.metadata.language).toBe('typescript');
    });

    it('should handle nested headers', async () => {
      const doc: RAGDocument = {
        id: 'doc-nested',
        path: '/test/nested.md',
        content: md(
          '# Main Title',
          '',
          '## Section 1',
          '',
          'Content for section 1.',
          '',
          '### Subsection 1.1',
          '',
          'Content for subsection.',
          '',
          '## Section 2',
          '',
          'Content for section 2.',
        ),
        metadata: {
          title: 'Main Title',
          lastModified: new Date().toISOString(),
          size: 200,
        },
      };

      const chunks = await processor.chunk(doc, 100, 10);

      const headers = chunks.filter(c => c.metadata.type === 'header');
      expect(headers).toHaveLength(4);
      expect(headers[0].metadata.level).toBe(1);
      expect(headers[1].metadata.level).toBe(2);
      expect(headers[2].metadata.level).toBe(3);
      expect(headers[3].metadata.level).toBe(2);
    });

    it('should generate unique chunk IDs', async () => {
      const doc: RAGDocument = {
        id: 'doc-unique',
        path: '/test/unique.md',
        content: md('Para 1', '', 'Para 2', '', 'Para 3'),
        metadata: {
          title: 'Test',
          lastModified: new Date().toISOString(),
          size: 50,
        },
      };

      const chunks = await processor.chunk(doc, 100, 10);

      const ids = chunks.map(c => c.id);
      const uniqueIds = new Set(ids);
      expect(uniqueIds.size).toBe(ids.length);
    });

    it('should maintain chunk order with indices', async () => {
      const doc: RAGDocument = {
        id: 'doc-order',
        path: '/test/order.md',
        content: md('# Title', '', 'First', '', 'Second', '', 'Third'),
        metadata: {
          title: 'Title',
          lastModified: new Date().toISOString(),
          size: 50,
        },
      };

      const chunks = await processor.chunk(doc, 100, 10);

      chunks.forEach((chunk, i) => {
        expect(chunk.index).toBe(i);
      });
    });
  });

  describe('extractMetadata', () => {
    it('should extract basic metadata from path', async () => {
      const content = '# Simple Doc';

      const metadata = await processor.extractMetadata(content, '/docs/guide.md');

      expect(metadata.title).toBe('Simple Doc');
      expect(metadata.size).toBe(content.length);
      expect(metadata.lastModified).toBeDefined();
    });

    it('should prioritize frontmatter metadata', async () => {
      const content = md(
        '---',
        'title: Frontmatter Title',
        'author: Jane Smith',
        'tags:',
        '  - api',
        '  - reference',
        'custom:',
        '  category: technical',
        '---',
        '',
        '# Different Title',
      );

      const metadata = await processor.extractMetadata(content, '/test.md');

      expect(metadata.title).toBe('Frontmatter Title');
      expect(metadata.author).toBe('Jane Smith');
      expect(metadata.tags).toEqual(['api', 'reference']);
      expect(metadata.custom).toEqual({ category: 'technical' });
    });

    it('should handle empty content', async () => {
      const metadata = await processor.extractMetadata('', '/empty.md');

      expect(metadata.title).toBe('empty');
      expect(metadata.size).toBe(0);
    });

    it('should extract title from first header if no frontmatter', async () => {
      const content = md('Some intro text', '', '# Actual Title Here', '', 'More content');

      const metadata = await processor.extractMetadata(content, '/test.md');

      expect(metadata.title).toBe('Actual Title Here');
    });

    it('should handle malformed frontmatter gracefully', async () => {
      // `author:` has no value; the expectation is that it is reported as undefined.
      const content = md('---', 'title: Incomplete', 'author:', '---', '', '# Doc');

      const metadata = await processor.extractMetadata(content, '/test.md');

      expect(metadata.title).toBe('Incomplete');
      expect(metadata.author).toBeUndefined();
    });
  });

  describe('edge cases', () => {
    it('should handle empty documents', async () => {
      const doc = await processor.parse('', '/empty.md');

      expect(doc.content).toBe('');
      expect(doc.metadata.title).toBe('empty');

      const chunks = await processor.chunk(doc, 100, 10);
      expect(chunks).toHaveLength(0);
    });

    it('should handle very large documents', async () => {
      const largeContent = 'Large paragraph. '.repeat(1000);
      const doc: RAGDocument = {
        id: 'doc-large',
        path: '/large.md',
        content: largeContent,
        metadata: {
          title: 'Large',
          lastModified: new Date().toISOString(),
          size: largeContent.length,
        },
      };

      const chunks = await processor.chunk(doc, 200, 20);

      expect(chunks.length).toBeGreaterThan(10);
      chunks.forEach(chunk => {
        // 200 chunk size + 20 overlap, mirroring the tolerance used in the size-limit test.
        expect(chunk.content.length).toBeLessThanOrEqual(220);
        expect(chunk.documentId).toBe('doc-large');
      });
    });

    it('should handle special markdown characters', async () => {
      const content = md(
        '# Title with **bold** and *italic*',
        '',
        'List:',
        '- Item 1',
        '- Item 2',
        '',
        '> Blockquote here',
        '',
        '[Link](https://example.com)',
      );

      const doc = await processor.parse(content, '/special.md');

      // Inline markup is expected to be kept verbatim in the title.
      expect(doc.metadata.title).toBe('Title with **bold** and *italic*');

      const chunks = await processor.chunk(doc, 100, 10);
      const blockquoteChunk = chunks.find(c => c.content.includes('> Blockquote'));
      expect(blockquoteChunk?.metadata.type).toBe('blockquote');
    });
  });
});