// @boundless-oss/atlas
// Version:
// Atlas - MCP Server for comprehensive startup project management
// 401 lines (306 loc) • 10.8 kB
// text/typescript
import { describe, it, expect, beforeEach } from 'vitest';
import { MarkdownDocumentProcessor } from '../document-processor.js';
import type { RAGDocument, RAGChunk, DocumentMetadata } from '../types.js';
describe('MarkdownDocumentProcessor', () => {
let processor: MarkdownDocumentProcessor;
beforeEach(() => {
processor = new MarkdownDocumentProcessor();
});
describe('parse', () => {
it('should parse a simple markdown document', async () => {
const content = `# Test Document
This is a test document with some content.
## Section 1
This is section 1 content.`;
const doc = await processor.parse(content, '/test/doc.md');
expect(doc.id).toMatch(/^doc-/);
expect(doc.path).toBe('/test/doc.md');
expect(doc.content).toBe(content);
expect(doc.metadata.title).toBe('Test Document');
expect(doc.metadata.size).toBe(content.length);
});
it('should extract frontmatter metadata', async () => {
const content = `---
title: Custom Title
author: John Doe
tags: [tutorial, guide]
version: 1.0.0
---
# Test Document
Content here.`;
const doc = await processor.parse(content, '/test/doc.md');
expect(doc.metadata.title).toBe('Custom Title');
expect(doc.metadata.author).toBe('John Doe');
expect(doc.metadata.tags).toEqual(['tutorial', 'guide']);
expect(doc.metadata.version).toBe('1.0.0');
});
it('should handle documents without headers', async () => {
const content = `This is just a paragraph.
Another paragraph here.`;
const doc = await processor.parse(content, '/test/doc.md');
expect(doc.metadata.title).toBe('doc');
expect(doc.content).toBe(content);
});
it('should extract code blocks', async () => {
const content = `# Code Example
Here's some code:
\`\`\`typescript
function hello() {
console.log('Hello, world!');
}
\`\`\`
More text here.`;
const doc = await processor.parse(content, '/test/doc.md');
expect(doc.content).toContain('```typescript');
expect(doc.metadata.title).toBe('Code Example');
});
});
describe('chunk', () => {
it('should chunk by paragraphs', async () => {
const doc: RAGDocument = {
id: 'doc-123',
path: '/test/doc.md',
content: `# Title
First paragraph here.
Second paragraph here.
Third paragraph here.`,
metadata: {
title: 'Title',
lastModified: new Date().toISOString(),
size: 100
}
};
const chunks = await processor.chunk(doc, 100, 10);
expect(chunks).toHaveLength(4); // Header + 3 paragraphs
expect(chunks[0].content).toBe('# Title');
expect(chunks[0].metadata.type).toBe('header');
expect(chunks[1].content).toBe('First paragraph here.');
expect(chunks[1].metadata.type).toBe('paragraph');
});
it('should respect chunk size limits', async () => {
const longParagraph = 'This is a very long paragraph. '.repeat(20);
const doc: RAGDocument = {
id: 'doc-456',
path: '/test/doc.md',
content: `# Title
${longParagraph}
Short paragraph.`,
metadata: {
title: 'Title',
lastModified: new Date().toISOString(),
size: 1000
}
};
const chunks = await processor.chunk(doc, 100, 20);
// Long paragraph should be split
const longChunks = chunks.filter(c => c.content.includes('very long paragraph'));
expect(longChunks.length).toBeGreaterThan(1);
// Each chunk should respect size limit (with some tolerance for word boundaries)
chunks.forEach(chunk => {
expect(chunk.content.length).toBeLessThanOrEqual(120); // 100 + 20 overlap tolerance
});
});
it('should handle chunk overlap', async () => {
const doc: RAGDocument = {
id: 'doc-789',
path: '/test/doc.md',
content: `First sentence. Second sentence. Third sentence. Fourth sentence.`,
metadata: {
title: 'Test',
lastModified: new Date().toISOString(),
size: 100
}
};
const chunks = await processor.chunk(doc, 30, 10);
// Check that chunks overlap
expect(chunks.length).toBeGreaterThan(1);
// Find overlapping content between consecutive chunks
for (let i = 0; i < chunks.length - 1; i++) {
const currentEnd = chunks[i].content.slice(-10);
const nextStart = chunks[i + 1].content.slice(0, 10);
// There should be some overlap (not exact due to word boundaries)
const hasOverlap = chunks[i].content.includes(chunks[i + 1].content.split(' ')[0]);
expect(hasOverlap || i === chunks.length - 2).toBe(true);
}
});
it('should preserve code blocks as single chunks', async () => {
const doc: RAGDocument = {
id: 'doc-code',
path: '/test/code.md',
content: `# Code Example
Some text before.
\`\`\`typescript
function longFunction() {
// This is a long code block
// that should not be split
const a = 1;
const b = 2;
return a + b;
}
\`\`\`
Some text after.`,
metadata: {
title: 'Code Example',
lastModified: new Date().toISOString(),
size: 300
}
};
const chunks = await processor.chunk(doc, 50, 10);
const codeChunk = chunks.find(c => c.metadata.type === 'code');
expect(codeChunk).toBeDefined();
expect(codeChunk!.content).toContain('function longFunction()');
expect(codeChunk!.metadata.language).toBe('typescript');
});
it('should handle nested headers', async () => {
const doc: RAGDocument = {
id: 'doc-nested',
path: '/test/nested.md',
content: `# Main Title
## Section 1
Content for section 1.
### Subsection 1.1
Content for subsection.
## Section 2
Content for section 2.`,
metadata: {
title: 'Main Title',
lastModified: new Date().toISOString(),
size: 200
}
};
const chunks = await processor.chunk(doc, 100, 10);
const headers = chunks.filter(c => c.metadata.type === 'header');
expect(headers).toHaveLength(4);
expect(headers[0].metadata.level).toBe(1);
expect(headers[1].metadata.level).toBe(2);
expect(headers[2].metadata.level).toBe(3);
expect(headers[3].metadata.level).toBe(2);
});
it('should generate unique chunk IDs', async () => {
const doc: RAGDocument = {
id: 'doc-unique',
path: '/test/unique.md',
content: `Para 1
Para 2
Para 3`,
metadata: {
title: 'Test',
lastModified: new Date().toISOString(),
size: 50
}
};
const chunks = await processor.chunk(doc, 100, 10);
const ids = chunks.map(c => c.id);
const uniqueIds = new Set(ids);
expect(uniqueIds.size).toBe(ids.length);
});
it('should maintain chunk order with indices', async () => {
const doc: RAGDocument = {
id: 'doc-order',
path: '/test/order.md',
content: `# Title
First
Second
Third`,
metadata: {
title: 'Title',
lastModified: new Date().toISOString(),
size: 50
}
};
const chunks = await processor.chunk(doc, 100, 10);
chunks.forEach((chunk, i) => {
expect(chunk.index).toBe(i);
});
});
});
describe('extractMetadata', () => {
it('should extract basic metadata from path', async () => {
const content = '# Simple Doc';
const metadata = await processor.extractMetadata(content, '/docs/guide.md');
expect(metadata.title).toBe('Simple Doc');
expect(metadata.size).toBe(content.length);
expect(metadata.lastModified).toBeDefined();
});
it('should prioritize frontmatter metadata', async () => {
const content = `---
title: Frontmatter Title
author: Jane Smith
tags:
- api
- reference
custom:
category: technical
---
# Different Title`;
const metadata = await processor.extractMetadata(content, '/test.md');
expect(metadata.title).toBe('Frontmatter Title');
expect(metadata.author).toBe('Jane Smith');
expect(metadata.tags).toEqual(['api', 'reference']);
expect(metadata.custom).toEqual({ category: 'technical' });
});
it('should handle empty content', async () => {
const metadata = await processor.extractMetadata('', '/empty.md');
expect(metadata.title).toBe('empty');
expect(metadata.size).toBe(0);
});
it('should extract title from first header if no frontmatter', async () => {
const content = `Some intro text
# Actual Title Here
More content`;
const metadata = await processor.extractMetadata(content, '/test.md');
expect(metadata.title).toBe('Actual Title Here');
});
it('should handle malformed frontmatter gracefully', async () => {
const content = `---
title: Incomplete
author:
---
# Doc`;
const metadata = await processor.extractMetadata(content, '/test.md');
expect(metadata.title).toBe('Incomplete');
expect(metadata.author).toBeUndefined();
});
});
describe('edge cases', () => {
it('should handle empty documents', async () => {
const doc = await processor.parse('', '/empty.md');
expect(doc.content).toBe('');
expect(doc.metadata.title).toBe('empty');
const chunks = await processor.chunk(doc, 100, 10);
expect(chunks).toHaveLength(0);
});
it('should handle very large documents', async () => {
const largeContent = 'Large paragraph. '.repeat(1000);
const doc: RAGDocument = {
id: 'doc-large',
path: '/large.md',
content: largeContent,
metadata: {
title: 'Large',
lastModified: new Date().toISOString(),
size: largeContent.length
}
};
const chunks = await processor.chunk(doc, 200, 20);
expect(chunks.length).toBeGreaterThan(10);
chunks.forEach(chunk => {
expect(chunk.content.length).toBeLessThanOrEqual(220);
expect(chunk.documentId).toBe('doc-large');
});
});
it('should handle special markdown characters', async () => {
const content = `# Title with **bold** and *italic*
List:
- Item 1
- Item 2
> Blockquote here
[Link](https://example.com)`;
const doc = await processor.parse(content, '/special.md');
expect(doc.metadata.title).toBe('Title with **bold** and *italic*');
const chunks = await processor.chunk(doc, 100, 10);
const blockquoteChunk = chunks.find(c => c.content.includes('> Blockquote'));
expect(blockquoteChunk?.metadata.type).toBe('blockquote');
});
});
});