/*
 * @boundless-oss/atlas
 * Version:
 * Atlas - MCP Server for comprehensive startup project management
 * 381 lines (340 loc) • 12.1 kB
 * text/typescript
 */
import { promises as fs } from 'fs';
import path from 'path';
import type {
RAGDocument,
RAGChunk,
DocumentMetadata,
ChunkMetadata,
DocumentProcessor
} from './types.js';
/**
 * Markdown implementation of `DocumentProcessor`.
 *
 * Turns raw Markdown text into a document record (`parse`), derives metadata
 * from frontmatter and headings (`extractMetadata`), and splits the text into
 * chunks along fenced code blocks, headers, blank lines and a size budget
 * (`chunk`).
 */
export class MarkdownDocumentProcessor implements DocumentProcessor {
  /** Counter used to build chunk ids; it keeps increasing across every `chunk` call on this instance. */
  private chunkIdCounter = 0;

  /**
   * Builds a document record for `content`.
   *
   * @param content  Raw Markdown text.
   * @param filePath Path stored on the document and used to derive the default title.
   * @returns The document with a freshly generated id (timestamp plus random suffix) and extracted metadata.
   */
  async parse(content: string, filePath: string): Promise<RAGDocument> {
    const metadata = await this.extractMetadata(content, filePath);
    return {
      // `slice(2, 11)` takes the same nine characters the deprecated `substr(2, 9)` did.
      id: `doc-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`,
      path: filePath,
      content,
      metadata
    };
  }

  /**
   * Splits a document into chunks, processing it line by line.
   *
   * - A fenced code block (lines starting with three backticks) becomes one `code` chunk,
   *   including a block that is still open when the input ends.
   * - Every header line becomes its own `header` chunk carrying its level.
   * - Blank lines end the current text chunk.
   * - Text is otherwise accumulated until adding a line would exceed `chunkSize`;
   *   a single line longer than `chunkSize` is split on whitespace.
   *
   * Offsets are tracked assuming each line is followed by exactly one `\n`, so the
   * end offset of the final chunk can exceed the content length by one when the
   * input has no trailing newline. With overlap enabled, chunk text is re-joined
   * with single spaces, so offsets of overlapping chunks are approximate.
   *
   * @param document  Document whose `content` is chunked and whose `id` is stamped on each chunk.
   * @param chunkSize Maximum length, in characters, of an accumulated text chunk.
   * @param overlap   Approximate number of characters repeated from the previous chunk
   *                  (converted to words at five characters per word); `0` disables overlap.
   * @returns Chunks in document order with consecutive `index` values starting at 0.
   */
  async chunk(document: RAGDocument, chunkSize: number, overlap: number): Promise<RAGChunk[]> {
    const chunks: RAGChunk[] = [];
    const lines = document.content.split('\n');

    let currentChunk = '';
    let currentIndex = 0;
    let startOffset = 0;

    let inCodeBlock = false;
    let codeBlockContent = '';
    let codeBlockLanguage = '';
    let codeBlockStart = 0;

    // Emits the accumulated text chunk, if it holds anything but whitespace.
    const flushText = (): void => {
      if (!currentChunk.trim()) return;
      chunks.push(this.createChunk(
        document.id,
        currentChunk.trim(),
        currentIndex++,
        startOffset,
        startOffset + currentChunk.length,
        this.detectChunkType(currentChunk)
      ));
    };

    // Emits the fenced code block collected so far as a single `code` chunk.
    const flushCodeBlock = (): void => {
      chunks.push(this.createChunk(
        document.id,
        codeBlockContent.trim(),
        currentIndex++,
        codeBlockStart,
        codeBlockStart + codeBlockContent.length,
        {
          type: 'code',
          language: codeBlockLanguage || undefined
        }
      ));
    };

    for (const line of lines) {
      const trimmedLine = line.trim();
      const lineWithNewline = line + '\n';

      // Fence line: opens or closes a code block.
      if (trimmedLine.startsWith('```')) {
        if (!inCodeBlock) {
          flushText();
          inCodeBlock = true;
          codeBlockStart = startOffset + currentChunk.length;
          codeBlockContent = lineWithNewline;
          codeBlockLanguage = trimmedLine.slice(3).trim();
          currentChunk = '';
        } else {
          codeBlockContent += lineWithNewline;
          flushCodeBlock();
          // Advance past the block BEFORE clearing its content; computing this
          // after the reset would rewind the offset to the start of the block.
          startOffset = codeBlockStart + codeBlockContent.length;
          inCodeBlock = false;
          codeBlockContent = '';
          codeBlockLanguage = '';
          currentChunk = '';
        }
        continue;
      }

      if (inCodeBlock) {
        codeBlockContent += lineWithNewline;
        continue;
      }

      // Header line: flush pending text, then emit the header on its own.
      const headerMatch = /^(#+)\s/.exec(trimmedLine);
      if (headerMatch) {
        flushText();
        const headerStart = startOffset + currentChunk.length;
        chunks.push(this.createChunk(
          document.id,
          trimmedLine,
          currentIndex++,
          headerStart,
          headerStart + line.length,
          // The level comes from the trimmed line so indented headers are counted correctly.
          { type: 'header', level: headerMatch[1].length }
        ));
        startOffset = headerStart + lineWithNewline.length;
        currentChunk = '';
        continue;
      }

      // Blank line: paragraph break.
      if (trimmedLine === '') {
        if (currentChunk.trim()) {
          flushText();
          startOffset += currentChunk.length + lineWithNewline.length;
          currentChunk = '';
        } else {
          startOffset += lineWithNewline.length;
        }
        continue;
      }

      // Regular text line.
      const potentialChunk = currentChunk + lineWithNewline;
      if (potentialChunk.length > chunkSize) {
        if (currentChunk.length > 0) {
          // The pending chunk is full: emit it and start a new one with this line.
          flushText();
          // Remember the flushed length; the offset math below must use it,
          // not the length of the chunk that replaces it.
          const previousLength = currentChunk.length;
          if (overlap > 0) {
            const words = currentChunk.trim().split(/\s+/);
            const overlapWords = Math.ceil(overlap / 5); // Approximate words for overlap
            const overlapText = words.slice(-overlapWords).join(' ');
            currentChunk = overlapText + ' ' + lineWithNewline;
            // The new chunk starts where the repeated tail began in the old one
            // (the extra 1 is the old chunk's trailing newline).
            startOffset += previousLength - overlapText.length - 1;
          } else {
            currentChunk = lineWithNewline;
            startOffset += previousLength;
          }
        } else {
          // A single line exceeds the budget: split it on whitespace.
          let tempChunk = '';
          for (const word of line.split(/\s+/)) {
            const candidate = tempChunk + (tempChunk ? ' ' : '') + word;
            if (candidate.length > chunkSize && tempChunk) {
              chunks.push(this.createChunk(
                document.id,
                tempChunk.trim(),
                currentIndex++,
                startOffset,
                startOffset + tempChunk.length,
                this.detectChunkType(tempChunk)
              ));
              startOffset += tempChunk.length + 1;
              tempChunk = word;
            } else {
              tempChunk = candidate;
            }
          }
          // Whatever is left keeps accumulating as the current chunk.
          currentChunk = tempChunk + '\n';
        }
      } else {
        currentChunk = potentialChunk;
      }
    }

    // A fence that was never closed still holds content; emit it instead of dropping it.
    if (inCodeBlock && codeBlockContent.trim()) {
      flushCodeBlock();
    }
    flushText();

    return chunks;
  }

  /**
   * Derives document metadata from the text.
   *
   * Defaults: `title` is the file name without its extension, `lastModified` is the
   * time of this call (the file system is not consulted), and `size` is
   * `content.length`. A leading `---` frontmatter block may override `title`,
   * `author`, `tags`, `version` and `custom`; any other frontmatter key is merged
   * into `custom`. When the frontmatter supplies no title, the first header after
   * the frontmatter is used.
   *
   * @param content  Raw Markdown text (LF or CRLF line endings).
   * @param filePath Path used only to derive the default title.
   * @returns The extracted metadata.
   */
  async extractMetadata(content: string, filePath: string): Promise<DocumentMetadata> {
    const fileName = path.basename(filePath, path.extname(filePath));

    const metadata: DocumentMetadata = {
      title: fileName,
      lastModified: new Date().toISOString(),
      size: content.length
    };

    // Tracked explicitly: comparing the title with the file name would wrongly
    // discard a frontmatter title that happens to equal the file name.
    let titleFromFrontmatter = false;
    // Text after the frontmatter; the header fallback must not see YAML `# comments`.
    let body = content;

    const frontmatterMatch = /^---\r?\n([\s\S]*?)\r?\n---/.exec(content);
    if (frontmatterMatch) {
      body = content.slice(frontmatterMatch[0].length);
      try {
        const frontmatterData = this.parseFrontmatter(frontmatterMatch[1]);

        if (frontmatterData.title) {
          // Coerce so a numeric title (e.g. `title: 2024`) is stored as text.
          metadata.title = String(frontmatterData.title);
          titleFromFrontmatter = true;
        }
        if (frontmatterData.author) metadata.author = frontmatterData.author;
        if (frontmatterData.tags) {
          metadata.tags = Array.isArray(frontmatterData.tags)
            ? frontmatterData.tags
            : [frontmatterData.tags];
        }
        if (frontmatterData.version) metadata.version = String(frontmatterData.version);
        if (frontmatterData.custom) metadata.custom = frontmatterData.custom;

        // Copy any other frontmatter fields to custom
        const knownFields = ['title', 'author', 'tags', 'version', 'custom'];
        const customFields: Record<string, any> = {};
        for (const [key, value] of Object.entries(frontmatterData)) {
          if (!knownFields.includes(key)) {
            customFields[key] = value;
          }
        }
        if (Object.keys(customFields).length > 0) {
          metadata.custom = { ...metadata.custom, ...customFields };
        }
      } catch {
        // Best effort: malformed frontmatter leaves the defaults in place.
      }
    }

    if (!titleFromFrontmatter) {
      // `[ \t]+` keeps the match on one line; `\s+` could jump from a bare `#` to the next line.
      const headerMatch = /^#+[ \t]+(.+)$/m.exec(body);
      if (headerMatch) {
        metadata.title = headerMatch[1].trim();
      }
    }

    return metadata;
  }

  /**
   * Assembles a chunk record.
   *
   * @param documentId  Id of the owning document; also part of the chunk id.
   * @param content     Chunk text.
   * @param index       Position of the chunk within its document.
   * @param startOffset Offset where the chunk starts.
   * @param endOffset   Offset where the chunk ends.
   * @param metadata    Extra metadata; overrides the default `type` of `'paragraph'`.
   */
  private createChunk(
    documentId: string,
    content: string,
    index: number,
    startOffset: number,
    endOffset: number,
    metadata: Partial<ChunkMetadata>
  ): RAGChunk {
    return {
      id: `chunk-${documentId}-${this.chunkIdCounter++}`,
      documentId,
      content,
      index,
      metadata: {
        startOffset,
        endOffset,
        type: 'paragraph',
        ...metadata
      } as ChunkMetadata
    };
  }

  /**
   * Classifies a text chunk by how it begins: `>` is a blockquote, a bullet
   * (`-`, `*`, `+`) or `N.` marker is a list, anything else is a paragraph.
   */
  private detectChunkType(content: string): Partial<ChunkMetadata> {
    const trimmed = content.trim();
    if (trimmed.startsWith('>')) {
      return { type: 'blockquote' };
    }
    if (/^[-*+]\s/.test(trimmed) || /^\d+\.\s/.test(trimmed)) {
      return { type: 'list' };
    }
    return { type: 'paragraph' };
  }

  /**
   * Parses a small subset of YAML frontmatter.
   *
   * Supported forms: `key: value` scalars (unsigned numbers, booleans, plain or
   * quoted strings), inline arrays (`key: [a, b]`), dash lists on the following
   * lines, and one level of nested `key: value` pairs whose values are kept as
   * strings. Keys may contain letters, digits, `_` and `-`. Lines that fit none
   * of these forms are ignored, and keys with an empty value are omitted.
   *
   * @param content Frontmatter text without the `---` delimiters (LF or CRLF).
   * @returns The parsed key/value pairs.
   */
  private parseFrontmatter(content: string): Record<string, any> {
    const result: Record<string, any> = {};
    // Split on CRLF too: a trailing `\r` would stop the key regex from matching.
    const lines = content.split(/\r?\n/);
    let currentKey: string | null = null;

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      const trimmedLine = line.trim();
      if (!trimmedLine) continue;

      const match = /^(\s*)(\w[\w-]*):\s*(.*)$/.exec(line);
      if (match) {
        const key = match[2];
        const value = match[3].trim();
        const nextLine = i + 1 < lines.length ? lines[i + 1] : undefined;
        currentKey = key;

        if (value.startsWith('[') && value.endsWith(']')) {
          result[key] = this.parseInlineArray(key, value);
        } else if (value === '' && nextLine !== undefined && nextLine.trim().startsWith('-')) {
          // Dash list: items are appended as the following lines are visited.
          result[key] = [];
        } else if (value === '' && nextLine !== undefined && /^\s+\w[\w-]*:/.test(nextLine)) {
          // Nested object: consume every following indented line.
          const nested: Record<string, any> = {};
          let j = i + 1;
          while (j < lines.length && /^\s+/.test(lines[j])) {
            const nestedMatch = /^\s+(\w[\w-]*):\s*(.*)$/.exec(lines[j]);
            if (nestedMatch) {
              nested[nestedMatch[1]] = this.stripQuotes(nestedMatch[2].trim());
            }
            j++;
          }
          result[key] = nested;
          i = j - 1; // Skip processed lines
        } else if (value !== '') {
          result[key] = this.parseScalar(value);
        }
        // An empty value with no list or object following leaves the key unset.
      } else if (trimmedLine.startsWith('-') && currentKey && Array.isArray(result[currentKey])) {
        result[currentKey].push(this.stripQuotes(trimmedLine.slice(1).trim()));
      }
    }

    return result;
  }

  /**
   * Parses an inline `[a, b]` array. Unquoted `tags` are always split on commas
   * and kept as strings; other values are tried as JSON first and fall back to
   * comma splitting with surrounding quotes removed. Empty items are dropped.
   */
  private parseInlineArray(key: string, value: string): unknown {
    const splitItems = (): string[] =>
      value
        .slice(1, -1)
        .split(',')
        .map((item) => this.stripQuotes(item.trim()))
        .filter((item) => item !== '');

    if (key === 'tags' && !value.includes('"') && !value.includes("'")) {
      return splitItems();
    }
    try {
      return JSON.parse(value);
    } catch {
      // Not valid JSON (e.g. single-quoted or bare items): split manually.
      return splitItems();
    }
  }

  /** Converts a scalar to a string (when quoted), number, boolean, or plain string. */
  private parseScalar(value: string): string | number | boolean {
    const unquoted = this.stripQuotes(value);
    if (unquoted !== value) return unquoted; // quoted values are always strings
    if (/^\d+(\.\d+)?$/.test(value)) return parseFloat(value);
    if (value === 'true' || value === 'false') return value === 'true';
    return value;
  }

  /** Removes one pair of matching single or double quotes wrapping `value`, if present. */
  private stripQuotes(value: string): string {
    if (value.length >= 2) {
      const first = value.charAt(0);
      if ((first === '"' || first === "'") && value.endsWith(first)) {
        return value.slice(1, -1);
      }
    }
    return value;
  }
}