// @boundless-oss/atlas

import { promises as fs } from 'fs';
import path from 'path';
import type {
  RAGDocument,
  RAGChunk,
  DocumentMetadata,
  ChunkMetadata,
  DocumentProcessor
} from './types.js';

/**
 * Parses Markdown text into a `RAGDocument` and splits it into `RAGChunk`s.
 *
 * Chunking is line-oriented: fenced code blocks and headers always become
 * their own chunks, blank lines end a paragraph chunk, and running text is
 * split once it would exceed the requested chunk size.
 */
export class MarkdownDocumentProcessor implements DocumentProcessor {
  /** Monotonic counter used to build chunk ids; shared by every document this instance processes. */
  private chunkIdCounter = 0;

  /**
   * Wraps raw Markdown in a `RAGDocument`.
   *
   * @param content - Full text of the document.
   * @param filePath - Path stored on the document and used to derive the default title.
   * @returns The document with a generated id and metadata from {@link extractMetadata}.
   */
  async parse(content: string, filePath: string): Promise<RAGDocument> {
    const metadata = await this.extractMetadata(content, filePath);

    return {
      // `slice(2, 11)` yields the same nine characters as the deprecated `substr(2, 9)`.
      id: `doc-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`,
      path: filePath,
      content,
      metadata
    };
  }

  /**
   * Splits a document into chunks.
   *
   * Every chunk's `startOffset`/`endOffset` are character offsets into
   * `document.content`. They are exact unless the chunk begins with overlap
   * text whose source whitespace was wider than one character, in which case
   * the start is an approximation.
   *
   * @param document - Document whose `content` is chunked.
   * @param chunkSize - Maximum length (in characters, including the trailing
   *   newline of each line) of a text chunk before it is split.
   * @param overlap - Approximate number of characters from the end of a full
   *   chunk to repeat at the start of the next one; `0` disables overlap.
   * @returns Chunks in document order, with consecutive `index` values from 0.
   */
  async chunk(document: RAGDocument, chunkSize: number, overlap: number): Promise<RAGChunk[]> {
    const chunks: RAGChunk[] = [];
    const lines = document.content.split('\n');
    const documentLength = document.content.length;

    let nextIndex = 0;
    // Absolute offset of the line about to be read.
    let offset = 0;

    // Text chunk being accumulated and the absolute offset where it began.
    let currentChunk = '';
    let chunkStart = 0;

    let inCodeBlock = false;
    let codeBlockContent = '';
    let codeBlockLanguage = '';
    let codeBlockStart = 0;

    // Every line is accounted for as `line + '\n'`, which over-counts by one on
    // a final line without a newline, so the end offset is clamped.
    const emit = (
      rawText: string,
      start: number,
      metadata: Partial<ChunkMetadata>,
      rawLength: number = rawText.length
    ): void => {
      chunks.push(
        this.createChunk(
          document.id,
          rawText.trim(),
          nextIndex++,
          start,
          Math.min(start + rawLength, documentLength),
          metadata
        )
      );
    };

    const flushText = (): void => {
      if (currentChunk.trim()) {
        emit(currentChunk, chunkStart, this.detectChunkType(currentChunk));
      }
      currentChunk = '';
    };

    const flushCodeBlock = (): void => {
      emit(codeBlockContent, codeBlockStart, {
        type: 'code',
        language: codeBlockLanguage || undefined
      });
      inCodeBlock = false;
      codeBlockContent = '';
      codeBlockLanguage = '';
    };

    for (const line of lines) {
      const lineStart = offset;
      const lineWithNewline = line + '\n';
      offset += lineWithNewline.length;
      const trimmed = line.trim();

      // Fence lines open or close a code block.
      if (trimmed.startsWith('```')) {
        if (!inCodeBlock) {
          flushText();
          inCodeBlock = true;
          codeBlockContent = lineWithNewline;
          codeBlockLanguage = trimmed.slice(3).trim();
          codeBlockStart = lineStart;
        } else {
          codeBlockContent += lineWithNewline;
          flushCodeBlock();
        }
        continue;
      }

      if (inCodeBlock) {
        codeBlockContent += lineWithNewline;
        continue;
      }

      // Headers are emitted as standalone chunks.
      if (/^#+\s/.test(trimmed)) {
        flushText();
        // Measure on the trimmed line so an indented header keeps its real level.
        const headerLevel = /^#+/.exec(trimmed)?.[0].length ?? 1;
        emit(line, lineStart, { type: 'header', level: headerLevel }, line.length);
        continue;
      }

      // Blank lines end the current paragraph.
      if (trimmed === '') {
        flushText();
        continue;
      }

      const potentialChunk = currentChunk + lineWithNewline;
      if (potentialChunk.length <= chunkSize) {
        if (currentChunk === '') {
          chunkStart = lineStart;
        }
        currentChunk = potentialChunk;
        continue;
      }

      if (currentChunk.length > 0) {
        // The accumulated chunk is full: emit it and start the next one with this line.
        const fullChunk = currentChunk;
        emit(fullChunk, chunkStart, this.detectChunkType(fullChunk));

        if (overlap > 0) {
          const words = fullChunk.trim().split(/\s+/);
          const overlapWords = Math.ceil(overlap / 5); // Approximate words for overlap
          const overlapText = words.slice(-overlapWords).join(' ');
          currentChunk = overlapText + ' ' + lineWithNewline;
          // The overlap words sit immediately before the newline that precedes this line.
          chunkStart = Math.max(0, lineStart - overlapText.length - 1);
        } else {
          currentChunk = lineWithNewline;
          chunkStart = lineStart;
        }
        continue;
      }

      // A single line longer than chunkSize: split it on whitespace.
      let pieceStart = lineStart;
      let piece = '';
      for (const word of line.split(/\s+/)) {
        const candidate = piece ? piece + ' ' + word : word;
        if (candidate.length > chunkSize && piece) {
          emit(piece, pieceStart, this.detectChunkType(piece));
          pieceStart += piece.length + 1;
          piece = word;
        } else {
          piece = candidate;
        }
      }
      // The remaining words continue as the current chunk.
      currentChunk = piece + '\n';
      chunkStart = pieceStart;
    }

    // A fence that is never closed runs to the end of the document; keep its content.
    if (inCodeBlock && codeBlockContent.trim()) {
      flushCodeBlock();
    }
    flushText();

    return chunks;
  }

  /**
   * Builds document metadata from the file name, optional frontmatter and the
   * first Markdown header.
   *
   * Title precedence: frontmatter `title`, then the first header after the
   * frontmatter, then the file name without its extension. Frontmatter keys
   * other than `title`, `author`, `tags`, `version` and `custom` are merged
   * into `metadata.custom`.
   *
   * @param content - Full text of the document.
   * @param filePath - Path used to derive the default title.
   * @returns Metadata; `lastModified` is the time of this call and `size` is
   *   `content.length`.
   */
  async extractMetadata(content: string, filePath: string): Promise<DocumentMetadata> {
    const fileName = path.basename(filePath, path.extname(filePath));

    const metadata: DocumentMetadata = {
      title: fileName,
      lastModified: new Date().toISOString(),
      size: content.length
    };

    let titleFromFrontmatter = false;
    // `\r?` lets files with CRLF line endings keep their frontmatter.
    const frontmatterMatch = content.match(/^---\r?\n([\s\S]*?)\r?\n---/);

    if (frontmatterMatch) {
      try {
        const frontmatterData = this.parseFrontmatter(frontmatterMatch[1] ?? '');

        const title = frontmatterData.title;
        if (title && typeof title !== 'object') {
          metadata.title = String(title);
          titleFromFrontmatter = true;
        }
        if (frontmatterData.author) metadata.author = frontmatterData.author;
        if (frontmatterData.tags) {
          metadata.tags = Array.isArray(frontmatterData.tags)
            ? frontmatterData.tags
            : [frontmatterData.tags];
        }
        if (frontmatterData.version) metadata.version = String(frontmatterData.version);
        if (frontmatterData.custom) metadata.custom = frontmatterData.custom;

        // Copy any other frontmatter fields to custom.
        const knownFields = ['title', 'author', 'tags', 'version', 'custom'];
        const customFields: Record<string, unknown> = {};
        for (const [key, value] of Object.entries(frontmatterData)) {
          if (!knownFields.includes(key)) {
            customFields[key] = value;
          }
        }
        if (Object.keys(customFields).length > 0) {
          metadata.custom = { ...metadata.custom, ...customFields };
        }
      } catch {
        // Frontmatter is best-effort: fall back to the defaults when it cannot be read.
      }
    }

    if (!titleFromFrontmatter) {
      // Search after the frontmatter so a `# comment` inside it is not taken for a header.
      const body = frontmatterMatch ? content.slice(frontmatterMatch[0].length) : content;
      const headerMatch = body.match(/^#+\s+(.+)$/m);
      if (headerMatch?.[1]) {
        metadata.title = headerMatch[1].trim();
      }
    }

    return metadata;
  }

  /**
   * Assembles a chunk record.
   *
   * `metadata` is spread last, so it may override the default `type` of
   * `'paragraph'` (and the offsets, if it carries them).
   */
  private createChunk(
    documentId: string,
    content: string,
    index: number,
    startOffset: number,
    endOffset: number,
    metadata: Partial<ChunkMetadata>
  ): RAGChunk {
    return {
      id: `chunk-${documentId}-${this.chunkIdCounter++}`,
      documentId,
      content,
      index,
      metadata: {
        startOffset,
        endOffset,
        type: 'paragraph',
        ...metadata
      } as ChunkMetadata
    };
  }

  /** Classifies text by how it begins: `>` is a blockquote, a list marker is a list, anything else a paragraph. */
  private detectChunkType(content: string): Partial<ChunkMetadata> {
    const trimmed = content.trim();

    if (trimmed.startsWith('>')) {
      return { type: 'blockquote' };
    }
    if (/^[-*+]\s/.test(trimmed) || /^\d+\.\s/.test(trimmed)) {
      return { type: 'list' };
    }
    return { type: 'paragraph' };
  }

  /**
   * Minimal reader for a YAML-like frontmatter body (not a full YAML parser).
   *
   * Supports `key: value` scalars (unsigned numbers, `true`/`false`, strings
   * with optional matching quotes), inline `[a, b]` arrays, dash lists, and
   * one level of nested `key: value` objects whose values stay strings.
   */
  // Values are heterogeneous and copied straight into DocumentMetadata fields.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private parseFrontmatter(content: string): Record<string, any> {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const result: Record<string, any> = {};
    const lines = content.split(/\r?\n/);
    let currentKey: string | null = null;

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i] ?? '';
      const trimmedLine = line.trim();

      if (!trimmedLine) continue;

      const match = line.match(/^(\s*)(\w+):\s*(.*)$/);
      if (!match) {
        // Items of a dash list opened by the most recent key.
        if (trimmedLine.startsWith('-') && currentKey && Array.isArray(result[currentKey])) {
          result[currentKey].push(this.stripQuotes(trimmedLine.slice(1).trim()));
        }
        continue;
      }

      const key = match[2] ?? '';
      const value = (match[3] ?? '').trim();
      const nextLine = lines[i + 1];
      currentKey = key;

      if (value.startsWith('[') && value.endsWith(']')) {
        // Inline array. Unquoted tags such as `[tutorial, guide]` are not valid JSON.
        try {
          if (key === 'tags' && !value.includes('"') && !value.includes("'")) {
            result[key] = value
              .slice(1, -1)
              .split(',')
              .map((s) => s.trim());
          } else {
            result[key] = JSON.parse(value);
          }
        } catch {
          result[key] = value;
        }
      } else if (value === '' && nextLine !== undefined && nextLine.trim().startsWith('-')) {
        // Dash list: items are appended as the following lines are read.
        result[key] = [];
      } else if (value === '' && nextLine !== undefined && /^\s+\w+:/.test(nextLine)) {
        // Nested object: consume every following indented line.
        const nested: Record<string, string> = {};
        let j = i + 1;
        for (; j < lines.length; j++) {
          const nestedLine = lines[j];
          if (nestedLine === undefined || !/^\s+/.test(nestedLine)) break;
          const nestedMatch = nestedLine.match(/^\s+(\w+):\s*(.*)$/);
          if (nestedMatch?.[1] !== undefined) {
            nested[nestedMatch[1]] = this.stripQuotes((nestedMatch[2] ?? '').trim());
          }
        }
        result[key] = nested;
        i = j - 1; // Skip processed lines
      } else if (value === '') {
        // Keys without a value are left unset.
      } else if (/^\d+(\.\d+)?$/.test(value)) {
        result[key] = parseFloat(value);
      } else if (value === 'true' || value === 'false') {
        result[key] = value === 'true';
      } else {
        result[key] = this.stripQuotes(value);
      }
    }

    return result;
  }

  /** Removes one pair of matching single or double quotes surrounding `value`, if present. */
  private stripQuotes(value: string): string {
    if (value.length >= 2) {
      const quote = value.charAt(0);
      if ((quote === '"' || quote === "'") && value.endsWith(quote)) {
        return value.slice(1, -1);
      }
    }
    return value;
  }
}