@a24z/markdown-search
Version:
High-performance full-text search for markdown documents
292 lines • 11.4 kB
JavaScript
;
/**
 * DocumentIndexer - Converts parsed markdown documents into searchable documents
 */
// Flag this CommonJS module as transpiled ES-module output so interop helpers treat it accordingly.
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the export slot up front; the class is assigned to it after its definition.
exports.DocumentIndexer = void 0;
// @a24z/markdown-utils supplies parseMarkdownIntoPresentation and CHUNK_TYPES, both used by DocumentIndexer.
const markdown_utils_1 = require("@a24z/markdown-utils");
// Title prefix per block type; types without an entry (e.g. 'paragraph') get no prefix.
const BLOCK_TITLE_PREFIXES = new Map([
  ['code', 'Code: '],
  ['mermaid', 'Diagram: '],
  ['heading', 'Heading: '],
  ['table', 'Table: '],
  ['list', 'List: '],
]);
// Ranking boost per block type; any type not listed is neutral (1.0).
const BLOCK_BOOSTS = new Map([
  ['heading', 1.5], // Headings are most important
  ['code', 1.2], // Code blocks are important
  ['mermaid', 1.1], // Diagrams are also valuable
  ['table', 1.1], // Tables contain structured data
]);
/**
 * Turns a parsed markdown document into a flat list of searchable entries:
 * one for the whole document, one per section and, optionally, one per block.
 */
class DocumentIndexer {
  /**
   * Create searchable documents from a markdown document.
   *
   * @param {object} document - Parsed document; reads `content`, `title` and `sections`.
   * @param {object} fileInfo - File descriptor; reads `uri`, `path` and `name`.
   * @param {object} [options] - Only `indexChunks` is read; when truthy, the
   *   blocks of every section are indexed as well.
   * @returns {object[]} The document entry first, then each section entry
   *   immediately followed by its block entries (if any).
   */
  createSearchDocuments(document, fileInfo, options) {
    const fileUri = fileInfo.uri || fileInfo.path;
    const sections = document.sections;
    const sectionCount = sections?.length || 0;
    // Captured once so every entry produced by this pass carries the same timestamp.
    const indexedAt = new Date().toISOString();
    // Index the entire document as one searchable item
    const mainDoc = {
      id: `${fileUri}#document`,
      type: 'document',
      fileUri,
      fileName: fileInfo.name,
      filePath: fileInfo.path,
      content: document.content,
      title: document.title || fileInfo.name,
      location: {
        startLine: 0,
        endLine: document.content.split('\n').length,
      },
      metadata: this.analyzeDocumentContent(document),
      contentHash: this.generateContentHash(document.content),
      indexedAt,
    };
    const documents = [mainDoc];
    // Index individual sections
    sections?.forEach((section, sectionIndex) => {
      const hasPrevious = sectionIndex > 0;
      const hasNext = sectionIndex < sectionCount - 1;
      const sectionDoc = {
        id: `${fileUri}#section-${sectionIndex}`,
        type: 'section',
        parentId: mainDoc.id,
        fileUri,
        fileName: fileInfo.name,
        filePath: fileInfo.path,
        content: section.content,
        title: section.title || `Section ${sectionIndex + 1}`,
        location: {
          startLine: section.startLine || 0,
          endLine: section.endLine || 0,
        },
        sectionIndex,
        sectionNumber: sectionIndex + 1,
        sectionLevel: section.level,
        totalSectionsInFile: sectionCount,
        previousSectionTitle: hasPrevious ? sections[sectionIndex - 1]?.title : undefined,
        nextSectionTitle: hasNext ? sections[sectionIndex + 1]?.title : undefined,
        metadata: this.analyzeSectionContent(section),
        contentHash: this.generateContentHash(section.content),
        indexedAt,
      };
      documents.push(sectionDoc);
      // Index individual blocks within sections if requested
      if (options?.indexChunks && section.blocks) {
        section.blocks.forEach((block, blockIndex) => {
          const blockDoc = this.createBlockDocument(block, blockIndex, sectionDoc, fileInfo);
          if (blockDoc) {
            documents.push(blockDoc);
          }
        });
      }
    });
    return documents;
  }

  /**
   * Parse markdown content and create searchable documents.
   *
   * The content is parsed with `parseMarkdownIntoPresentation`; each resulting
   * slide becomes a level-1 section and its chunks become that section's blocks.
   * The document title is `fileInfo.name` with a trailing `.md` removed.
   *
   * @param {string} content - Raw markdown text.
   * @param {object} fileInfo - File descriptor; reads `name`, `uri` and `path`.
   * @param {object} [options] - Forwarded to `createSearchDocuments`.
   * @returns {Promise<object[]>} Resolves to the searchable documents.
   */
  async parseAndIndex(content, fileInfo, options) {
    // Use the presentation parser and adapt it to our document structure
    const presentation = (0, markdown_utils_1.parseMarkdownIntoPresentation)(content);
    // Convert presentation to document structure
    const document = {
      content,
      title: fileInfo.name.replace(/\.md$/i, ''),
      sections: presentation.slides.map((slide) => ({
        content: slide.location.content,
        title: slide.title,
        level: 1,
        startLine: slide.location.startLine,
        endLine: slide.location.endLine,
        blocks: slide.chunks,
      })),
    };
    // Drop the slide list early so it can be collected before indexing allocates more
    presentation.slides = [];
    const documents = this.createSearchDocuments(document, fileInfo, options);
    // Drop the intermediate sections once the search documents exist
    document.sections = [];
    return documents;
  }

  /**
   * Analyze document content to extract metadata.
   *
   * @param {object} document - Only `content` is read; a missing value is
   *   treated as empty text.
   * @returns {{hasCode: boolean, hasMermaid: boolean, hasTables: boolean,
   *   hasImages: boolean, hasLinks: boolean, codeLanguages: string[]}}
   *   Feature flags plus the distinct fence languages in first-seen order.
   */
  analyzeDocumentContent(document) {
    const content = String(document.content ?? '');
    // Every ``` counts as code. The info string may contain '+', '#' and '-'
    // so that languages such as c++, c# and objective-c are kept intact.
    const languages = new Set();
    let hasCode = false;
    for (const [, fenceLanguage] of content.matchAll(/```([\w+#-]+)?/g)) {
      hasCode = true;
      if (fenceLanguage) {
        languages.add(fenceLanguage);
      }
    }
    // A table needs a row with two pipes and a delimiter cell. The delimiter
    // may be padded with spaces/tabs ("| --- |"), but may not span lines.
    const hasTableRow = /\|.+\|/.test(content);
    const hasDelimiterRow = /\|[ \t]*[-:]+[ \t]*\|/.test(content);
    return {
      hasCode,
      hasMermaid: /```mermaid/i.test(content),
      hasTables: hasTableRow && hasDelimiterRow,
      hasImages: /!\[.*?\]\(.*?\)/.test(content),
      // The lookbehind excludes image syntax, which starts with '!'
      hasLinks: /(?<!!)\[.*?\]\(.*?\)/.test(content),
      codeLanguages: [...languages],
    };
  }

  /**
   * Analyze section content to extract metadata.
   *
   * @param {object} section - Only `content` is read.
   * @returns {object} Same shape as the result of `analyzeDocumentContent`.
   */
  analyzeSectionContent(section) {
    return this.analyzeDocumentContent({ content: section.content });
  }

  /**
   * Create a searchable document from a content block.
   *
   * @param {object} block - Reads `type`, `content`, `code`, `language`,
   *   `startLine` and `endLine`.
   * @param {number} blockIndex - Position of the block inside its section.
   * @param {object} parentSectionDoc - Section entry that owns the block; reads
   *   `id`, `title`, `sectionIndex` and `location`.
   * @param {object} fileInfo - File descriptor; reads `uri`, `path` and `name`.
   * @returns {object|null} The block entry, or `null` when the block is
   *   missing, of an unknown type, or has no non-blank text content.
   */
  createBlockDocument(block, blockIndex, parentSectionDoc, fileInfo) {
    if (block == null) {
      return null;
    }
    let blockType;
    let content;
    let language;
    let diagramType;
    // Handle chunk types from @a24z/markdown-utils, then the plain type names
    switch (block.type) {
      case markdown_utils_1.CHUNK_TYPES.MERMAID:
      case 'mermaid_chunk':
        blockType = 'mermaid';
        content = block.code || block.content || '';
        diagramType = 'mermaid';
        break;
      case markdown_utils_1.CHUNK_TYPES.CODE:
      case 'code_chunk':
        blockType = 'code';
        content = block.content || block.code || '';
        language = block.language;
        break;
      case markdown_utils_1.CHUNK_TYPES.MARKDOWN:
      case 'markdown_chunk':
        blockType = 'paragraph';
        content = block.content || '';
        break;
      case 'heading':
      case 'list':
      case 'table':
        blockType = block.type;
        content = block.content || '';
        break;
      default:
        // Skip unknown block types
        return null;
    }
    // Don't index empty blocks (or blocks whose content is not text)
    if (typeof content !== 'string' || content.trim() === '') {
      return null;
    }
    const parentLocation = parentSectionDoc.location;
    return {
      // Base SearchableDocument fields
      id: `${parentSectionDoc.id}#block-${blockIndex}`,
      type: blockType,
      parentId: parentSectionDoc.id,
      fileUri: fileInfo.uri || fileInfo.path,
      fileName: fileInfo.name,
      filePath: fileInfo.path,
      content,
      title: this.generateBlockTitle(blockType, content),
      // Location information. `??` rather than `||` so that line 0 on the
      // block is kept instead of being replaced by the section's line.
      location: {
        startLine: block.startLine ?? parentLocation?.startLine ?? 0,
        endLine: block.endLine ?? parentLocation?.endLine ?? 0,
      },
      // Type-specific fields
      sectionIndex: parentSectionDoc.sectionIndex,
      language,
      diagramType,
      // Metadata
      metadata: {
        parentSectionTitle: parentSectionDoc.title || '',
        parentSectionId: parentSectionDoc.id || '',
        blockIndex,
      },
      // Search optimization
      boost: this.getBlockBoost(blockType),
      tags: this.generateBlockTags(blockType, language),
    };
  }

  /**
   * Generate a title for a block document.
   *
   * The title is the first non-blank line of the content, trimmed and cut to
   * at most 50 characters (47 plus "..."), with a type prefix such as "Code: "
   * for the block types that have one.
   *
   * @param {string} blockType - Block type name.
   * @param {string} content - Block text.
   * @returns {string} The generated title.
   */
  generateBlockTitle(blockType, content) {
    const maxLength = 50;
    // Skip leading blank lines so the title is not empty when content starts with one
    const firstLine = String(content ?? '')
      .split('\n')
      .map((line) => line.trim())
      .find((line) => line !== '') ?? '';
    const summary = firstLine.length > maxLength
      ? `${firstLine.substring(0, maxLength - 3)}...`
      : firstLine;
    const prefix = BLOCK_TITLE_PREFIXES.get(blockType) ?? '';
    return `${prefix}${summary}`;
  }

  /**
   * Get boost factor for different block types.
   *
   * @param {string} blockType - Block type name.
   * @returns {number} 1.5 for headings, 1.2 for code, 1.1 for mermaid and
   *   tables, 1.0 otherwise.
   */
  getBlockBoost(blockType) {
    return BLOCK_BOOSTS.get(blockType) ?? 1.0;
  }

  /**
   * Generate tags for block documents.
   *
   * @param {string} blockType - Always the first tag.
   * @param {string} [language] - Appended as a second tag when truthy.
   * @returns {string[]} The tag list.
   */
  generateBlockTags(blockType, language) {
    return language ? [blockType, language] : [blockType];
  }

  /**
   * Generate a simple content hash for change detection.
   *
   * Folds the UTF-16 code units with `hash * 31 + unit` in 32-bit integer
   * arithmetic and returns the absolute value in base 36. The algorithm is
   * deliberately left unchanged so that previously stored hashes stay comparable.
   *
   * @param {string} content - Text to hash.
   * @returns {string} Base-36 hash string ("0" for empty text).
   */
  generateContentHash(content) {
    let hash = 0;
    for (let i = 0; i < content.length; i++) {
      // (hash << 5) - hash === hash * 31; `| 0` wraps to a signed 32-bit integer
      hash = ((hash << 5) - hash + content.charCodeAt(i)) | 0;
    }
    return Math.abs(hash).toString(36);
  }
}
// Publish the class as the module's named CommonJS export.
exports.DocumentIndexer = DocumentIndexer;
//# sourceMappingURL=DocumentIndexer.js.map