UNPKG

@andrejs1979/document

Version:

MongoDB-compatible document database for NoSQL

561 lines 21.8 kB
/**
 * NoSQL - Metadata and Tagging System
 * Intelligent document tagging and metadata management
 */
import { DocumentError } from '../types';

/** Keyword patterns scanned in lower-cased document text by `extractContentTags`. */
const CONTENT_TAG_PATTERNS = [
    // Programming languages
    /\b(javascript|python|java|typescript|rust|go|cpp|php|ruby|swift)\b/g,
    // Technologies
    /\b(react|vue|angular|node|express|django|flask|spring|docker|kubernetes)\b/g,
    // Concepts
    /\b(algorithm|database|api|frontend|backend|devops|security|testing)\b/g,
];

/** Top-level document fields turned into `field:value` tags by `extractMetadataTags`. */
const METADATA_TAG_FIELDS = ['category', 'type', 'status', 'priority', 'department'];

/** Field names whose string values count as free text in `extractText`. */
const TEXT_FIELDS = ['title', 'content', 'description', 'text', 'name', 'summary'];

const DAY_MS = 24 * 60 * 60 * 1000;

/**
 * Build the DocumentError a public method throws when an inner step fails.
 *
 * The message keeps the `<prefix>: <inner message>` shape and the outer error
 * code, so existing callers see the same error. The original error is attached
 * as a non-enumerable `cause`, which keeps inner codes (for example
 * `DOCUMENT_NOT_FOUND` or `INVALID_TAG`) and the original stack reachable.
 *
 * @param {string} prefix - Human-readable description of the failed operation.
 * @param {string} code - Error code passed to DocumentError.
 * @param {unknown} error - The caught value.
 * @returns {DocumentError}
 */
function wrapError(prefix, code, error) {
    const detail = error?.message ?? String(error);
    const wrapped = new DocumentError(`${prefix}: ${detail}`, code);
    // Do not overwrite a `cause` that DocumentError may already define itself.
    if (!('cause' in wrapped)) {
        Object.defineProperty(wrapped, 'cause', {
            value: error,
            writable: true,
            configurable: true,
            enumerable: false,
        });
    }
    return wrapped;
}

/**
 * Advanced tagging and metadata management system
 */
export class TaggingSystem {
    /** Storage backend; this class calls find, findOne, updateOne and countDocuments on it. */
    storage;
    /** Configuration object handed to the constructor (not read inside this class). */
    config;
    /** collection name -> Set of tags applied through `tagDocument`. */
    tagCache = new Map();
    tagHierarchy = new Map(); // tag -> parent tags
    /** tag -> { count, lastUsed }, maintained in memory by `updateTagStats`. */
    tagStats = new Map();

    constructor(storage, config) {
        this.storage = storage;
        this.config = config;
    }

    /**
     * Automatically tag a document based on its content.
     *
     * Tags are collected from the enabled sources ('content', 'metadata',
     * 'filename'; all enabled when `tagSources` is omitted), then from the
     * optional custom tagger and tag mapping, and finally extended with the
     * direct parents registered in the tag hierarchy. In-memory tag statistics
     * are updated for every returned tag.
     *
     * @param {string} collection - Collection name (not used by the extraction itself).
     * @param {object} document - Document to analyse.
     * @param {object} [taggingConfig] - Optional `tagSources`, `customTagger`, `tagMapping`.
     * @returns {Promise<string[]>} The generated tags.
     * @throws {DocumentError} With code `AUTO_TAG_ERROR` on any failure.
     */
    async autoTag(collection, document, taggingConfig = {}) {
        try {
            const tags = new Set();
            const extractors = [
                ['content', () => this.extractContentTags(document)],
                ['metadata', () => this.extractMetadataTags(document)],
                ['filename', () => this.extractFilenameTags(document)],
            ];
            for (const [source, extract] of extractors) {
                // `undefined !== false`: a missing tagSources list enables every source.
                if (taggingConfig.tagSources?.includes(source) !== false) {
                    extract().forEach(tag => tags.add(tag));
                }
            }
            // Apply custom tagger if provided; awaiting also accepts async taggers.
            if (taggingConfig.customTagger) {
                const customTags = await taggingConfig.customTagger(document);
                customTags.forEach(tag => tags.add(tag));
            }
            // Apply tag mapping. Mapped targets are added alongside the source
            // tags; the source tags themselves are not removed.
            if (taggingConfig.tagMapping) {
                const mappedTags = this.applyTagMapping(Array.from(tags), taggingConfig.tagMapping);
                mappedTags.forEach(tag => tags.add(tag));
            }
            // Add hierarchical tags
            const finalTags = await this.addHierarchicalTags(Array.from(tags));
            // Update tag statistics
            this.updateTagStats(finalTags);
            return finalTags;
        }
        catch (error) {
            throw wrapError('Auto-tagging failed', 'AUTO_TAG_ERROR', error);
        }
    }

    /**
     * Apply tags to a document.
     *
     * @param {string} collection - Collection name.
     * @param {*} documentId - Value matched against `_id`.
     * @param {string[]} tags - Tags to apply.
     * @param {object} [options] - `validate: false` skips validation;
     *   `merge: false` replaces the existing tags instead of merging.
     * @returns {Promise<void>}
     * @throws {DocumentError} With code `TAG_DOCUMENT_ERROR`; validation and
     *   not-found failures are available on the error's `cause`.
     */
    async tagDocument(collection, documentId, tags, options = {}) {
        try {
            // Validate tags if requested
            if (options.validate !== false) {
                await this.validateTags(tags);
            }
            // Get existing document
            const document = await this.storage.findOne(collection, { _id: documentId });
            if (!document) {
                throw new DocumentError(`Document ${documentId} not found`, 'DOCUMENT_NOT_FOUND');
            }
            // Merge or replace tags
            const finalTags = this.resolveFinalTags(document.tags, tags, options);
            // Update document
            await this.storage.updateOne(collection, { _id: documentId }, {
                $set: {
                    tags: finalTags,
                    lastTagged: new Date(),
                },
            });
            // Update tag statistics
            this.updateTagStats(finalTags);
            // Cache tags for the collection
            this.cacheCollectionTags(collection, finalTags);
        }
        catch (error) {
            throw wrapError('Failed to tag document', 'TAG_DOCUMENT_ERROR', error);
        }
    }

    /**
     * Remove tags from a document.
     *
     * @param {string} collection - Collection name.
     * @param {*} documentId - Value matched against `_id`.
     * @param {string[]} tagsToRemove - Tags pulled from the document's `tags` array.
     * @returns {Promise<void>}
     * @throws {DocumentError} With code `UNTAG_DOCUMENT_ERROR`.
     */
    async untagDocument(collection, documentId, tagsToRemove) {
        try {
            await this.storage.updateOne(collection, { _id: documentId }, {
                $pullAll: { tags: tagsToRemove },
                $set: { lastTagged: new Date() },
            });
        }
        catch (error) {
            throw wrapError('Failed to untag document', 'UNTAG_DOCUMENT_ERROR', error);
        }
    }

    /**
     * Find documents by tags.
     *
     * @param {string} collection - Collection name.
     * @param {string[]} tags - Tags to search for.
     * @param {object} [options] - `operator: 'and'` uses `$all`, anything else
     *   uses `$in`; `includeHierarchy: false` disables child-tag expansion;
     *   `findOptions` is forwarded to `storage.find`.
     * @returns {Promise<object[]>} Whatever `storage.find` resolves to.
     * @throws {DocumentError} With code `FIND_BY_TAGS_ERROR`.
     */
    async findByTags(collection, tags, options = {}) {
        try {
            let searchTags = [...tags];
            // Include hierarchical tags if requested
            if (options.includeHierarchy !== false) {
                searchTags = await this.expandTagsWithHierarchy(tags);
            }
            // Build query based on operator.
            // NOTE(review): with operator 'and' the expanded child tags are also
            // passed to `$all`, so (assuming MongoDB `$all` semantics) a match
            // must carry every child tag as well. Confirm this is intended, or
            // pass `includeHierarchy: false` together with 'and'.
            const filter = {};
            if (options.operator === 'and') {
                filter.tags = { $all: searchTags };
            }
            else {
                filter.tags = { $in: searchTags };
            }
            return await this.storage.find(collection, filter, options.findOptions);
        }
        catch (error) {
            throw wrapError('Find by tags failed', 'FIND_BY_TAGS_ERROR', error);
        }
    }

    /**
     * Get tag suggestions for a document.
     *
     * Candidates come from `autoTag`, the collection's popular tags and the
     * tags of similar documents; they are scored, filtered and truncated.
     * Because this goes through `autoTag`, the in-memory tag statistics are
     * updated as a side effect.
     *
     * @param {string} collection - Collection name.
     * @param {object} document - Document to suggest tags for.
     * @param {object} [options] - `includePopular`, `includeSimilar` (both on
     *   unless `false`), `threshold` (default 0.1), `limit` (default 20).
     * @returns {Promise<string[]>} Suggested tags, best score first.
     * @throws {DocumentError} With code `TAG_SUGGESTIONS_ERROR`.
     */
    async getTagSuggestions(collection, document, options = {}) {
        try {
            const suggestions = new Set();
            // Auto-generated suggestions
            const autoTags = await this.autoTag(collection, document);
            autoTags.forEach(tag => suggestions.add(tag));
            // Popular tags in collection
            if (options.includePopular !== false) {
                const popularTags = await this.getPopularTags(collection, { limit: 10 });
                popularTags.forEach(tag => suggestions.add(tag));
            }
            // Tags from similar documents
            if (options.includeSimilar !== false) {
                const similarTags = await this.getSimilarDocumentTags(collection, document);
                similarTags.forEach(tag => suggestions.add(tag));
            }
            // Score and sort suggestions
            const scoredSuggestions = await this.scoreSuggestions(Array.from(suggestions), collection, document);
            // `??` so that an explicit threshold of 0 is honoured.
            const threshold = options.threshold ?? 0.1;
            return scoredSuggestions
                .filter(s => s.score >= threshold)
                .slice(0, options.limit || 20)
                .map(s => s.tag);
        }
        catch (error) {
            throw wrapError('Tag suggestions failed', 'TAG_SUGGESTIONS_ERROR', error);
        }
    }

    /**
     * Define tag hierarchy: register `parentTag` as a parent of each child tag
     * and persist the hierarchy.
     *
     * @param {string} parentTag - Parent tag.
     * @param {string[]} childTags - Tags that become children of `parentTag`.
     * @returns {Promise<void>}
     * @throws {DocumentError} With code `TAG_HIERARCHY_ERROR`.
     */
    async defineTagHierarchy(parentTag, childTags) {
        try {
            // Update hierarchy map
            for (const childTag of childTags) {
                const parents = this.tagHierarchy.get(childTag) || [];
                if (!parents.includes(parentTag)) {
                    parents.push(parentTag);
                    this.tagHierarchy.set(childTag, parents);
                }
            }
            // Persist hierarchy
            await this.persistTagHierarchy();
        }
        catch (error) {
            throw wrapError('Failed to define tag hierarchy', 'TAG_HIERARCHY_ERROR', error);
        }
    }

    /**
     * Get tag statistics for a collection.
     *
     * Counts are computed by scanning the collection's documents; `lastUsed`
     * comes from the in-memory statistics and falls back to the current time.
     *
     * @param {string} collection - Collection name.
     * @param {object} [options] - `sortBy` ('count' default, 'name', 'recent')
     *   and `limit` (default 100).
     * @returns {Promise<Array<{tag: string, count: number, lastUsed: Date, percentage: number}>>}
     * @throws {DocumentError} With code `TAG_STATS_ERROR`.
     */
    async getTagStats(collection, options = {}) {
        try {
            const tagCounts = await this.getTagCountsFromDocuments(collection);
            const totalDocuments = await this.storage.countDocuments(collection, {});
            const nowMs = Date.now();
            const stats = Array.from(tagCounts.entries()).map(([tag, count]) => ({
                tag,
                count,
                lastUsed: this.tagStats.get(tag)?.lastUsed || new Date(nowMs),
                percentage: totalDocuments > 0 ? (count / totalDocuments) * 100 : 0,
            }));
            // Sort based on criteria
            stats.sort((a, b) => {
                switch (options.sortBy) {
                    case 'name':
                        return a.tag.localeCompare(b.tag);
                    case 'recent':
                        return b.lastUsed.getTime() - a.lastUsed.getTime();
                    case 'count':
                    default:
                        return b.count - a.count;
                }
            });
            return stats.slice(0, options.limit || 100);
        }
        catch (error) {
            throw wrapError('Tag stats failed', 'TAG_STATS_ERROR', error);
        }
    }

    /**
     * Clean up unused tags.
     *
     * A tag is removed when its document count is below `usageThreshold`
     * (default 1) or its `lastUsed` is earlier than `olderThan` (default
     * 30 days ago).
     *
     * @param {string} collection - Collection name.
     * @param {object} [options] - `usageThreshold`, `olderThan`.
     * @returns {Promise<{removedTags: string[], documentsUpdated: number}>}
     * @throws {DocumentError} With code `TAG_CLEANUP_ERROR`.
     */
    async cleanupUnusedTags(collection, options = {}) {
        try {
            // Ask for every tag: getTagStats defaults to the 100 most-used tags,
            // which would hide exactly the rarely-used tags this method targets.
            const stats = await this.getTagStats(collection, {
                includeUnused: true,
                limit: Number.POSITIVE_INFINITY,
            });
            const threshold = options.usageThreshold ?? 1;
            const cutoffDate = options.olderThan ?? new Date(Date.now() - 30 * DAY_MS); // 30 days ago
            const tagsToRemove = stats
                .filter(stat => stat.count < threshold || stat.lastUsed < cutoffDate)
                .map(stat => stat.tag);
            if (tagsToRemove.length === 0) {
                return { removedTags: [], documentsUpdated: 0 };
            }
            // Remove tags from documents.
            // NOTE(review): the storage API is not visible from this file.
            // `updateMany` is preferred when the storage exposes it because the
            // removal must reach every matching document; otherwise the original
            // `updateOne(..., { multi: true })` call is kept — confirm that the
            // storage layer honours `multi`.
            const filter = { tags: { $in: tagsToRemove } };
            const update = { $pullAll: { tags: tagsToRemove } };
            const updateResult = typeof this.storage.updateMany === 'function'
                ? await this.storage.updateMany(collection, filter, update)
                : await this.storage.updateOne(collection, filter, update, { multi: true });
            // Clean up tag statistics
            for (const tag of tagsToRemove) {
                this.tagStats.delete(tag);
            }
            return {
                removedTags: tagsToRemove,
                documentsUpdated: updateResult?.modifiedCount ?? 0,
            };
        }
        catch (error) {
            throw wrapError('Tag cleanup failed', 'TAG_CLEANUP_ERROR', error);
        }
    }

    /**
     * Bulk tag documents matching a filter.
     *
     * Documents are read in pages of `batchSize` (default 1000) and updated
     * one by one. A failing update aborts the rest of its page, is recorded in
     * `errors`, and processing continues with the next page.
     *
     * @param {string} collection - Collection name.
     * @param {object} filter - Query selecting the documents to tag.
     * @param {string[]} tags - Tags to apply.
     * @param {object} [options] - `batchSize`; `merge: false` replaces tags.
     * @returns {Promise<{documentsUpdated: number, errors: string[]}>}
     * @throws {DocumentError} With code `BULK_TAG_ERROR` when a page cannot be read.
     */
    async bulkTag(collection, filter, tags, options = {}) {
        try {
            const batchSize = options.batchSize || 1000;
            let totalUpdated = 0;
            const errors = [];
            // Process documents in batches.
            // NOTE(review): paging uses `skip`; if `filter` depends on fields this
            // method rewrites (`tags`, `lastTagged`), the matching set may shift
            // between pages — confirm with callers.
            let skip = 0;
            while (true) {
                const documents = await this.storage.find(collection, filter, {
                    skip,
                    limit: batchSize,
                    projection: { _id: 1, tags: 1 },
                });
                if (documents.length === 0) {
                    break;
                }
                try {
                    // This would use the bulk operations manager
                    // For now, we'll update individually
                    for (const doc of documents) {
                        const finalTags = this.resolveFinalTags(doc.tags, tags, options);
                        await this.storage.updateOne(collection, { _id: doc._id }, {
                            $set: {
                                tags: finalTags,
                                lastTagged: new Date(),
                            },
                        });
                        totalUpdated++;
                    }
                }
                catch (error) {
                    errors.push(`Batch error: ${error.message}`);
                }
                skip += batchSize;
            }
            return { documentsUpdated: totalUpdated, errors };
        }
        catch (error) {
            throw wrapError('Bulk tagging failed', 'BULK_TAG_ERROR', error);
        }
    }

    // ===============================
    // Private Methods
    // ===============================

    /**
     * Compute the tag list to store on a document.
     * Returns `tags` unchanged when `options.merge` is `false`; otherwise the
     * de-duplicated union of the existing tags and `tags`. A non-array
     * `existingTags` value is treated as "no tags", matching the
     * `Array.isArray` checks used elsewhere in this class.
     */
    resolveFinalTags(existingTags, tags, options) {
        if (options.merge === false) {
            return tags;
        }
        const current = Array.isArray(existingTags) ? existingTags : [];
        return [...new Set([...current, ...tags])];
    }

    /**
     * Extract tags from the document's text: known keywords, `#hashtags`
     * (without the `#`) and `@mentions` (as `user:<name>`). Text is
     * lower-cased before matching.
     */
    extractContentTags(document) {
        const tags = new Set();
        // Extract tags from text content
        const text = this.extractText(document).toLowerCase();
        // Common tag patterns
        for (const pattern of CONTENT_TAG_PATTERNS) {
            const matches = text.match(pattern);
            if (matches) {
                matches.forEach(match => tags.add(match));
            }
        }
        // Extract hashtags
        const hashtags = text.match(/#(\w+)/g);
        if (hashtags) {
            hashtags.forEach(tag => tags.add(tag.substring(1)));
        }
        // Extract @mentions as tags
        const mentions = text.match(/@(\w+)/g);
        if (mentions) {
            mentions.forEach(mention => tags.add(`user:${mention.substring(1)}`));
        }
        return Array.from(tags);
    }

    /**
     * Extract `field:value` tags from well-known metadata fields and
     * `year:` / `month:` tags from `_createdAt`.
     */
    extractMetadataTags(document) {
        const tags = new Set();
        // Extract from common metadata fields
        for (const field of METADATA_TAG_FIELDS) {
            const value = document[field];
            if (typeof value === 'string') {
                tags.add(`${field}:${value.toLowerCase()}`);
            }
            else if (Array.isArray(value)) {
                value.forEach(v => {
                    if (typeof v === 'string') {
                        tags.add(`${field}:${v.toLowerCase()}`);
                    }
                });
            }
        }
        // Extract date-based tags
        if (document._createdAt) {
            const date = new Date(document._createdAt);
            // Skip unparseable dates instead of emitting `year:NaN` tags.
            if (!Number.isNaN(date.getTime())) {
                const year = date.getFullYear();
                const month = String(date.getMonth() + 1).padStart(2, '0');
                tags.add(`year:${year}`);
                tags.add(`month:${year}-${month}`);
            }
        }
        return Array.from(tags);
    }

    /**
     * Extract an `ext:<extension>` tag and word tags (longer than two
     * characters) from the first string found in `filename`, `name` or `title`.
     */
    extractFilenameTags(document) {
        const tags = new Set();
        const filename = document.filename || document.name || document.title;
        if (typeof filename === 'string') {
            // Extract file extension: the text after the last dot, if any.
            const lastDot = filename.lastIndexOf('.');
            if (lastDot !== -1) {
                const extension = filename.slice(lastDot + 1).toLowerCase();
                // Titles such as "Report. Final draft" contain dots too; only
                // accept something that looks like an extension.
                if (/^[a-z0-9_-]+$/.test(extension)) {
                    tags.add(`ext:${extension}`);
                }
            }
            // Extract words from filename
            const words = filename
                .replace(/[^a-zA-Z0-9]/g, ' ')
                .split(' ')
                .filter(word => word.length > 2)
                .map(word => word.toLowerCase());
            words.forEach(word => tags.add(word));
        }
        return Array.from(tags);
    }

    /**
     * Translate tags through `mapping` (tag -> replacement tag or tags).
     * Tags without a mapping are returned unchanged.
     */
    applyTagMapping(tags, mapping) {
        const mappedTags = new Set();
        for (const tag of tags) {
            // Own-property lookup: a tag such as "constructor" must not resolve
            // to something inherited from Object.prototype.
            const targets = Object.hasOwn(mapping, tag) ? mapping[tag] : undefined;
            if (!targets) {
                mappedTags.add(tag);
            }
            else if (typeof targets === 'string') {
                mappedTags.add(targets);
            }
            else {
                targets.forEach(mappedTag => mappedTags.add(mappedTag));
            }
        }
        return Array.from(mappedTags);
    }

    /**
     * Return `tags` plus the direct parents registered for each of them.
     * Only one level of the hierarchy is followed.
     */
    async addHierarchicalTags(tags) {
        const allTags = new Set(tags);
        for (const tag of tags) {
            const parents = this.tagHierarchy.get(tag);
            if (parents) {
                parents.forEach(parent => allTags.add(parent));
            }
        }
        return Array.from(allTags);
    }

    /**
     * Return `tags` plus every tag that has one of them as a direct parent.
     * Only one level of the hierarchy is followed.
     */
    async expandTagsWithHierarchy(tags) {
        const requested = new Set(tags);
        const expandedTags = new Set(tags);
        // Add child tags for each parent tag
        for (const [childTag, parents] of this.tagHierarchy.entries()) {
            if (parents.some(parent => requested.has(parent))) {
                expandedTags.add(childTag);
            }
        }
        return Array.from(expandedTags);
    }

    /** Increment the in-memory usage count of each tag and stamp it as used now. */
    updateTagStats(tags) {
        const now = new Date();
        for (const tag of tags) {
            const current = this.tagStats.get(tag) || { count: 0, lastUsed: now };
            this.tagStats.set(tag, {
                count: current.count + 1,
                lastUsed: now,
            });
        }
    }

    /** Add `tags` to the cached tag set of `collection`. */
    cacheCollectionTags(collection, tags) {
        const cached = this.tagCache.get(collection) || new Set();
        tags.forEach(tag => cached.add(tag));
        this.tagCache.set(collection, cached);
    }

    /**
     * Validate tags: each must be a non-blank string of at most 50 characters
     * made of letters, digits, `:`, `.`, `_` or `-`.
     *
     * @throws {DocumentError} `INVALID_TAG`, `TAG_TOO_LONG` or `INVALID_TAG_FORMAT`.
     */
    async validateTags(tags) {
        for (const tag of tags) {
            if (typeof tag !== 'string' || tag.trim().length === 0) {
                throw new DocumentError(`Invalid tag: '${tag}'`, 'INVALID_TAG');
            }
            if (tag.length > 50) {
                throw new DocumentError(`Tag too long: '${tag}' (max 50 characters)`, 'TAG_TOO_LONG');
            }
            if (!/^[a-zA-Z0-9:._-]+$/.test(tag)) {
                throw new DocumentError(`Invalid tag format: '${tag}'`, 'INVALID_TAG_FORMAT');
            }
        }
    }

    /** Return the most-used tags of a collection (default 10), most used first. */
    async getPopularTags(collection, options = {}) {
        const stats = await this.getTagStats(collection, {
            limit: options.limit || 10,
            sortBy: 'count',
        });
        return stats.map(stat => stat.tag);
    }

    /**
     * Collect the tags of up to 10 documents sharing the document's
     * `category` and/or `type`.
     * NOTE(review): when the document has neither field the filter is empty,
     * so the tags come from whichever 10 documents the storage returns.
     */
    async getSimilarDocumentTags(collection, document) {
        // Find documents with similar content/metadata
        const filter = {};
        if (document.category) {
            filter.category = document.category;
        }
        if (document.type) {
            filter.type = document.type;
        }
        const similarDocs = await this.storage.find(collection, filter, { limit: 10 });
        const tags = new Set();
        for (const doc of similarDocs) {
            if (doc.tags && Array.isArray(doc.tags)) {
                doc.tags.forEach(tag => tags.add(tag));
            }
        }
        return Array.from(tags);
    }

    /**
     * Score suggested tags and return them sorted by descending score.
     * Each tag starts at 0.5 and gains up to 0.3 for popularity, 0.2 when used
     * within the last 7 days and 0.3 when it occurs in the document text; the
     * total is capped at 1.0.
     *
     * @returns {Promise<Array<{tag: string, score: number}>>}
     */
    async scoreSuggestions(suggestions, collection, document) {
        // Both values are the same for every tag, so compute them once.
        const text = this.extractText(document).toLowerCase();
        const recentCutoff = new Date(Date.now() - 7 * DAY_MS);
        const scored = suggestions.map(tag => {
            let score = 0.5; // Base score
            const stats = this.tagStats.get(tag);
            if (stats) {
                // Boost score based on tag popularity
                score += Math.min(stats.count / 100, 0.3); // Max 0.3 boost
                // Boost score for recent usage
                if (stats.lastUsed > recentCutoff) {
                    score += 0.2;
                }
            }
            // Boost score for content relevance
            if (text.includes(tag.toLowerCase())) {
                score += 0.3;
            }
            return { tag, score: Math.min(score, 1.0) };
        });
        return scored.sort((a, b) => b.score - a.score);
    }

    /**
     * Concatenate the document's free-text values, separated by spaces.
     * A string value counts when its key is one of the known text fields or
     * contains "text". Nested plain objects are searched up to three levels
     * below the document; arrays are not searched.
     */
    extractText(document) {
        const texts = [];
        const extractTextRecursive = (obj, depth = 0) => {
            if (depth > 3 || obj === null || typeof obj !== 'object') {
                return;
            }
            for (const [key, value] of Object.entries(obj)) {
                if (typeof value === 'string' && (TEXT_FIELDS.includes(key) || key.includes('text'))) {
                    texts.push(value);
                }
                else if (typeof value === 'object' && value !== null && !Array.isArray(value)) {
                    extractTextRecursive(value, depth + 1);
                }
            }
        };
        extractTextRecursive(document);
        return texts.join(' ');
    }

    /**
     * Count, for every tag, the number of documents in the collection that
     * carry it.
     *
     * @returns {Promise<Map<string, number>>} tag -> document count.
     */
    async getTagCountsFromDocuments(collection) {
        // Simplified implementation - in a real scenario, this would use aggregation.
        // Only `tags` is read, so request just that field (same `projection`
        // option that bulkTag passes to storage.find).
        const documents = await this.storage.find(collection, { tags: { $exists: true } }, { projection: { tags: 1 } });
        const tagCounts = new Map();
        for (const doc of documents) {
            if (doc.tags && Array.isArray(doc.tags)) {
                // De-duplicate per document so a repeated tag counts once and
                // the derived percentage cannot exceed 100.
                for (const tag of new Set(doc.tags)) {
                    tagCounts.set(tag, (tagCounts.get(tag) || 0) + 1);
                }
            }
        }
        return tagCounts;
    }

    /** Map a `sortBy` option to a `$sort` specification for an aggregation pipeline. */
    getSortCriteria(sortBy) {
        switch (sortBy) {
            case 'name':
                return { _id: 1 };
            case 'recent':
                return { lastUsed: -1 };
            case 'count':
            default:
                return { count: -1 };
        }
    }

    /**
     * Store the tag hierarchy in the `_system_metadata` collection.
     * Best effort: a storage failure is logged as a warning and not rethrown.
     */
    async persistTagHierarchy() {
        // Store tag hierarchy in the database
        const hierarchyDoc = {
            _id: 'tag_hierarchy',
            hierarchy: Object.fromEntries(this.tagHierarchy),
            updatedAt: new Date(),
        };
        try {
            await this.storage.updateOne('_system_metadata', { _id: 'tag_hierarchy' }, { $set: hierarchyDoc }, { upsert: true });
        }
        catch (error) {
            console.warn('Failed to persist tag hierarchy:', error.message);
        }
    }
}
//# sourceMappingURL=tagging-system.js.map