// Package: @andrejs1979/document
// Version: (not given in the listing)
// Description: MongoDB-compatible document database for NoSQL
// Listing size: 561 lines • 21.8 kB
// Language: JavaScript
/**
* NoSQL - Metadata and Tagging System
* Intelligent document tagging and metadata management
*/
import { DocumentError } from '../types';
/**
 * Vocabulary scanned for in lower-cased document text. The patterns carry the
 * `g` flag and are only ever used with `String.prototype.match`, which resets
 * `lastIndex` itself, so sharing them at module level is safe.
 */
const CONTENT_TAG_PATTERNS = [
  // Programming languages
  /\b(javascript|python|java|typescript|rust|go|cpp|php|ruby|swift)\b/g,
  // Technologies
  /\b(react|vue|angular|node|express|django|flask|spring|docker|kubernetes)\b/g,
  // Concepts
  /\b(algorithm|database|api|frontend|backend|devops|security|testing)\b/g,
];
const HASHTAG_PATTERN = /#(\w+)/g;
const MENTION_PATTERN = /@(\w+)/g;
const METADATA_TAG_FIELDS = ['category', 'type', 'status', 'priority', 'department'];
const TEXT_FIELDS = ['title', 'content', 'description', 'text', 'name', 'summary'];
const MAX_TEXT_DEPTH = 3;
const MAX_TAG_LENGTH = 50;
const TAG_FORMAT = /^[a-zA-Z0-9:._-]+$/;
const FILE_EXTENSION_FORMAT = /^[a-z0-9]+$/;
const DAY_MS = 24 * 60 * 60 * 1000;

/**
 * Build the DocumentError thrown by the public methods.
 * The message and code are what callers have always seen; the underlying
 * error is additionally attached as `cause` so it is no longer lost.
 */
function wrapError(prefix, code, error) {
  const detail = error?.message ?? String(error);
  const wrapped = new DocumentError(`${prefix}: ${detail}`, code);
  if (wrapped.cause === undefined && Object.isExtensible(wrapped)) {
    wrapped.cause = error;
  }
  return wrapped;
}

/** Invert a `child -> parents` map into a `parent -> children` map. */
function indexChildrenByParent(hierarchy) {
  const childrenByParent = new Map();
  for (const [child, parents] of hierarchy) {
    for (const parent of parents) {
      const children = childrenByParent.get(parent);
      if (children) {
        children.push(child);
      } else {
        childrenByParent.set(parent, [child]);
      }
    }
  }
  return childrenByParent;
}

/**
 * Return `roots` followed by every tag reachable below them (breadth-first).
 * The `seen` set makes the walk terminate even if the hierarchy has a cycle.
 */
function collectDescendants(roots, childrenByParent) {
  const seen = new Set(roots);
  const queue = [...seen];
  for (let i = 0; i < queue.length; i++) {
    for (const child of childrenByParent.get(queue[i]) ?? []) {
      if (!seen.has(child)) {
        seen.add(child);
        queue.push(child);
      }
    }
  }
  return queue;
}

/**
 * Tagging and metadata management on top of a document storage backend.
 *
 * The instance keeps three in-memory maps: `tagCache` (collection -> Set of
 * tags written through `tagDocument`), `tagHierarchy` (tag -> array of parent
 * tags) and `tagStats` (tag -> `{ count, lastUsed }`).
 */
export class TaggingSystem {
  storage;
  config;
  tagCache = new Map();
  tagHierarchy = new Map(); // tag -> parent tags
  tagStats = new Map();

  /**
   * @param {object} storage - Backend exposing the `find`, `findOne`,
   *   `updateOne` and `countDocuments` calls used below.
   * @param {object} config - Stored on the instance; not read by this class.
   */
  constructor(storage, config) {
    this.storage = storage;
    this.config = config;
  }

  /**
   * Derive tags for a document from its content, metadata and filename.
   *
   * @param {string} collection - Not used by the extraction itself.
   * @param {object} document - Document to analyse.
   * @param {object} [taggingConfig]
   * @param {string[]} [taggingConfig.tagSources] - Subset of 'content',
   *   'metadata', 'filename'; every source runs when omitted.
   * @param {Function} [taggingConfig.customTagger] - Receives the document and
   *   returns (or resolves to) extra tags.
   * @param {Object<string, string[]>} [taggingConfig.tagMapping] - Extra tags
   *   to add for a given tag; the original tag is kept as well.
   * @returns {Promise<string[]>} Tags including ancestors from the hierarchy.
   * @throws {DocumentError} code `AUTO_TAG_ERROR`.
   */
  async autoTag(collection, document, taggingConfig = {}) {
    try {
      const tags = new Set();
      const sources = taggingConfig.tagSources;
      // A source is skipped only when a source list is given and omits it.
      const isSourceEnabled = (source) => sources?.includes(source) !== false;

      if (isSourceEnabled('content')) {
        this.extractContentTags(document).forEach((tag) => tags.add(tag));
      }
      if (isSourceEnabled('metadata')) {
        this.extractMetadataTags(document).forEach((tag) => tags.add(tag));
      }
      if (isSourceEnabled('filename')) {
        this.extractFilenameTags(document).forEach((tag) => tags.add(tag));
      }

      if (typeof taggingConfig.customTagger === 'function') {
        // Awaited so both synchronous and async taggers work.
        const customTags = await taggingConfig.customTagger(document);
        for (const tag of customTags ?? []) {
          tags.add(tag);
        }
      }

      if (taggingConfig.tagMapping) {
        const mappedTags = this.applyTagMapping(Array.from(tags), taggingConfig.tagMapping);
        mappedTags.forEach((tag) => tags.add(tag));
      }

      const finalTags = await this.addHierarchicalTags(Array.from(tags));
      this.updateTagStats(finalTags);
      return finalTags;
    } catch (error) {
      throw wrapError('Auto-tagging failed', 'AUTO_TAG_ERROR', error);
    }
  }

  /**
   * Write tags onto a stored document and stamp `lastTagged`.
   *
   * @param {string} collection
   * @param {*} documentId - Matched against `_id`.
   * @param {string[]} tags
   * @param {object} [options]
   * @param {boolean} [options.validate=true] - Run `validateTags` first.
   * @param {boolean} [options.merge=true] - Union with the document's existing
   *   tags; `false` replaces them.
   * @returns {Promise<void>}
   * @throws {DocumentError} code `TAG_DOCUMENT_ERROR`; validation and
   *   not-found failures are wrapped and available as `cause`.
   */
  async tagDocument(collection, documentId, tags, options = {}) {
    try {
      if (options.validate !== false) {
        await this.validateTags(tags);
      }
      const document = await this.storage.findOne(collection, { _id: documentId });
      if (!document) {
        throw new DocumentError(`Document ${documentId} not found`, 'DOCUMENT_NOT_FOUND');
      }
      const finalTags = options.merge !== false
        ? [...new Set([...(document.tags || []), ...tags])]
        : tags;
      await this.storage.updateOne(collection, { _id: documentId }, {
        $set: {
          tags: finalTags,
          lastTagged: new Date(),
        },
      });
      this.updateTagStats(finalTags);
      this.cacheCollectionTags(collection, finalTags);
    } catch (error) {
      throw wrapError('Failed to tag document', 'TAG_DOCUMENT_ERROR', error);
    }
  }

  /**
   * Remove the given tags from a document and stamp `lastTagged`.
   *
   * @param {string} collection
   * @param {*} documentId - Matched against `_id`.
   * @param {string[]} tagsToRemove
   * @returns {Promise<void>}
   * @throws {DocumentError} code `UNTAG_DOCUMENT_ERROR`.
   */
  async untagDocument(collection, documentId, tagsToRemove) {
    try {
      await this.storage.updateOne(collection, { _id: documentId }, {
        $pullAll: { tags: tagsToRemove },
        $set: { lastTagged: new Date() },
      });
    } catch (error) {
      throw wrapError('Failed to untag document', 'UNTAG_DOCUMENT_ERROR', error);
    }
  }

  /**
   * Find documents carrying the given tags.
   *
   * With hierarchy enabled a requested tag is satisfied by the tag itself or
   * any of its descendants. For `operator: 'and'` that rule is applied per
   * requested tag: the document must satisfy every requested tag, but is NOT
   * required to carry every descendant.
   *
   * @param {string} collection
   * @param {string[]} tags
   * @param {object} [options]
   * @param {boolean} [options.includeHierarchy=true]
   * @param {'and'|'or'} [options.operator] - Anything other than 'and' means
   *   "any of".
   * @param {object} [options.findOptions] - Passed through to `storage.find`.
   * @returns {Promise<object[]>} Whatever `storage.find` returns.
   * @throws {DocumentError} code `FIND_BY_TAGS_ERROR`.
   */
  async findByTags(collection, tags, options = {}) {
    try {
      const requested = [...tags];
      const useHierarchy = options.includeHierarchy !== false;
      let filter;
      if (options.operator === 'and') {
        const childrenByParent = useHierarchy
          ? indexChildrenByParent(this.tagHierarchy)
          : new Map();
        const alternatives = requested.map((tag) => collectDescendants([tag], childrenByParent));
        // Keep the plain $all form when no requested tag has descendants.
        filter = alternatives.every((group) => group.length === 1)
          ? { tags: { $all: requested } }
          : { $and: alternatives.map((group) => ({ tags: { $in: group } })) };
      } else {
        const searchTags = useHierarchy
          ? await this.expandTagsWithHierarchy(requested)
          : requested;
        filter = { tags: { $in: searchTags } };
      }
      return await this.storage.find(collection, filter, options.findOptions);
    } catch (error) {
      throw wrapError('Find by tags failed', 'FIND_BY_TAGS_ERROR', error);
    }
  }

  /**
   * Suggest tags for a document by combining auto-generated tags, the
   * collection's most used tags and tags of documents sharing its
   * category/type, then ranking them with `scoreSuggestions`.
   *
   * @param {string} collection
   * @param {object} document
   * @param {object} [options]
   * @param {boolean} [options.includePopular=true]
   * @param {boolean} [options.includeSimilar=true]
   * @param {number} [options.threshold=0.1] - Minimum score; `0` is honoured.
   * @param {number} [options.limit=20]
   * @returns {Promise<string[]>} Tags ordered by descending score.
   * @throws {DocumentError} code `TAG_SUGGESTIONS_ERROR`.
   */
  async getTagSuggestions(collection, document, options = {}) {
    try {
      const suggestions = new Set();
      const autoTags = await this.autoTag(collection, document);
      autoTags.forEach((tag) => suggestions.add(tag));
      if (options.includePopular !== false) {
        const popularTags = await this.getPopularTags(collection, { limit: 10 });
        popularTags.forEach((tag) => suggestions.add(tag));
      }
      if (options.includeSimilar !== false) {
        const similarTags = await this.getSimilarDocumentTags(collection, document);
        similarTags.forEach((tag) => suggestions.add(tag));
      }
      const scoredSuggestions = await this.scoreSuggestions(Array.from(suggestions), collection, document);
      const threshold = options.threshold ?? 0.1;
      return scoredSuggestions
        .filter((suggestion) => suggestion.score >= threshold)
        .slice(0, options.limit || 20)
        .map((suggestion) => suggestion.tag);
    } catch (error) {
      throw wrapError('Tag suggestions failed', 'TAG_SUGGESTIONS_ERROR', error);
    }
  }

  /**
   * Register `parentTag` as a parent of each child tag and persist the map.
   *
   * @param {string} parentTag
   * @param {string[]} childTags
   * @returns {Promise<void>}
   * @throws {DocumentError} code `TAG_HIERARCHY_ERROR`.
   */
  async defineTagHierarchy(parentTag, childTags) {
    try {
      for (const childTag of childTags) {
        // A tag cannot be its own parent.
        if (childTag === parentTag) {
          continue;
        }
        const parents = this.tagHierarchy.get(childTag) || [];
        if (!parents.includes(parentTag)) {
          parents.push(parentTag);
          this.tagHierarchy.set(childTag, parents);
        }
      }
      await this.persistTagHierarchy();
    } catch (error) {
      throw wrapError('Failed to define tag hierarchy', 'TAG_HIERARCHY_ERROR', error);
    }
  }

  /**
   * Count tag usage across the documents of a collection.
   *
   * Counts come from scanning the documents; `lastUsed` comes from the
   * in-memory `tagStats` map and falls back to the current time for tags this
   * instance has not recorded.
   *
   * @param {string} collection
   * @param {object} [options]
   * @param {'count'|'name'|'recent'} [options.sortBy='count']
   * @param {number} [options.limit=100]
   * @returns {Promise<Array<{tag: string, count: number, lastUsed: Date, percentage: number}>>}
   * @throws {DocumentError} code `TAG_STATS_ERROR`.
   */
  async getTagStats(collection, options = {}) {
    try {
      const [tagCounts, totalDocuments] = await Promise.all([
        this.getTagCountsFromDocuments(collection),
        this.storage.countDocuments(collection, {}),
      ]);
      const stats = Array.from(tagCounts.entries()).map(([tag, count]) => ({
        tag,
        count,
        lastUsed: this.tagStats.get(tag)?.lastUsed || new Date(),
        percentage: totalDocuments > 0 ? (count / totalDocuments) * 100 : 0,
      }));
      stats.sort((a, b) => {
        switch (options.sortBy) {
          case 'name':
            return a.tag.localeCompare(b.tag);
          case 'recent':
            return b.lastUsed.getTime() - a.lastUsed.getTime();
          case 'count':
          default:
            return b.count - a.count;
        }
      });
      return stats.slice(0, options.limit || 100);
    } catch (error) {
      throw wrapError('Tag stats failed', 'TAG_STATS_ERROR', error);
    }
  }

  /**
   * Remove rarely used or stale tags from every document of a collection.
   *
   * @param {string} collection
   * @param {object} [options]
   * @param {number} [options.usageThreshold=1] - Tags used fewer times than
   *   this are removed.
   * @param {Date} [options.olderThan] - Tags last used before this date are
   *   removed; defaults to 30 days ago.
   * @returns {Promise<{removedTags: string[], documentsUpdated: number}>}
   * @throws {DocumentError} code `TAG_CLEANUP_ERROR`.
   */
  async cleanupUnusedTags(collection, options = {}) {
    try {
      // Ask for every tag: the default limit keeps only the 100 most used
      // ones, which would hide exactly the rare tags this method looks for.
      const stats = await this.getTagStats(collection, { includeUnused: true, limit: Infinity });
      const threshold = options.usageThreshold ?? 1;
      const cutoffDate = options.olderThan || new Date(Date.now() - 30 * DAY_MS);
      const tagsToRemove = stats
        .filter((stat) => stat.count < threshold || stat.lastUsed < cutoffDate)
        .map((stat) => stat.tag);
      if (tagsToRemove.length === 0) {
        return { removedTags: [], documentsUpdated: 0 };
      }

      const filter = { tags: { $in: tagsToRemove } };
      const update = { $pullAll: { tags: tagsToRemove } };
      // Prefer a true multi-document update when the backend offers one;
      // otherwise keep the original updateOne + { multi: true } call.
      const updateResult = typeof this.storage.updateMany === 'function'
        ? await this.storage.updateMany(collection, filter, update)
        : await this.storage.updateOne(collection, filter, update, { multi: true });

      const cachedTags = this.tagCache.get(collection);
      for (const tag of tagsToRemove) {
        this.tagStats.delete(tag);
        cachedTags?.delete(tag);
      }
      return {
        removedTags: tagsToRemove,
        documentsUpdated: updateResult?.modifiedCount ?? 0,
      };
    } catch (error) {
      throw wrapError('Tag cleanup failed', 'TAG_CLEANUP_ERROR', error);
    }
  }

  /**
   * Apply tags to every document matching a filter.
   *
   * Matching documents are listed first and updated afterwards. Updating
   * while paging with `skip` would shift the result set whenever the filter
   * depends on `tags` or `lastTagged`, silently skipping documents. The
   * trade-off is that `_id` and `tags` of all matches are held in memory.
   *
   * @param {string} collection
   * @param {object} filter - Passed to `storage.find`.
   * @param {string[]} tags
   * @param {object} [options]
   * @param {number} [options.batchSize=1000] - Page size for listing.
   * @param {boolean} [options.merge=true] - Union with existing tags; `false`
   *   replaces them.
   * @returns {Promise<{documentsUpdated: number, errors: string[]}>} One
   *   error string per document whose update failed.
   * @throws {DocumentError} code `BULK_TAG_ERROR` when listing fails.
   */
  async bulkTag(collection, filter, tags, options = {}) {
    try {
      const batchSize = Number.isInteger(options.batchSize) && options.batchSize > 0
        ? options.batchSize
        : 1000;
      const shouldMerge = options.merge !== false;

      // Phase 1: read-only snapshot of the targets.
      const targets = [];
      let skip = 0;
      while (true) {
        const page = await this.storage.find(collection, filter, {
          skip,
          limit: batchSize,
          projection: { _id: 1, tags: 1 },
        });
        if (page.length === 0) {
          break;
        }
        targets.push(...page);
        // Advance by what was actually returned in case the backend caps pages.
        skip += page.length;
      }

      // Phase 2: update each target; one failure does not stop the rest.
      let totalUpdated = 0;
      const errors = [];
      for (const doc of targets) {
        const existing = Array.isArray(doc.tags) ? doc.tags : [];
        const finalTags = shouldMerge ? [...new Set([...existing, ...tags])] : tags;
        try {
          await this.storage.updateOne(collection, { _id: doc._id }, {
            $set: {
              tags: finalTags,
              lastTagged: new Date(),
            },
          });
          totalUpdated++;
        } catch (error) {
          errors.push(`Batch error: document ${doc._id}: ${error?.message ?? error}`);
        }
      }
      return { documentsUpdated: totalUpdated, errors };
    } catch (error) {
      throw wrapError('Bulk tagging failed', 'BULK_TAG_ERROR', error);
    }
  }

  // ===============================
  // Private Methods
  // ===============================

  /**
   * Tags found in the document's text: known vocabulary words, `#hashtags`
   * (without the `#`) and `@mentions` (as `user:<name>`).
   */
  extractContentTags(document) {
    const tags = new Set();
    const text = this.extractText(document).toLowerCase();
    for (const pattern of CONTENT_TAG_PATTERNS) {
      for (const match of text.match(pattern) ?? []) {
        tags.add(match);
      }
    }
    for (const hashtag of text.match(HASHTAG_PATTERN) ?? []) {
      tags.add(hashtag.substring(1));
    }
    for (const mention of text.match(MENTION_PATTERN) ?? []) {
      tags.add(`user:${mention.substring(1)}`);
    }
    return Array.from(tags);
  }

  /**
   * `field:value` tags for the common metadata fields plus `year:` / `month:`
   * tags from `_createdAt`. An unparseable `_createdAt` is ignored rather
   * than yielding `year:NaN`.
   */
  extractMetadataTags(document) {
    const tags = new Set();
    for (const field of METADATA_TAG_FIELDS) {
      const value = document[field];
      const values = Array.isArray(value) ? value : [value];
      for (const entry of values) {
        if (typeof entry === 'string') {
          tags.add(`${field}:${entry.toLowerCase()}`);
        }
      }
    }
    if (document._createdAt) {
      const date = new Date(document._createdAt);
      if (!Number.isNaN(date.getTime())) {
        const year = date.getFullYear();
        const month = String(date.getMonth() + 1).padStart(2, '0');
        tags.add(`year:${year}`);
        tags.add(`month:${year}-${month}`);
      }
    }
    return Array.from(tags);
  }

  /**
   * Tags from `filename` (falling back to `name`, then `title`): an
   * `ext:<extension>` tag plus every word longer than two characters.
   * The extension must follow a non-leading dot and be purely alphanumeric,
   * so a title like "Version 2.0 released" does not produce an `ext:` tag.
   */
  extractFilenameTags(document) {
    const filename = document.filename || document.name || document.title;
    if (typeof filename !== 'string') {
      return [];
    }
    const tags = new Set();
    const lastDot = filename.lastIndexOf('.');
    if (lastDot > 0) {
      const extension = filename.slice(lastDot + 1).toLowerCase();
      if (FILE_EXTENSION_FORMAT.test(extension)) {
        tags.add(`ext:${extension}`);
      }
    }
    filename
      .replace(/[^a-zA-Z0-9]/g, ' ')
      .split(' ')
      .filter((word) => word.length > 2)
      .forEach((word) => tags.add(word.toLowerCase()));
    return Array.from(tags);
  }

  /**
   * Replace each tag that has an entry in `mapping` with the mapped tags;
   * tags without an entry pass through. Only own properties of `mapping`
   * count, so tags such as "constructor" never hit `Object.prototype`.
   */
  applyTagMapping(tags, mapping) {
    const mappedTags = new Set();
    for (const tag of tags) {
      const replacement = Object.prototype.hasOwnProperty.call(mapping, tag)
        ? mapping[tag]
        : undefined;
      if (typeof replacement === 'string' && replacement.length > 0) {
        mappedTags.add(replacement);
      } else if (typeof replacement?.forEach === 'function') {
        replacement.forEach((mappedTag) => mappedTags.add(mappedTag));
      } else {
        mappedTags.add(tag);
      }
    }
    return Array.from(mappedTags);
  }

  /**
   * Return `tags` plus every ancestor from the hierarchy (parents,
   * grandparents, ...). Already-seen tags are not revisited, so a cyclic
   * hierarchy cannot loop forever.
   */
  async addHierarchicalTags(tags) {
    const allTags = new Set(tags);
    const queue = [...allTags];
    for (let i = 0; i < queue.length; i++) {
      for (const parent of this.tagHierarchy.get(queue[i]) ?? []) {
        if (!allTags.has(parent)) {
          allTags.add(parent);
          queue.push(parent);
        }
      }
    }
    return queue;
  }

  /** Return `tags` plus every descendant tag from the hierarchy. */
  async expandTagsWithHierarchy(tags) {
    return collectDescendants(tags, indexChildrenByParent(this.tagHierarchy));
  }

  /** Increment the in-memory use count of each tag and set `lastUsed` to now. */
  updateTagStats(tags) {
    const now = new Date();
    for (const tag of tags) {
      const previousCount = this.tagStats.get(tag)?.count ?? 0;
      this.tagStats.set(tag, { count: previousCount + 1, lastUsed: now });
    }
  }

  /** Add tags to the in-memory set of tags known for a collection. */
  cacheCollectionTags(collection, tags) {
    const cached = this.tagCache.get(collection) || new Set();
    tags.forEach((tag) => cached.add(tag));
    this.tagCache.set(collection, cached);
  }

  /**
   * Reject tags that are not non-empty strings, exceed 50 characters, or
   * contain characters outside `[a-zA-Z0-9:._-]`.
   *
   * @throws {DocumentError} `INVALID_TAG`, `TAG_TOO_LONG` or `INVALID_TAG_FORMAT`.
   */
  async validateTags(tags) {
    for (const tag of tags) {
      if (typeof tag !== 'string' || tag.trim().length === 0) {
        throw new DocumentError(`Invalid tag: '${tag}'`, 'INVALID_TAG');
      }
      if (tag.length > MAX_TAG_LENGTH) {
        throw new DocumentError(`Tag too long: '${tag}' (max 50 characters)`, 'TAG_TOO_LONG');
      }
      if (!TAG_FORMAT.test(tag)) {
        throw new DocumentError(`Invalid tag format: '${tag}'`, 'INVALID_TAG_FORMAT');
      }
    }
  }

  /** Names of the most used tags in a collection (default: top 10). */
  async getPopularTags(collection, options = {}) {
    const stats = await this.getTagStats(collection, {
      limit: options.limit || 10,
      sortBy: 'count',
    });
    return stats.map((stat) => stat.tag);
  }

  /**
   * Tags of up to 10 documents sharing the document's `category` and/or
   * `type`. Without either field there is nothing to compare on, so no query
   * is made and no tags are returned.
   */
  async getSimilarDocumentTags(collection, document) {
    const filter = {};
    if (document.category) {
      filter.category = document.category;
    }
    if (document.type) {
      filter.type = document.type;
    }
    if (Object.keys(filter).length === 0) {
      return [];
    }
    const similarDocs = await this.storage.find(collection, filter, { limit: 10 });
    const tags = new Set();
    for (const doc of similarDocs) {
      if (Array.isArray(doc.tags)) {
        doc.tags.forEach((tag) => tags.add(tag));
      }
    }
    return Array.from(tags);
  }

  /**
   * Score suggestions in [0, 1]: base 0.5, up to +0.3 for recorded usage,
   * +0.2 when used within the last 7 days, +0.3 when the tag appears in the
   * document text. Returned sorted by descending score.
   */
  async scoreSuggestions(suggestions, collection, document) {
    // Both values are the same for every suggestion; compute them once.
    const text = this.extractText(document).toLowerCase();
    const recentCutoff = new Date(Date.now() - 7 * DAY_MS);
    const scored = suggestions.map((tag) => {
      let score = 0.5;
      const stats = this.tagStats.get(tag);
      if (stats) {
        score += Math.min(stats.count / 100, 0.3);
        if (stats.lastUsed > recentCutoff) {
          score += 0.2;
        }
      }
      if (text.includes(String(tag).toLowerCase())) {
        score += 0.3;
      }
      return { tag, score: Math.min(score, 1.0) };
    });
    return scored.sort((a, b) => b.score - a.score);
  }

  /**
   * Join the string values of text-like fields (known field names, or any
   * key containing "text"), descending into nested objects (not arrays)
   * while the depth is at most 3. Returns '' for a non-object document.
   */
  extractText(document) {
    if (document === null || typeof document !== 'object') {
      return '';
    }
    const texts = [];
    const visit = (obj, depth) => {
      if (depth > MAX_TEXT_DEPTH) {
        return;
      }
      for (const [key, value] of Object.entries(obj)) {
        if (typeof value === 'string') {
          if (TEXT_FIELDS.includes(key) || key.includes('text')) {
            texts.push(value);
          }
        } else if (typeof value === 'object' && value !== null && !Array.isArray(value)) {
          visit(value, depth + 1);
        }
      }
    };
    visit(document, 0);
    return texts.join(' ');
  }

  /** Map of tag -> number of documents in the collection carrying it. */
  async getTagCountsFromDocuments(collection) {
    // Simplified implementation: loads the tagged documents and counts in memory.
    const documents = await this.storage.find(collection, { tags: { $exists: true } });
    const tagCounts = new Map();
    for (const doc of documents) {
      if (!Array.isArray(doc.tags)) {
        continue;
      }
      for (const tag of doc.tags) {
        tagCounts.set(tag, (tagCounts.get(tag) || 0) + 1);
      }
    }
    return tagCounts;
  }

  /**
   * Sort specification for a tag-statistics aggregation. Not used by the
   * in-memory statistics path of this class; retained for compatibility.
   */
  getSortCriteria(sortBy) {
    switch (sortBy) {
      case 'name':
        return { _id: 1 };
      case 'recent':
        return { lastUsed: -1 };
      case 'count':
      default:
        return { count: -1 };
    }
  }

  /**
   * Upsert the hierarchy into `_system_metadata` under `_id: 'tag_hierarchy'`.
   * Best effort: a storage failure is logged and not rethrown, so the
   * in-memory hierarchy stays usable.
   */
  async persistTagHierarchy() {
    const hierarchyDoc = {
      _id: 'tag_hierarchy',
      hierarchy: Object.fromEntries(this.tagHierarchy),
      updatedAt: new Date(),
    };
    try {
      await this.storage.updateOne('_system_metadata', { _id: 'tag_hierarchy' }, { $set: hierarchyDoc }, { upsert: true });
    } catch (error) {
      console.warn('Failed to persist tag hierarchy:', error.message);
    }
  }
}
//# sourceMappingURL=tagging-system.js.map