vexify
Version:
Portable vector database with in-process ONNX embeddings. Zero-config semantic search via SQLite. No external servers required.
98 lines (76 loc) • 2.56 kB
JavaScript
;
/**
 * Removes every occurrence of `phrase` from `text` that is delimited by a
 * single space (or the start/end of the string) on both sides.
 *
 * Plain substring removal would also cut matches that begin or end in the
 * middle of a word and fuse the leftover fragments into a bogus token, so
 * occurrences that are not on word boundaries are left untouched.
 *
 * @param {string} text - Whitespace-normalized text (single spaces only).
 * @param {string} phrase - Phrase to remove.
 * @returns {string|null} The text with whole-phrase occurrences cut out
 *   (surrounding spaces are kept), or `null` when nothing was removed.
 */
function removeWholePhrase(text, phrase) {
  // An empty needle matches everywhere and would never advance the search.
  if (phrase.length === 0) return null;

  const kept = [];
  let cursor = 0;
  let searchFrom = 0;
  let at = text.indexOf(phrase, searchFrom);
  while (at !== -1) {
    const end = at + phrase.length;
    const startsOnBoundary = at === 0 || text[at - 1] === ' ';
    const endsOnBoundary = end === text.length || text[end] === ' ';
    if (startsOnBoundary && endsOnBoundary) {
      kept.push(text.slice(cursor, at));
      cursor = end;
      searchFrom = end;
    } else {
      // Mid-word hit: skip it but keep looking for a later, aligned one.
      searchFrom = at + 1;
    }
    at = text.indexOf(phrase, searchFrom);
  }

  if (cursor === 0 && kept.length === 0) return null;
  kept.push(text.slice(cursor));
  return kept.join('');
}

/**
 * Detects word sequences that recur across a set of documents and strips
 * them from individual texts.
 *
 * Usage: call `analyzeDocuments()` once with the corpus, then call
 * `deduplicate()` on each text.
 */
class TextDeduplicator {
  /**
   * @param {number} [minChunkSize=50] - Minimum character length a phrase
   *   must have to be tracked.
   * @param {number} [minOccurrences=2] - Minimum number of documents a
   *   phrase must appear in to be considered common.
   * @param {number} [maxPhrases=1000] - Upper bound on the number of common
   *   phrases retained after analysis.
   */
  constructor(minChunkSize = 50, minOccurrences = 2, maxPhrases = 1000) {
    this.minChunkSize = minChunkSize;
    this.minOccurrences = minOccurrences;
    this.commonPhrases = new Map();
    this.analyzed = false;
    this.maxPhrases = maxPhrases;
  }

  /**
   * Rebuilds the common-phrase table from `documents`.
   *
   * Each phrase is counted at most once per document, so the stored count
   * is the number of documents containing it. Only phrases reaching
   * `minOccurrences` are kept, most frequent first, capped at `maxPhrases`.
   *
   * @param {Iterable<string>|null|undefined} documents - Texts to analyze.
   *   A nullish value is treated as an empty corpus; non-string entries
   *   contribute no phrases.
   * @returns {number} Number of common phrases retained.
   */
  analyzeDocuments(documents) {
    this.commonPhrases.clear();
    const phraseCounts = new Map();

    for (const doc of documents == null ? [] : documents) {
      // extractPhrasesOptimized returns a Set, so every phrase is already
      // unique within the document and is counted once per document.
      for (const phrase of this.extractPhrasesOptimized(doc)) {
        phraseCounts.set(phrase, (phraseCounts.get(phrase) || 0) + 1);
      }
    }

    const ranked = Array.from(phraseCounts.entries())
      .filter(([, count]) => count >= this.minOccurrences)
      .sort((a, b) => b[1] - a[1])
      .slice(0, this.maxPhrases);

    for (const [phrase, count] of ranked) {
      this.commonPhrases.set(phrase, count);
    }

    this.analyzed = true;
    return this.commonPhrases.size;
  }

  /**
   * Collects candidate phrases from `text` using sliding windows of 3, 5, 7
   * and 10 words. Each window advances by half its width (rounded down,
   * at least 1) and only phrases of at least `minChunkSize` characters are
   * kept.
   *
   * @param {string} text - Source text; whitespace runs are collapsed first.
   * @returns {Set<string>} Distinct candidate phrases (empty for non-string
   *   input or texts shorter than three words).
   */
  extractPhrasesOptimized(text) {
    const phrases = new Set();
    if (typeof text !== 'string') return phrases;

    const normalized = text.replace(/\s+/g, ' ').trim();
    const words = normalized.split(' ');
    if (words.length < 3) return phrases;

    const wordCounts = [3, 5, 7, 10];
    for (const wordCount of wordCounts) {
      if (words.length < wordCount) continue;
      const step = Math.max(1, Math.floor(wordCount / 2));
      for (let i = 0; i <= words.length - wordCount; i += step) {
        const phrase = words.slice(i, i + wordCount).join(' ');
        if (phrase.length >= this.minChunkSize) {
          phrases.add(phrase);
        }
      }
    }
    return phrases;
  }

  /**
   * Strips known common phrases from `text`.
   *
   * Phrases are tried longest first and only removed where they sit on word
   * boundaries. A removal is skipped when it would shrink the text below
   * `max(200, 30% of the normalized input length)` characters.
   *
   * @param {string} text - Text to clean.
   * @returns {string} The whitespace-normalized text with common phrases
   *   removed. The input is returned untouched (not normalized) when no
   *   analysis has been run, no common phrases exist, or `text` is not a
   *   string.
   */
  deduplicate(text) {
    if (!this.analyzed || this.commonPhrases.size === 0) {
      return text;
    }
    if (typeof text !== 'string') {
      return text;
    }

    const normalized = text.replace(/\s+/g, ' ').trim();
    const minContentLength = Math.max(200, normalized.length * 0.3);
    const longestFirst = Array.from(this.commonPhrases.keys())
      .sort((a, b) => b.length - a.length);

    let result = normalized;
    for (const phrase of longestFirst) {
      const stripped = removeWholePhrase(result, phrase);
      if (stripped === null) continue;
      // Cutting a phrase leaves its surrounding spaces behind; collapse them.
      const afterRemoval = stripped.replace(/\s+/g, ' ').trim();
      if (afterRemoval.length >= minContentLength) {
        result = afterRemoval;
      }
    }
    // `result` is always normalized here, so no further cleanup is needed.
    return result;
  }

  /**
   * @returns {{commonPhrases: number, analyzed: boolean}} Number of stored
   *   common phrases and whether `analyzeDocuments()` has completed.
   */
  getStats() {
    return {
      commonPhrases: this.commonPhrases.size,
      analyzed: this.analyzed
    };
  }
}
// CommonJS export: exposes the class as the named property `TextDeduplicator`.
module.exports = { TextDeduplicator };