yaytt
Version:
Blazingly fast YouTube caption extractor with deduplication.
309 lines (308 loc) • 13.3 kB
JavaScript
export class CaptionDeduplicator {
constructor(options = {}) {
this.options = {
enabled: true,
timeThreshold: 3,
similarityThreshold: 0.8,
mergePartialMatches: true,
aggressiveMode: false,
...options
};
}
deduplicate(captions) {
if (!this.options.enabled || captions.length === 0) {
return captions;
}
const sortedCaptions = [...captions].sort((a, b) => a.start - b.start);
const groups = this.groupSimilarCaptions(sortedCaptions);
let result = groups.map(group => this.mergeCaptions(group));
result = this.removeTextContinuations(result);
result = this.finalCleanup(result);
if (this.options.aggressiveMode) {
result = this.ultraAggressiveCleanup(result);
}
return result;
}
groupSimilarCaptions(captions) {
const groups = [];
const used = new Set();
for (let i = 0; i < captions.length; i++) {
if (used.has(i))
continue;
const group = [captions[i]];
used.add(i);
const currentText = this.cleanText(captions[i].text);
for (let j = i + 1; j < captions.length; j++) {
if (used.has(j))
continue;
const timeDiff = captions[j].start - captions[i].start;
if (timeDiff > this.options.timeThreshold)
break;
const otherText = this.cleanText(captions[j].text);
if (this.areSimilarCaptions(currentText, otherText)) {
group.push(captions[j]);
used.add(j);
}
}
groups.push(group);
}
return groups;
}
areSimilarCaptions(text1, text2) {
if (text1 === text2)
return true;
if (text1.includes(text2) || text2.includes(text1))
return true;
const similarity = this.calculateSimilarity(text1, text2);
if (similarity >= this.options.similarityThreshold)
return true;
const words1 = text1.split(' ').filter(w => w.length > 2);
const words2 = text2.split(' ').filter(w => w.length > 2);
if (words1.length === 0 || words2.length === 0)
return false;
const commonWords = words1.filter(word => words2.includes(word));
const wordSimilarity = commonWords.length / Math.max(words1.length, words2.length);
return wordSimilarity >= 0.6;
}
removeTextContinuations(captions) {
const result = [];
for (let i = 0; i < captions.length; i++) {
const current = captions[i];
const currentClean = this.cleanText(current.text);
let shouldSkip = false;
// Check if this caption is a continuation of a previous one
for (let j = Math.max(0, i - 3); j < i; j++) {
const previous = captions[j];
const previousClean = this.cleanText(previous.text);
// Skip if current text is already contained in a recent previous caption
if (previousClean.includes(currentClean) && currentClean.length < previousClean.length * 0.9) {
shouldSkip = true;
break;
}
// Skip if this looks like a partial repeat
if (this.isPartialRepeat(previousClean, currentClean)) {
shouldSkip = true;
break;
}
}
// Check if next caption contains this one (look ahead)
if (!shouldSkip) {
for (let j = i + 1; j < Math.min(captions.length, i + 3); j++) {
const next = captions[j];
const nextClean = this.cleanText(next.text);
// Skip if next caption contains this one and is much longer
if (nextClean.includes(currentClean) &&
next.start - current.start <= this.options.timeThreshold &&
currentClean.length < nextClean.length * 0.8) {
shouldSkip = true;
break;
}
}
}
if (!shouldSkip) {
result.push(current);
}
}
return result;
}
isPartialRepeat(text1, text2) {
// Check if text2 starts with the same words as text1 ends with
const words1 = text1.split(' ');
const words2 = text2.split(' ');
if (words1.length < 3 || words2.length < 3)
return false;
// Get last few words of text1 and first few words of text2
const lastWords1 = words1.slice(-3).join(' ');
const firstWords2 = words2.slice(0, 3).join(' ');
// Check for overlap
return lastWords1.includes(firstWords2) || firstWords2.includes(lastWords1);
}
finalCleanup(captions) {
const result = [];
for (let i = 0; i < captions.length; i++) {
const current = captions[i];
const currentClean = this.cleanText(current.text);
// Skip very short captions that are likely noise
if (currentClean.length < 10) {
continue;
}
// Skip if identical to previous caption
if (result.length > 0) {
const lastClean = this.cleanText(result[result.length - 1].text);
if (currentClean === lastClean) {
continue;
}
}
result.push(current);
}
return result;
}
ultraAggressiveCleanup(captions) {
const result = [];
for (let i = 0; i < captions.length; i++) {
const current = captions[i];
const currentClean = this.cleanText(current.text);
let shouldSkip = false;
// Check against the last few captions in result
for (let j = Math.max(0, result.length - 5); j < result.length; j++) {
const existing = result[j];
const existingClean = this.cleanText(existing.text);
// Skip if this caption seems to continue the previous one
if (this.isContinuation(existingClean, currentClean)) {
// Merge the longer text into the existing caption
if (currentClean.length > existingClean.length) {
result[j] = current; // Replace with longer version
}
shouldSkip = true;
break;
}
// Skip if significant word overlap (even if not exact continuation)
if (this.hasSignificantWordOverlap(existingClean, currentClean)) {
// Keep the longer, more complete one
if (currentClean.length > existingClean.length) {
result[j] = current;
}
shouldSkip = true;
break;
}
}
if (!shouldSkip) {
result.push(current);
}
}
return result;
}
isContinuation(text1, text2) {
// Check if text2 continues where text1 left off
const words1 = text1.split(' ');
const words2 = text2.split(' ');
if (words1.length < 3 || words2.length < 3)
return false;
// Get last 3 words of text1 and first 5 words of text2
const lastWords1 = words1.slice(-3);
const firstWords2 = words2.slice(0, 5);
// Check if any of the last words appear in the first words
const overlap = lastWords1.filter(word => firstWords2.includes(word));
return overlap.length >= 2; // At least 2 words overlap
}
hasSignificantWordOverlap(text1, text2) {
const words1 = text1.split(' ').filter(w => w.length > 3); // Only meaningful words
const words2 = text2.split(' ').filter(w => w.length > 3);
if (words1.length < 3 || words2.length < 3)
return false;
const commonWords = words1.filter(word => words2.includes(word));
const overlapRatio = commonWords.length / Math.min(words1.length, words2.length);
return overlapRatio >= 0.7; // 70% overlap of meaningful words
}
mergeCaptions(captions) {
// Sort by start time
captions.sort((a, b) => a.start - b.start);
// Find the longest, most complete text
const longestCaption = captions.reduce((longest, current) => {
const currentClean = this.cleanText(current.text);
const longestClean = this.cleanText(longest.text);
// Prefer longer, more complete text
if (currentClean.length > longestClean.length) {
return current;
}
// If similar length, prefer the one without trailing incomplete words
if (Math.abs(currentClean.length - longestClean.length) < 10) {
if (this.isMoreComplete(currentClean, longestClean)) {
return current;
}
}
return longest;
});
// Use the earliest start time and calculate proper duration
const earliestStart = Math.min(...captions.map(c => c.start));
const latestEnd = Math.max(...captions.map(c => c.start + c.dur));
return {
start: earliestStart,
dur: latestEnd - earliestStart,
text: longestCaption.text
};
}
removeRedundantCaptions(captions) {
const result = [];
for (let i = 0; i < captions.length; i++) {
const current = captions[i];
const currentClean = this.cleanText(current.text);
// Skip if this caption's text is completely contained in the next one
if (i < captions.length - 1) {
const next = captions[i + 1];
const nextClean = this.cleanText(next.text);
// If current text is fully contained in next and they're close in time
if (nextClean.includes(currentClean) &&
next.start - current.start <= this.options.timeThreshold &&
currentClean.length < nextClean.length * 0.8) {
continue; // Skip this redundant caption
}
}
// Skip if this caption's text is completely contained in the previous one
if (result.length > 0) {
const previous = result[result.length - 1];
const previousClean = this.cleanText(previous.text);
if (previousClean.includes(currentClean) &&
current.start - previous.start <= this.options.timeThreshold &&
currentClean.length < previousClean.length * 0.8) {
continue; // Skip this redundant caption
}
}
result.push(current);
}
return result;
}
calculateSimilarity(text1, text2) {
if (text1 === text2)
return 1;
const longer = text1.length > text2.length ? text1 : text2;
const shorter = text1.length > text2.length ? text2 : text1;
if (longer.length === 0)
return 1;
// Check if shorter text is contained in longer text
if (longer.includes(shorter)) {
return shorter.length / longer.length;
}
// Calculate Levenshtein distance for more complex similarity
return (longer.length - this.levenshteinDistance(text1, text2)) / longer.length;
}
levenshteinDistance(str1, str2) {
const matrix = [];
for (let i = 0; i <= str2.length; i++) {
matrix[i] = [i];
}
for (let j = 0; j <= str1.length; j++) {
matrix[0][j] = j;
}
for (let i = 1; i <= str2.length; i++) {
for (let j = 1; j <= str1.length; j++) {
if (str2.charAt(i - 1) === str1.charAt(j - 1)) {
matrix[i][j] = matrix[i - 1][j - 1];
}
else {
matrix[i][j] = Math.min(matrix[i - 1][j - 1] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j] + 1);
}
}
}
return matrix[str2.length][str1.length];
}
cleanText(text) {
return text
.toLowerCase()
.replace(/\[música\]/gi, '') // Remove [Música] tags
.replace(/\[music\]/gi, '') // Remove [Music] tags
.replace(/[^\w\s]/g, ' ') // Remove punctuation
.replace(/\s+/g, ' ') // Normalize whitespace
.trim();
}
isMoreComplete(text1, text2) {
// Check if text1 ends with a complete word while text2 doesn't
const words1 = text1.split(' ');
const words2 = text2.split(' ');
if (words1.length === words2.length) {
// Same number of words, prefer the one that doesn't cut off mid-sentence
return !text1.endsWith(' ') && text2.endsWith(' ');
}
return words1.length > words2.length;
}
}