UNPKG

summarizely-cli

Version:

YouTube summarizer that respects your existing subscriptions. No API keys required.

83 lines 3.87 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.parseVttToTranscript = parseVttToTranscript; // VTT parser that deduplicates overlapping segments and outputs as single line function parseVttToTranscript(vtt) { const lines = vtt.replace(/\r/g, '').split('\n'); let fullTranscript = ''; let i = 0; while (i < lines.length) { const line = lines[i].trim(); i++; if (!line) continue; // Timestamp line like: 00:00:01.000 --> 00:00:04.000 if (/^\d{2}:\d{2}:\d{2}\.\d{3}\s+-->\s+\d{2}:\d{2}:\d{2}\.\d{3}/.test(line)) { const textLines = []; while (i < lines.length && lines[i].trim() !== '') { textLines.push(lines[i]); i++; } // Clean the segment text (remove HTML tags, normalize spaces) const segmentText = textLines.join(' ') .replace(/<[^>]+>/g, '') .replace(/\s+/g, ' ') .trim(); if (segmentText) { if (!fullTranscript) { // First segment - just add it fullTranscript = segmentText; } else { // Find where this segment overlaps with existing transcript // YouTube captions typically repeat the last few words const overlap = findOverlap(fullTranscript, segmentText); if (overlap > 0) { // Add only the new portion after the overlap const newContent = segmentText.substring(overlap).trim(); if (newContent) { fullTranscript += ' ' + newContent; } } else { // No overlap found - this might be a new sentence/section fullTranscript += ' ' + segmentText; } } } } } // Return as single line with normalized spaces return fullTranscript.replace(/\s+/g, ' ').trim(); } // Helper function to find where new segment overlaps with existing text function findOverlap(existing, newSegment) { // Look for overlap in the last portion of existing text // Start with longer overlaps and work down to shorter ones const maxOverlapLength = Math.min(existing.length, newSegment.length); const existingLower = existing.toLowerCase(); const newSegmentLower = newSegment.toLowerCase(); // Check progressively smaller portions of the end of existing text for (let overlapSize = Math.min(100, maxOverlapLength); overlapSize >= 10; overlapSize--) { const existingTail = existingLower.slice(-overlapSize); const newSegmentHead = newSegmentLower.slice(0, overlapSize); if (existingTail === newSegmentHead) { return overlapSize; } } // Try to find partial word overlap (common in captions) const lastWords = existing.split(' ').slice(-5).join(' ').toLowerCase(); const firstWords = newSegment.split(' ').slice(0, 5).join(' ').toLowerCase(); // Check if the beginning of new segment appears at the end of existing for (let wordCount = 4; wordCount >= 1; wordCount--) { const lastNWords = existing.split(' ').slice(-wordCount).join(' ').toLowerCase(); const firstNWords = newSegment.split(' ').slice(0, wordCount).join(' ').toLowerCase(); if (lastNWords === firstNWords) { // Return the character position where overlap starts const overlapIndex = existing.toLowerCase().lastIndexOf(lastNWords); return overlapIndex >= 0 ? existing.length - overlapIndex : 0; } } return 0; // No overlap found } //# sourceMappingURL=vtt.js.map