UNPKG

echogarden

Version:

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

461 lines 18.9 kB
import { htmlToText } from 'html-to-text';
import { secondsToHMS, secondsToMS } from '../utilities/Utilities.js';
import { isWordOrSymbolWord } from '../nlp/Segmentation.js';
import { charactersToWriteAhead } from '../audio/AudioPlayer.js';
import { readFileAsUtf8 } from '../utilities/FileSystem.js';
import { deepClone } from '../utilities/ObjectUtilities.js';
import { formatHMS, formatMS } from '../utilities/StringUtilities.js';
import { anyOf, buildRegExp, capture, digit, inputStart, matches, notUnicodeProperty, oneOrMore, zeroOrMore } from 'regexp-composer';

/**
 * Reads a subtitles file as UTF-8 and returns the plain text of all its cues.
 *
 * @param {string} filename - Path of the subtitles file to read.
 * @returns {Promise<string>} Cue texts joined with single spaces.
 */
export async function subtitlesFileToText(filename) {
    return subtitlesToText(await readFileAsUtf8(filename));
}

/**
 * Extracts the plain text (markup removed) of all cues in a subtitles string.
 *
 * @param {string} subtitles - Subtitles file content.
 * @returns {string} Cue texts joined with single spaces.
 */
export function subtitlesToText(subtitles) {
    return subtitlesToTimeline(subtitles, true).map(entry => entry.text).join(' ');
}

/**
 * Parses subtitles content into a flat timeline of 'segment' entries.
 *
 * A line matching a time range (`start --> end`, with or without an hours field)
 * opens a new cue. Following non-empty lines are appended to that cue's text,
 * separated by single spaces. An empty line closes the cue; lines that are not
 * inside a cue (headers, cue indexes, notes) are ignored.
 *
 * @param {string} subtitles - Subtitles file content.
 * @param {boolean} [removeMarkup=true] - When true, tags are stripped, the text is
 *   passed through `htmlToText` and runs of whitespace are collapsed.
 * @returns {Array<{type: string, startTime: number, endTime: number, text: string}>}
 */
export function subtitlesToTimeline(subtitles, removeMarkup = true) {
    const lines = subtitles.split(/\r?\n/);
    const timeline = [];

    let isWithinCue = false;

    // Parse lines of subtitles text
    for (let line of lines) {
        line = line.trim();

        if (line.length === 0) {
            isWithinCue = false;

            continue;
        }

        let result = tryParseTimeRangePatternWithHours(line);

        if (!result.succeeded) {
            result = tryParseTimeRangePatternWithoutHours(line);
        }

        if (result.succeeded) {
            timeline.push({
                type: 'segment',
                startTime: result.startTime,
                endTime: result.endTime,
                text: ''
            });

            isWithinCue = true;
        } else if (isWithinCue && timeline.length > 0) {
            const lastEntry = timeline[timeline.length - 1];

            if (lastEntry.text === '') {
                lastEntry.text = line;
            } else {
                lastEntry.text += ' ' + line;
            }
        }
    }

    if (!removeMarkup) {
        return timeline;
    }

    // Remove markup in each entry text
    const timelineWithoutMarkup = timeline.map((entry) => {
        let plainText = entry.text;

        plainText = plainText.replaceAll(/<[^>]*>/g, '');
        plainText = htmlToText(plainText, { wordwrap: false });
        plainText = plainText.replaceAll(/\s+/g, ' ').trim();

        return {
            ...entry,
            text: plainText
        };
    });

    return timelineWithoutMarkup;
}

/**
 * Serializes a timeline to SRT or WebVTT subtitles text.
 *
 * The given timeline is deep-cloned first, so the caller's object is not modified.
 * The effective configuration is `defaultSubtitlesBaseConfig`, overlaid with the
 * format-specific extension, overlaid with `subtitlesConfig`.
 *
 * @param {Array<object>} timeline - Timeline to convert.
 * @param {object} [subtitlesConfig] - Partial configuration. `format: 'webvtt'`
 *   selects WebVTT; any other value (or none) selects the SRT extension.
 * @returns {string} The subtitles file content.
 * @throws {Error} If `mode` is not one of 'segment', 'sentence', 'word', 'phone',
 *   'word+phone' or 'line'.
 */
export function timelineToSubtitles(timeline, subtitlesConfig) {
    // Prepare subtitle configuration
    timeline = deepClone(timeline);

    let config = subtitlesConfig || {};

    if (config.format == 'webvtt') {
        config = { ...defaultSubtitlesBaseConfig, ...webVttConfigExtension, ...config };
    } else {
        config = { ...defaultSubtitlesBaseConfig, ...srtConfigExtension, ...config };
    }

    // Initialize subtitle file content
    const lineBreakString = config.lineBreakString;

    let outText = '';

    if (config.format == 'webvtt') {
        outText += `WEBVTT${lineBreakString}Kind: captions${lineBreakString}`;

        if (config.language) {
            outText += `Language: ${config.language}${lineBreakString}`;
        }

        outText += lineBreakString;
    }

    // Generate the cues from the given timeline
    let cues;

    if (config.mode == 'segment' || config.mode == 'sentence') {
        cues = getCuesFromTimeline_IsolateSegmentSentence(timeline, config);
    } else if (config.mode == 'word' || config.mode == 'phone' || config.mode == 'word+phone') {
        cues = getCuesFromTimeline_IsolateWordPhone(timeline, config);
    } else if (config.mode == 'line') {
        cues = getCuesFromTimeline_IsolateLines(timeline, config);
    } else {
        throw new Error('Invalid subtitles mode.');
    }

    // Extend cue end times with maximum added duration, if possible
    if (cues.length > 0 &&
        config.maxAddedDuration > 0 &&
        (config.mode === 'segment' || config.mode === 'sentence' || config.mode === 'line')) {

        // Each cue is extended up to, but never past, the start of the following cue
        for (let i = 1; i < cues.length; i++) {
            const currentCue = cues[i];
            const previousCue = cues[i - 1];

            previousCue.endTime = Math.min(previousCue.endTime + config.maxAddedDuration, currentCue.startTime);
        }

        // The last cue can only be extended when the total duration is known
        if (config.totalDuration != null) {
            const lastCue = cues[cues.length - 1];

            lastCue.endTime = Math.min(lastCue.endTime + config.maxAddedDuration, config.totalDuration);
        }
    }

    // Write cues to output text
    for (let cueIndex = 0; cueIndex < cues.length; cueIndex++) {
        outText += cueObjectToText(cues[cueIndex], cueIndex + 1, config);
    }

    return outText;
}

/**
 * Generates subtitle cues from timeline.
 * Ensures each segment or sentence starts in a new cue.
 *
 * Entries whose text fits in `config.maxLineWidth` become a single one-line cue.
 * Longer entries are broken into lines at word boundaries, and lines are grouped
 * into cues of up to `config.maxLineCount` lines.
 *
 * Note: writes `startOffsetUtf16` / `endOffsetUtf16` onto the word entries it
 * processes, and may replace `entry.timeline` when flattening sentences.
 *
 * @param {Array<object>} timeline - Segment, sentence or word timeline.
 * @param {object} config - Resolved subtitles configuration.
 * @returns {Array<{lines: string[], startTime: number, endTime: number}>}
 * @throws {Error} If a word's text can't be located in its parent entry text.
 */
function getCuesFromTimeline_IsolateSegmentSentence(timeline, config) {
    if (timeline.length === 0) {
        return [];
    }

    // If the given timeline is a word timeline, wrap it with a segment and call again
    if (timeline[0].type === 'word') {
        const wordTimeline = timeline.filter(wordEntry => isWordOrSymbolWord(wordEntry.text));

        // Nothing passed the filter: there are no words to derive times or text from
        if (wordTimeline.length === 0) {
            return [];
        }

        const text = wordTimeline.map(wordEntry => wordEntry.text).join(' ');

        const segmentEntry = {
            type: 'segment',
            text: text,
            startTime: wordTimeline[0].startTime,
            endTime: wordTimeline[wordTimeline.length - 1].endTime,
            timeline: wordTimeline
        };

        return getCuesFromTimeline_IsolateSegmentSentence([segmentEntry], config);
    }

    const cues = [];

    // Generate one or more cues from each segment or sentence in the timeline.
    for (const entry of timeline) {
        if (!entry.timeline || entry.timeline.length === 0) {
            continue;
        }

        if (entry.type === 'segment' && entry.timeline?.[0]?.type === 'sentence') {
            if (config.mode === 'segment') {
                // If the mode is 'segment', flatten all sentences to a single word timeline.
                // Sentences without a nested timeline contribute nothing.
                entry.timeline = entry.timeline.flatMap(sentenceEntry => sentenceEntry.timeline ?? []);
            } else {
                cues.push(...getCuesFromTimeline_IsolateSegmentSentence(entry.timeline, config));

                continue;
            }
        }

        const entryText = entry.text;
        const maxLineWidth = config.maxLineWidth;

        // Short entries fit in a single line and need no splitting
        if (entryText.length <= maxLineWidth) {
            cues.push({
                lines: [entryText],
                startTime: entry.startTime,
                endTime: entry.endTime
            });

            continue;
        }

        if (!entry.timeline || entry.timeline?.[0]?.type != 'word') {
            continue;
        }

        const wordTimeline = entry.timeline.filter(wordEntry => isWordOrSymbolWord(wordEntry.text));

        // First, add word start and end offsets for all word entries
        let lastWordEndOffset = 0;

        for (const wordEntry of wordTimeline) {
            const wordStartOffset = entryText.indexOf(wordEntry.text, lastWordEndOffset);

            if (wordStartOffset === -1) {
                throw new Error(`Couldn't find word '${wordEntry.text}' in its parent entry text`);
            }

            const wordEndOffset = wordStartOffset + wordEntry.text.length;

            lastWordEndOffset = wordEndOffset;

            wordEntry.startOffsetUtf16 = wordStartOffset;
            wordEntry.endOffsetUtf16 = wordEndOffset;
        }

        // Advances an end offset past any immediately following characters listed in
        // `charactersToWriteAhead`. A missing offset (no such word) maps to the end of the text.
        const getExtendedEndOffset = (offset) => {
            if (offset == undefined) {
                return entryText.length;
            }

            let extendedOffset = offset;

            while (charactersToWriteAhead.includes(entryText[extendedOffset])) {
                extendedOffset += 1;
            }

            return extendedOffset;
        };

        // Add cues
        let currentCue = {
            lines: [],
            startTime: -1,
            endTime: -1
        };

        let lineStartWordOffset = 0;
        let lineStartOffset = 0;

        for (let wordIndex = 0; wordIndex < wordTimeline.length; wordIndex++) {
            const isLastWord = wordIndex === wordTimeline.length - 1;

            const wordEntry = wordTimeline[wordIndex];
            const wordEndOffset = wordEntry.endOffsetUtf16;
            const wordExtendedEndOffset = getExtendedEndOffset(wordEndOffset);

            const nextWordEntry = wordTimeline[wordIndex + 1];
            const nextWordExtendedEndOffset = getExtendedEndOffset(nextWordEntry?.endOffsetUtf16);

            // Decide if to add to a new line
            const lineLength = wordExtendedEndOffset - lineStartOffset;
            const lineLengthWithNextWord = nextWordExtendedEndOffset - lineStartOffset;

            const wordsRemaining = wordTimeline.length - wordIndex - 1;

            const lineLengthWithNextWordExceedsMaxLineWidth = lineLengthWithNextWord >= maxLineWidth;
            const lineLengthExceedsHalfMaxLineWidth = lineLength >= maxLineWidth / 2;

            const wordsRemainingAreEqualOrLessToMinimumWordsInLine = wordsRemaining <= config.minWordsInLine;
            const remainingTextExceedsMaxLineWidth = entryText.length - lineStartOffset > maxLineWidth;

            const followingSubstringIsPhraseSeparator = phraseSeparatorRegExp.test(entryText.substring(wordEndOffset));

            const shouldAddNewLine =
                isLastWord ||
                lineLengthWithNextWordExceedsMaxLineWidth ||
                (remainingTextExceedsMaxLineWidth &&
                    lineLengthExceedsHalfMaxLineWidth &&
                    (wordsRemainingAreEqualOrLessToMinimumWordsInLine ||
                        (config.separatePhrases && followingSubstringIsPhraseSeparator)));

            // If it was decided to add a new line
            if (shouldAddNewLine) {
                // Extend line end offset to end of sentence entry if last word encountered
                let lineEndOffset;

                if (isLastWord) {
                    lineEndOffset = entryText.length;
                } else {
                    lineEndOffset = wordExtendedEndOffset;
                }

                // Get line text
                const lineText = entryText.substring(lineStartOffset, lineEndOffset);

                // Find start and end times of line
                const nextWordStartTime = isLastWord ? entry.endTime : wordTimeline[wordIndex + 1].startTime;

                const lineStartTime = wordTimeline[lineStartWordOffset].startTime;
                const lineEndTime = nextWordStartTime;

                // Add new line to cue
                currentCue.lines.push(lineText);

                // Update cue start and end times
                if (currentCue.startTime === -1) {
                    currentCue.startTime = lineStartTime;
                }

                currentCue.endTime = lineEndTime;

                // Finalize cue if needed.
                // (Loose comparison kept: `maxLineCount` comes from caller-supplied configuration.)
                if (isLastWord || currentCue.lines.length == config.maxLineCount) {
                    cues.push(currentCue);

                    currentCue = {
                        lines: [],
                        startTime: -1,
                        endTime: -1
                    };
                }

                // Update offsets
                lineStartOffset = lineEndOffset;
                lineStartWordOffset = wordIndex + 1;
            }
        }
    }

    return cues;
}

/**
 * Generates cues from timeline. Isolates words or phones in individual cues.
 *
 * Walks the timeline recursively. An entry produces a cue when its type is 'word'
 * and the mode is 'word' or 'word+phone', or when its type is 'phone' and the mode
 * is 'phone' or 'word+phone'. Cues of nested entries follow their parent's cue.
 *
 * @param {Array<object>} timeline - Timeline of any nesting depth.
 * @param {object} config - Resolved subtitles configuration (`mode` is read).
 * @returns {Array<{lines: string[], startTime: number, endTime: number}>}
 */
function getCuesFromTimeline_IsolateWordPhone(timeline, config) {
    if (timeline.length == 0) {
        return [];
    }

    const mode = config.mode;

    const cues = [];

    for (const entry of timeline) {
        const entryIsWord = entry.type == 'word';
        const entryIsPhone = entry.type == 'phone';

        const shouldIncludeEntry =
            (entryIsWord && (mode == 'word' || mode == 'word+phone')) ||
            (entryIsPhone && (mode == 'phone' || mode == 'word+phone'));

        if (shouldIncludeEntry) {
            cues.push({
                lines: [entry.text],
                startTime: entry.startTime,
                endTime: entry.endTime,
            });
        }

        if (entry.timeline) {
            cues.push(...getCuesFromTimeline_IsolateWordPhone(entry.timeline, config));
        }
    }

    return cues;
}

/**
 * Generates cues from timeline. Isolates lines in individual cues.
 *
 * Each cue carries one (trimmed) line of `config.originalText`, timed from the
 * start of the first word on that line to the end of the last word on it.
 * Words are assigned to lines through their `startOffsetUtf16` property, which
 * is read as a character offset into `config.originalText`.
 *
 * @param {Array<object>} timeline - Timeline containing (possibly nested) word entries.
 * @param {object} config - Resolved subtitles configuration; `originalText` is required.
 * @returns {Array<{lines: string[], startTime: number, endTime: number}>}
 * @throws {Error} If `config.originalText` is null or undefined.
 */
function getCuesFromTimeline_IsolateLines(timeline, config) {
    if (timeline.length == 0) {
        return [];
    }

    const originalText = config.originalText;

    if (originalText == null) {
        throw new Error(`'line' subtitles mode requires passing the original text in the 'originalText' property of the configuration object.`);
    }

    // The capturing group keeps the line break strings as their own array elements,
    // so the concatenated element lengths cover every character offset of the original text.
    const lines = originalText.split(/(\r?\n)/g);

    // Maps each character offset of the original text to the index of the element containing it
    const charOffsetToLineNumber = [];

    for (let lineNumber = 0; lineNumber < lines.length; lineNumber++) {
        const line = lines[lineNumber];

        for (let i = 0; i < line.length; i++) {
            charOffsetToLineNumber.push(lineNumber);
        }
    }

    const cues = [];

    let currentCueWords = [];

    // Emits a cue for the accumulated words (all belonging to one line) and resets the accumulator
    function addCueFromCurrentWords() {
        if (currentCueWords.length == 0) {
            return;
        }

        const firstWordEntry = currentCueWords[0];
        const lastWordEntry = currentCueWords[currentCueWords.length - 1];

        const lineNumber = charOffsetToLineNumber[firstWordEntry.startOffsetUtf16];
        const line = lines[lineNumber].trim();

        cues.push({
            lines: [line],
            startTime: firstWordEntry.startTime,
            endTime: lastWordEntry.endTime
        });

        currentCueWords = [];
    }

    // Recursively collects word entries, flushing a cue whenever a word starts on a later line
    function addCuesFrom(timeline) {
        for (const entry of timeline) {
            if (entry.type == 'word') {
                const currentWordLineNumber = charOffsetToLineNumber[entry.startOffsetUtf16];
                const previousWordEntry = currentCueWords[currentCueWords.length - 1];

                if (previousWordEntry) {
                    const previousWordLineNumber = charOffsetToLineNumber[previousWordEntry.startOffsetUtf16];

                    if (currentWordLineNumber > previousWordLineNumber) {
                        addCueFromCurrentWords();
                    }
                }

                currentCueWords.push(entry);
            } else if (entry.timeline) {
                addCuesFrom(entry.timeline);
            }
        }
    }

    addCuesFrom(timeline);
    addCueFromCurrentWords(); // Add any remaining words

    return cues;
}

// Converts the digits following the decimal separator to a fraction of a second.
// Treats them as a decimal fraction, so '5', '50' and '500' all mean half a second.
function fractionDigitsToSeconds(fractionDigits) {
    return Number(`0.${fractionDigits}`);
}

// Parses a captured run of decimal digits as a base-10 integer.
function digitsToInteger(digits) {
    return Number.parseInt(digits, 10);
}

/**
 * Tries to parse a time range of the form `H:M:S.f --> H:M:S.f` (either `.` or `,`
 * as decimal separator) from the start of the given line.
 *
 * @param {string} line - Line to parse.
 * @returns {{startTime: number, endTime: number, succeeded: boolean}} Times in seconds.
 *   When the line doesn't match, `succeeded` is false and both times are -1.
 */
export function tryParseTimeRangePatternWithHours(line) {
    const match = timeRangeWithHoursRegExp.exec(line);

    if (!match) {
        return { startTime: -1, endTime: -1, succeeded: false };
    }

    const startHours = digitsToInteger(match[1]);
    const startMinutes = digitsToInteger(match[2]);
    const startSeconds = digitsToInteger(match[3]);
    const startFraction = fractionDigitsToSeconds(match[4]);

    const endHours = digitsToInteger(match[5]);
    const endMinutes = digitsToInteger(match[6]);
    const endSeconds = digitsToInteger(match[7]);
    const endFraction = fractionDigitsToSeconds(match[8]);

    const startTime = (startFraction) + (startSeconds) + (startMinutes * 60) + (startHours * 60 * 60);
    const endTime = (endFraction) + (endSeconds) + (endMinutes * 60) + (endHours * 60 * 60);

    return { startTime, endTime, succeeded: true };
}

/**
 * Tries to parse a time range of the form `M:S.f --> M:S.f` (either `.` or `,`
 * as decimal separator) from the start of the given line.
 *
 * @param {string} line - Line to parse.
 * @returns {{startTime: number, endTime: number, succeeded: boolean}} Times in seconds.
 *   When the line doesn't match, `succeeded` is false and both times are -1.
 */
export function tryParseTimeRangePatternWithoutHours(line) {
    const match = timeRangeWithoutHoursRegExp.exec(line);

    if (!match) {
        return { startTime: -1, endTime: -1, succeeded: false };
    }

    const startMinutes = digitsToInteger(match[1]);
    const startSeconds = digitsToInteger(match[2]);
    const startFraction = fractionDigitsToSeconds(match[3]);

    const endMinutes = digitsToInteger(match[4]);
    const endSeconds = digitsToInteger(match[5]);
    const endFraction = fractionDigitsToSeconds(match[6]);

    const startTime = (startFraction) + (startSeconds) + (startMinutes * 60);
    const endTime = (endFraction) + (endSeconds) + (endMinutes * 60);

    return { startTime, endTime, succeeded: true };
}

/**
 * Serializes a single cue: optional cue index line, the `start --> end` line,
 * the trimmed cue lines, and a terminating blank line.
 *
 * @param {{lines: string[], startTime: number, endTime: number}} cue - Cue to write.
 * @param {number} cueIndex - Number written before the cue when `config.includeCueIndexes` is set.
 * @param {object} config - Resolved subtitles configuration.
 * @returns {string} The cue text, ending with two line breaks.
 * @throws {Error} If the cue is missing or has no lines.
 */
function cueObjectToText(cue, cueIndex, config) {
    if (!cue || !cue.lines || cue.lines.length == 0) {
        throw new Error(`Cue is empty`);
    }

    const lineBreakString = config.lineBreakString;

    let outText = '';

    if (config.includeCueIndexes) {
        outText += `${cueIndex}${lineBreakString}`;
    }

    let formattedStartTime;
    let formattedEndTime;

    // (Loose comparison kept: `includeHours` comes from caller-supplied configuration.)
    if (config.includeHours == true) {
        formattedStartTime = formatHMS(secondsToHMS(cue.startTime), config.decimalSeparator);
        formattedEndTime = formatHMS(secondsToHMS(cue.endTime), config.decimalSeparator);
    } else {
        formattedStartTime = formatMS(secondsToMS(cue.startTime), config.decimalSeparator);
        formattedEndTime = formatMS(secondsToMS(cue.endTime), config.decimalSeparator);
    }

    outText += `${formattedStartTime} --> ${formattedEndTime}`;
    outText += `${lineBreakString}`;

    outText += cue.lines.map(line => line.trim()).join(lineBreakString);

    outText += `${lineBreakString}`;
    outText += `${lineBreakString}`;

    return outText;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////
// Patterns
////////////////////////////////////////////////////////////////////////////////////////////////////////

// Matches `H:M:S.f --> H:M:S.f` at the start of a line; captures the eight numeric fields
const timeRangeWithHoursRegExp = buildRegExp([
    inputStart,
    capture(oneOrMore(digit)),
    ':',
    capture(oneOrMore(digit)),
    ':',
    capture(oneOrMore(digit)),
    anyOf('.', ','),
    capture(oneOrMore(digit)),
    zeroOrMore(' '),
    '-->',
    zeroOrMore(' '),
    capture(oneOrMore(digit)),
    ':',
    capture(oneOrMore(digit)),
    ':',
    capture(oneOrMore(digit)),
    anyOf('.', ','),
    capture(oneOrMore(digit)),
]);

// Matches `M:S.f --> M:S.f` at the start of a line; captures the six numeric fields
const timeRangeWithoutHoursRegExp = buildRegExp([
    inputStart,
    capture(oneOrMore(digit)),
    ':',
    capture(oneOrMore(digit)),
    anyOf('.', ','),
    capture(oneOrMore(digit)),
    zeroOrMore(' '),
    '-->',
    zeroOrMore(' '),
    capture(oneOrMore(digit)),
    ':',
    capture(oneOrMore(digit)),
    anyOf('.', ','),
    capture(oneOrMore(digit)),
]);

// NOTE(review): the first two entries are identical here; possibly one was meant to be
// a different comma character (e.g. a fullwidth comma) - confirm against the original source.
const phraseSeparatorCharacters = [',', ',', '、', ';', ':', '),', '",', '”,'];

// Matches a phrase separator at the start of the tested string, with lookaround
// conditions excluding characters having the Unicode `Decimal_Number` property.
const phraseSeparatorRegExp = buildRegExp(matches([
    inputStart,
    anyOf(...phraseSeparatorCharacters)
], {
    ifPrecededBy: notUnicodeProperty('Decimal_Number'),
    ifFollowedBy: notUnicodeProperty('Decimal_Number'),
}));

// Format-independent defaults, applied beneath the format extension and the caller's config
export const defaultSubtitlesBaseConfig = {
    format: 'srt',
    mode: 'sentence',
    maxLineCount: 2,
    maxLineWidth: 42,
    minWordsInLine: 4,
    separatePhrases: true,
    maxAddedDuration: 3.0,
};

// Defaults applied when the format is not 'webvtt'
export const srtConfigExtension = {
    decimalSeparator: ',',
    includeCueIndexes: true,
    includeHours: true,
    lineBreakString: '\n',
};

// Defaults applied when the format is 'webvtt'
export const webVttConfigExtension = {
    decimalSeparator: '.',
    includeCueIndexes: false,
    includeHours: true,
    lineBreakString: '\n',
};
//# sourceMappingURL=Subtitles.js.map