echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
630 lines (477 loc) • 17.3 kB
text/typescript
import { htmlToText } from 'html-to-text'
import { secondsToHMS, secondsToMS } from '../utilities/Utilities.js'
import { isWordOrSymbolWord } from '../nlp/Segmentation.js'
import { charactersToWriteAhead } from '../audio/AudioPlayer.js'
import { Timeline, TimelineEntry } from '../utilities/Timeline.js'
import { readFileAsUtf8 } from '../utilities/FileSystem.js'
import { deepClone } from '../utilities/ObjectUtilities.js'
import { formatHMS, formatMS } from '../utilities/StringUtilities.js'
import { anyOf, buildRegExp, capture, digit, inputStart, matches, notUnicodeProperty, oneOrMore, zeroOrMore } from 'regexp-composer'
// Reads the given file as UTF-8 text and converts its subtitle content to plain text.
//
// filename: path of the subtitles file to read.
// Resolves to the text produced by subtitlesToText() for the file's content.
export async function subtitlesFileToText(filename: string) {
	const fileContent = await readFileAsUtf8(filename)

	return subtitlesToText(fileContent)
}
// Converts subtitles content to plain text.
//
// The subtitles are parsed to a timeline with markup removal enabled, and the
// text of every resulting entry is joined using a single space.
export function subtitlesToText(subtitles: string) {
	const cueTexts: string[] = []

	for (const cueEntry of subtitlesToTimeline(subtitles, true)) {
		cueTexts.push(cueEntry.text)
	}

	return cueTexts.join(' ')
}
// Parses subtitles content into a timeline of 'segment' entries, one per cue.
//
// subtitles: full text content of the subtitles.
// removeMarkup: when true (default), tags are stripped from each cue text, the result
// is passed through htmlToText (word wrapping disabled) and whitespace runs are collapsed.
//
// A cue starts at a line matching a time range (with or without hours) and ends at the
// next blank line. Non-blank lines outside of a cue (such as headers or cue indexes) are ignored.
export function subtitlesToTimeline(subtitles: string, removeMarkup = true) {
	const parsedTimeline: Timeline = []
	let insideCue = false

	for (const rawLine of subtitles.split(/\r?\n/)) {
		const trimmedLine = rawLine.trim()

		// A blank line terminates the current cue
		if (trimmedLine.length == 0) {
			insideCue = false
			continue
		}

		// Try the longer pattern first, then fall back to the one without an hours field
		let timeRange = tryParseTimeRangePatternWithHours(trimmedLine)

		if (!timeRange.succeeded) {
			timeRange = tryParseTimeRangePatternWithoutHours(trimmedLine)
		}

		// A time range line opens a new cue with initially empty text
		if (timeRange.succeeded) {
			parsedTimeline.push({
				type: 'segment',
				startTime: timeRange.startTime,
				endTime: timeRange.endTime,
				text: ''
			})

			insideCue = true
			continue
		}

		// Anything else is cue text only if a cue is currently open
		if (!insideCue || parsedTimeline.length == 0) {
			continue
		}

		// Multiple text lines of the same cue are joined with a single space
		const openEntry = parsedTimeline[parsedTimeline.length - 1]
		openEntry.text = openEntry.text == '' ? trimmedLine : `${openEntry.text} ${trimmedLine}`
	}

	if (!removeMarkup) {
		return parsedTimeline
	}

	// Produce new entries with the markup removed from their text
	return parsedTimeline.map((entry) => {
		const textWithoutTags = entry.text.replaceAll(/<[^>]*>/g, '')
		const convertedText = htmlToText(textWithoutTags, { wordwrap: false })
		const normalizedText = convertedText.replaceAll(/\s+/g, ' ').trim()

		return { ...entry, text: normalizedText }
	})
}
// Serializes a timeline to subtitles text.
//
// timeline: the timeline to convert. It is deep-cloned first, so the caller's object is not
// modified by cue generation (which rewrites and annotates entries).
// subtitlesConfig: optional settings. They are layered over the base defaults and over the
// format-specific defaults (WebVTT when format is 'webvtt', otherwise SRT).
//
// Returns the complete subtitles content as a string.
// Throws an Error if the resolved mode is not one of the supported subtitle modes.
export function timelineToSubtitles(timeline: Timeline, subtitlesConfig?: SubtitlesConfig) {
	const workingTimeline = deepClone(timeline)

	// Resolve the effective configuration
	const userConfig = subtitlesConfig || {}
	const formatExtension = userConfig.format == 'webvtt' ? webVttConfigExtension : srtConfigExtension
	const config: SubtitlesConfig = { ...defaultSubtitlesBaseConfig, ...formatExtension, ...userConfig }

	const lineBreakString = config.lineBreakString
	const mode = config.mode

	// WebVTT content starts with a header block, followed by an empty line
	let outText = ''

	if (config.format == 'webvtt') {
		const languageLine = config.language ? `Language: ${config.language}${lineBreakString}` : ''

		outText = `WEBVTT${lineBreakString}Kind: captions${lineBreakString}${languageLine}${lineBreakString}`
	}

	// Generate the cues using the strategy matching the selected mode
	let cues: Cue[]

	switch (mode) {
		case 'segment':
		case 'sentence': {
			cues = getCuesFromTimeline_IsolateSegmentSentence(workingTimeline, config)
			break
		}

		case 'word':
		case 'phone':
		case 'word+phone': {
			cues = getCuesFromTimeline_IsolateWordPhone(workingTimeline, config)
			break
		}

		case 'line': {
			cues = getCuesFromTimeline_IsolateLines(workingTimeline, config)
			break
		}

		default: {
			throw new Error('Invalid subtitles mode.')
		}
	}

	// Add up to maxAddedDuration to each cue's end time, bounded by the start of the following cue.
	// The last cue is only adjusted when the total duration is known, which then bounds it.
	const addedDuration = config.maxAddedDuration!
	const modeAllowsAddedDuration = mode === 'segment' || mode === 'sentence' || mode === 'line'

	if (modeAllowsAddedDuration && addedDuration > 0 && cues.length > 0) {
		cues.forEach((cue, index) => {
			const followingCue = cues[index + 1]

			if (followingCue) {
				cue.endTime = Math.min(cue.endTime + addedDuration, followingCue.startTime)
			}
		})

		if (config.totalDuration != null) {
			const finalCue = cues[cues.length - 1]

			finalCue.endTime = Math.min(finalCue.endTime + addedDuration, config.totalDuration)
		}
	}

	// Append the serialized cues, numbered from 1
	const cuesText = cues.map((cue, index) => cueObjectToText(cue, index + 1, config)).join('')

	return outText + cuesText
}
// Generates subtitle cues from timeline. Ensures each segment or sentence starts in a new cue.
//
// timeline: a timeline of segments, sentences or words. A word timeline is wrapped in a
// single synthetic segment (its text being the words joined by spaces) and processed as such.
// config: resolved subtitles configuration. Reads mode, maxLineWidth, maxLineCount,
// minWordsInLine and separatePhrases.
//
// Entries whose text fits within maxLineWidth become a single one-line cue. Longer entries are
// split into lines at word boundaries, grouped into cues of up to maxLineCount lines.
// A longer entry that has no usable word timeline can't be split, so it is emitted as a
// single unsplit cue rather than being omitted from the output.
//
// Note: annotates word entries with character offsets and may replace `entry.timeline`,
// so callers should pass a timeline they own.
//
// Throws an Error if a word's text can't be located within its parent entry's text.
function getCuesFromTimeline_IsolateSegmentSentence(timeline: Timeline, config: SubtitlesConfig): Cue[] {
	if (timeline.length == 0) {
		return []
	}

	// If the given timeline is a word timeline, wrap it with a segment and call again
	if (timeline[0].type == 'word') {
		const wordTimeline = timeline.filter(entry => isWordOrSymbolWord(entry.text))

		// If no entry passed the filter there is nothing to show, and no first or last
		// word to take the segment's time range from
		if (wordTimeline.length == 0) {
			return []
		}

		const text = wordTimeline.map(entry => entry.text).join(' ')

		const segmentEntry: TimelineEntry = {
			type: 'segment',
			text: text,
			startTime: wordTimeline[0].startTime,
			endTime: wordTimeline[wordTimeline.length - 1].endTime,
			timeline: wordTimeline
		}

		return getCuesFromTimeline_IsolateSegmentSentence([segmentEntry], config)
	}

	const maxLineWidth = config.maxLineWidth!
	const cues: Cue[] = []

	// Generate one or more cues from each segment or sentence in the timeline.
	for (const entry of timeline) {
		if (entry.type == 'segment' && entry.timeline?.[0]?.type == 'sentence') {
			if (config.mode == 'segment') {
				// If the mode is 'segment', flatten all sentences to a single word timeline.
				// Sentences without a nested timeline contribute no words.
				entry.timeline = entry.timeline!.flatMap(sentenceEntry => sentenceEntry.timeline ?? [])
			} else {
				cues.push(...getCuesFromTimeline_IsolateSegmentSentence(entry.timeline!, config))

				continue
			}
		}

		const entryText = entry.text

		// Short enough to fit in one line: emit as is
		if (entryText.length <= maxLineWidth) {
			cues.push({
				lines: [entryText],
				startTime: entry.startTime,
				endTime: entry.endTime
			})

			continue
		}

		const wordTimeline = entry.timeline?.[0]?.type == 'word'
			? entry.timeline!.filter(wordEntry => isWordOrSymbolWord(wordEntry.text))
			: []

		// Without words there are no known positions to break the text at.
		// Emit the whole text as one cue so its content isn't lost.
		if (wordTimeline.length == 0) {
			cues.push({
				lines: [entryText],
				startTime: entry.startTime,
				endTime: entry.endTime
			})

			continue
		}

		// First, add word start and end offsets for all word entries.
		// Each search starts where the previous word ended, so repeated words resolve in order.
		let lastWordEndOffset = 0

		for (const wordEntry of wordTimeline) {
			const wordStartOffset = entryText.indexOf(wordEntry.text, lastWordEndOffset)

			if (wordStartOffset == -1) {
				throw new Error(`Couldn't find word '${wordEntry.text}' in its parent entry text`)
			}

			const wordEndOffset = wordStartOffset + wordEntry.text.length

			lastWordEndOffset = wordEndOffset

			wordEntry.startOffsetUtf16 = wordStartOffset
			wordEntry.endOffsetUtf16 = wordEndOffset
		}

		// Advances an end offset past any characters listed in charactersToWriteAhead that
		// directly follow it. An undefined offset (no such word) maps to the end of the text.
		const getExtendedEndOffset = (offset: number | undefined) => {
			if (offset == undefined) {
				return entryText.length
			}

			while (charactersToWriteAhead.includes(entryText[offset])) {
				offset += 1
			}

			return offset
		}

		// Add cues
		let currentCue: Cue = {
			lines: [],
			startTime: -1,
			endTime: -1
		}

		let lineStartWordOffset = 0
		let lineStartOffset = 0

		for (let wordIndex = 0; wordIndex < wordTimeline.length; wordIndex++) {
			const isLastWord = wordIndex == wordTimeline.length - 1

			const wordEntry = wordTimeline[wordIndex]
			const wordEndOffset = wordEntry.endOffsetUtf16!
			const wordExtendedEndOffset = getExtendedEndOffset(wordEndOffset)

			const nextWordEntry = wordTimeline[wordIndex + 1]
			const nextWordExtendedEndOffset = getExtendedEndOffset(nextWordEntry?.endOffsetUtf16)

			// Decide if to add to a new line
			const lineLength = wordExtendedEndOffset - lineStartOffset
			const lineLengthWithNextWord = nextWordExtendedEndOffset - lineStartOffset

			const wordsRemaining = wordTimeline.length - wordIndex - 1

			const lineLengthWithNextWordExceedsMaxLineWidth = lineLengthWithNextWord >= maxLineWidth
			const lineLengthExceedsHalfMaxLineWidth = lineLength >= maxLineWidth / 2
			const wordsRemainingAreEqualOrLessToMinimumWordsInLine = wordsRemaining <= config.minWordsInLine!
			const remainingTextExceedsMaxLineWidth = entryText.length - lineStartOffset > maxLineWidth
			const followingSubstringIsPhraseSeparator = phraseSeparatorRegExp.test(entryText.substring(wordEndOffset))

			// Break the line when:
			// - this is the last word, or
			// - adding the next word would reach the maximum width, or
			// - the rest of the text can't fit in this line anyway, the line is at least half full,
			//   and either few words remain or (optionally) a phrase separator follows
			const shouldAddNewLine =
				isLastWord ||
				lineLengthWithNextWordExceedsMaxLineWidth ||
				(remainingTextExceedsMaxLineWidth &&
					lineLengthExceedsHalfMaxLineWidth &&
					(wordsRemainingAreEqualOrLessToMinimumWordsInLine || (config.separatePhrases && followingSubstringIsPhraseSeparator)))

			if (!shouldAddNewLine) {
				continue
			}

			// Extend line end offset to end of sentence entry if last word encountered
			const lineEndOffset = isLastWord ? entryText.length : wordExtendedEndOffset

			// Get line text
			const lineText = entryText.substring(lineStartOffset, lineEndOffset)

			// Find start and end times of line.
			// A line lasts until the next word starts, or until the entry ends for the last line.
			const lineStartTime = wordTimeline[lineStartWordOffset].startTime
			const lineEndTime = isLastWord ? entry.endTime : wordTimeline[wordIndex + 1].startTime

			// Add new line to cue
			currentCue.lines.push(lineText)

			// Update cue start and end times
			if (currentCue.startTime == -1) {
				currentCue.startTime = lineStartTime
			}

			currentCue.endTime = lineEndTime

			// Finalize cue if needed
			if (isLastWord || currentCue.lines.length == config.maxLineCount) {
				cues.push(currentCue)

				currentCue = {
					lines: [],
					startTime: -1,
					endTime: -1
				}
			}

			// Update offsets
			lineStartOffset = lineEndOffset
			lineStartWordOffset = wordIndex + 1
		}
	}

	return cues
}
// Generates cues from timeline. Isolates words or phones in individual cues.
//
// The timeline is traversed depth-first: an included entry's cue is emitted before the cues of
// the entries nested in its `timeline`. Word entries are included in the 'word' and 'word+phone'
// modes, phone entries in the 'phone' and 'word+phone' modes. Each cue has a single line
// holding the entry's text, and uses the entry's start and end times.
function getCuesFromTimeline_IsolateWordPhone(timeline: Timeline, config: SubtitlesConfig) {
	const selectedMode = config.mode!

	const includeWords = selectedMode == 'word' || selectedMode == 'word+phone'
	const includePhones = selectedMode == 'phone' || selectedMode == 'word+phone'

	const collectedCues: Cue[] = []

	function collectFrom(entries: Timeline): void {
		for (const { type, text, startTime, endTime, timeline: nestedEntries } of entries) {
			const isIncludedWord = includeWords && type == 'word'
			const isIncludedPhone = includePhones && type == 'phone'

			if (isIncludedWord || isIncludedPhone) {
				collectedCues.push({ lines: [text], startTime, endTime })
			}

			// Descend into nested entries regardless of whether this entry was included
			if (nestedEntries) {
				collectFrom(nestedEntries)
			}
		}
	}

	collectFrom(timeline)

	return collectedCues
}
// Generates cues from timeline. Isolates lines in individual cues.
//
// Requires `config.originalText`. Word entries are located within it using their
// `startOffsetUtf16` property, and consecutive words are grouped for as long as they
// don't move to a later line. Each group becomes one cue containing the trimmed text of the
// line its first word is on, spanning from the first word's start time to the last word's end time.
//
// Throws an Error if the timeline is non-empty and `config.originalText` is not set.
function getCuesFromTimeline_IsolateLines(timeline: Timeline, config: SubtitlesConfig) {
	if (timeline.length == 0) {
		return []
	}

	const { originalText } = config

	if (originalText == null) {
		throw new Error(`'line' subtitles mode requires passing the original text in the 'originalText' property of the configuration object.`)
	}

	// The capture group keeps the line breaks as separate parts, so the parts together
	// cover every character of the text and character offsets stay aligned with it
	const textParts = originalText.split(/(\r?\n)/g)

	// Lookup table from a character offset to the index of the part containing it
	const partIndexAtOffset: number[] = textParts.flatMap(
		(part, partIndex) => new Array<number>(part.length).fill(partIndex))

	// Collect word entries in depth-first order. Entries nested within a word are not visited.
	const orderedWords: Timeline = []

	function gatherWords(entries: Timeline): void {
		for (const entry of entries) {
			if (entry.type == 'word') {
				orderedWords.push(entry)
			} else if (entry.timeline) {
				gatherWords(entry.timeline)
			}
		}
	}

	gatherWords(timeline)

	// Group the words by line, emitting a cue whenever a group is completed
	const cues: Cue[] = []
	let wordsOfCurrentCue: Timeline = []

	const emitCurrentCue = () => {
		if (wordsOfCurrentCue.length == 0) {
			return
		}

		const firstWord = wordsOfCurrentCue[0]
		const lastWord = wordsOfCurrentCue[wordsOfCurrentCue.length - 1]

		const lineText = textParts[partIndexAtOffset[firstWord.startOffsetUtf16!]].trim()

		cues.push({
			lines: [lineText],
			startTime: firstWord.startTime,
			endTime: lastWord.endTime
		})

		wordsOfCurrentCue = []
	}

	for (const wordEntry of orderedWords) {
		const precedingWord = wordsOfCurrentCue[wordsOfCurrentCue.length - 1]

		if (precedingWord) {
			const wordPartIndex = partIndexAtOffset[wordEntry.startOffsetUtf16!]
			const precedingWordPartIndex = partIndexAtOffset[precedingWord.startOffsetUtf16!]

			// The word is on a later line than the one before it: close the current cue
			if (wordPartIndex > precedingWordPartIndex) {
				emitCurrentCue()
			}
		}

		wordsOfCurrentCue.push(wordEntry)
	}

	// Add any remaining words
	emitCurrentCue()

	return cues
}
// Attempts to parse a time range that includes hours fields from the start of the given line.
//
// The pattern's eight captured numbers are, in order: start hours, minutes, seconds and
// fraction, followed by the same four for the end time. The fraction is interpreted as a
// count of milliseconds.
//
// Returns `{ startTime, endTime, succeeded }` with times in seconds. When the line doesn't
// match, `succeeded` is false and both times are -1.
export function tryParseTimeRangePatternWithHours(line: string) {
	const match = timeRangeWithHoursRegExp.exec(line)

	if (match == null) {
		return { startTime: -1, endTime: -1, succeeded: false }
	}

	const [
		startHours, startMinutes, startSeconds, startMilliseconds,
		endHours, endMinutes, endSeconds, endMilliseconds,
	] = match.slice(1, 9).map(capturedText => parseInt(capturedText))

	const toSeconds = (hours: number, minutes: number, seconds: number, milliseconds: number) =>
		(milliseconds / 1000) + (seconds) + (minutes * 60) + (hours * 60 * 60)

	const startTime = toSeconds(startHours, startMinutes, startSeconds, startMilliseconds)
	const endTime = toSeconds(endHours, endMinutes, endSeconds, endMilliseconds)

	return { startTime, endTime, succeeded: true }
}
// Attempts to parse a time range that has no hours fields from the start of the given line.
//
// The pattern's six captured numbers are, in order: start minutes, seconds and fraction,
// followed by the same three for the end time. The fraction is interpreted as a count
// of milliseconds.
//
// Returns `{ startTime, endTime, succeeded }` with times in seconds. When the line doesn't
// match, `succeeded` is false and both times are -1.
export function tryParseTimeRangePatternWithoutHours(line: string) {
	const match = timeRangeWithoutHoursRegExp.exec(line)

	if (match == null) {
		return { startTime: -1, endTime: -1, succeeded: false }
	}

	const [
		startMinutes, startSeconds, startMilliseconds,
		endMinutes, endSeconds, endMilliseconds,
	] = match.slice(1, 7).map(capturedText => parseInt(capturedText))

	const toSeconds = (minutes: number, seconds: number, milliseconds: number) =>
		(milliseconds / 1000) + (seconds) + (minutes * 60)

	const startTime = toSeconds(startMinutes, startSeconds, startMilliseconds)
	const endTime = toSeconds(endMinutes, endSeconds, endMilliseconds)

	return { startTime, endTime, succeeded: true }
}
// Serializes a single cue to its textual form.
//
// cue: the cue to write. Must have at least one line.
// cueIndex: the number written before the cue when `config.includeCueIndexes` is set.
// config: resolved subtitles configuration. Reads lineBreakString, includeCueIndexes,
// includeHours and decimalSeparator.
//
// The result is made of an optional index line, the time range line, the cue's trimmed
// lines, and an empty line terminating the cue.
//
// Throws an Error if the cue is missing or has no lines.
function cueObjectToText(cue: Cue, cueIndex: number, config: SubtitlesConfig) {
	const cueHasLines = cue && cue.lines && cue.lines.length != 0

	if (!cueHasLines) {
		throw new Error(`Cue is empty`)
	}

	const lineBreakString = config.lineBreakString

	// Choose the timestamp layout: with an hours field, or minutes and seconds only
	const formatTime = config.includeHours == true
		? (timeInSeconds: number) => formatHMS(secondsToHMS(timeInSeconds), config.decimalSeparator)
		: (timeInSeconds: number) => formatMS(secondsToMS(timeInSeconds), config.decimalSeparator)

	const formattedStartTime: string = formatTime(cue.startTime)
	const formattedEndTime: string = formatTime(cue.endTime)

	const indexLine = config.includeCueIndexes ? `${cueIndex}${lineBreakString}` : ''
	const timeRangeLine = `${formattedStartTime} --> ${formattedEndTime}${lineBreakString}`
	const cueText = cue.lines.map(line => line.trim()).join(lineBreakString)

	return `${indexLine}${timeRangeLine}${cueText}${lineBreakString}${lineBreakString}`
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// Patterns
////////////////////////////////////////////////////////////////////////////////////////////////////////

// Time range that includes hours fields, like `00:01:02.345 --> 00:01:04,000`.
//
// Anchored to the start of the input only, so extra content after the end time is allowed.
// Either '.' or ',' is accepted before the fraction, and any number of spaces may surround
// the arrow. Every numeric field is one or more digits and is captured, giving 8 groups:
// start hours, minutes, seconds, fraction, then end hours, minutes, seconds, fraction.
const timeRangeWithHoursRegExp = buildRegExp([
inputStart,
capture(oneOrMore(digit)),
':',
capture(oneOrMore(digit)),
':',
capture(oneOrMore(digit)),
anyOf('.', ','),
capture(oneOrMore(digit)),
zeroOrMore(' '),
'-->',
zeroOrMore(' '),
capture(oneOrMore(digit)),
':',
capture(oneOrMore(digit)),
':',
capture(oneOrMore(digit)),
anyOf('.', ','),
capture(oneOrMore(digit)),
])

// Time range without hours fields, like `01:02.345 --> 01:04.000`.
//
// Same structure as the pattern above, minus the hours, giving 6 captured groups:
// start minutes, seconds, fraction, then end minutes, seconds, fraction.
const timeRangeWithoutHoursRegExp = buildRegExp([
inputStart,
capture(oneOrMore(digit)),
':',
capture(oneOrMore(digit)),
anyOf('.', ','),
capture(oneOrMore(digit)),
zeroOrMore(' '),
'-->',
zeroOrMore(' '),
capture(oneOrMore(digit)),
':',
capture(oneOrMore(digit)),
anyOf('.', ','),
capture(oneOrMore(digit)),
])

// Character sequences that mark a phrase boundary. Some are multi-character, pairing a
// closing bracket or quote with a following comma.
const phraseSeparatorCharacters = [',', ',', '、', ';', ':', '),', '",', '”,']

// Matches one of the phrase separators at the very start of the tested string.
// It is tested against the text that directly follows a word, to decide if a line break
// after that word would fall on a phrase boundary.
// The separator must not be preceded or followed by a decimal digit
// (presumably to avoid breaking within numbers like `1,000` - TODO confirm).
const phraseSeparatorRegExp = buildRegExp(
matches([
inputStart,
anyOf(...phraseSeparatorCharacters)
], {
ifPrecededBy: notUnicodeProperty('Decimal_Number'),
ifFollowedBy: notUnicodeProperty('Decimal_Number'),
})
)
////////////////////////////////////////////////////////////////////////////////////////////////////////
// Types
////////////////////////////////////////////////////////////////////////////////////////////////////////

// A single subtitle cue: one or more lines of text shown over a time range.
export type Cue = {
// Text lines of the cue. Each is trimmed when the cue is written.
lines: string[]
// Start and end times, in seconds
startTime: number
endTime: number
}

// Determines which timeline units are isolated into their own cues.
export type SubtitlesMode = 'line' | 'segment' | 'sentence' | 'word' | 'phone' | 'word+phone'

// Options for generating subtitles from a timeline. All are optional: unset options take their
// values from the base defaults and from the defaults of the selected format.
export interface SubtitlesConfig {
// Output format. Anything other than 'webvtt' selects the SRT defaults.
format?: 'srt' | 'webvtt'
// When set, written as a `Language:` line in the WebVTT header. Not used for SRT.
language?: string
// Cue generation mode
mode?: SubtitlesMode
// Number of lines after which a cue is finalized (used in 'segment' and 'sentence' modes)
maxLineCount?: number
// Text longer than this number of characters is split to multiple lines
// (used in 'segment' and 'sentence' modes)
maxLineWidth?: number
// When this number of words or fewer remain after a word, a line break may be added early
minWordsInLine?: number
// Allow adding a line break early when a phrase separator follows a word
separatePhrases?: boolean
// Maximum time, in seconds, added to the end of each cue. The result never exceeds the start
// time of the following cue. Applied in 'segment', 'sentence' and 'line' modes only.
maxAddedDuration?: number
// Separator between the seconds and their fraction in written timestamps
decimalSeparator?: ',' | '.'
// Write a running cue number, starting at 1, before each cue
includeCueIndexes?: boolean
// Include an hours field in written timestamps
includeHours?: boolean
// Line break sequence used in the output
lineBreakString?: '\n' | '\r\n'
// The original text the timeline's word offsets refer to. Required in 'line' mode.
originalText?: string
// Total duration, in seconds. When set, bounds the time added to the end of the last cue,
// which otherwise isn't extended.
totalDuration?: number
}
// Defaults shared by all formats. Applied first, then overridden by the format-specific
// defaults, and finally by the options the caller provided.
export const defaultSubtitlesBaseConfig: SubtitlesConfig = {
format: 'srt',
mode: 'sentence',
maxLineCount: 2,
maxLineWidth: 42,
minWordsInLine: 4,
separatePhrases: true,
maxAddedDuration: 3.0,
}

// Defaults applied for the SRT format: comma decimal separator and numbered cues.
export const srtConfigExtension: SubtitlesConfig = {
decimalSeparator: ',',
includeCueIndexes: true,
includeHours: true,
lineBreakString: '\n',
}

// Defaults applied for the WebVTT format: period decimal separator and no cue numbers.
export const webVttConfigExtension: SubtitlesConfig = {
decimalSeparator: '.',
includeCueIndexes: false,
includeHours: true,
lineBreakString: '\n',
}