echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
169 lines (129 loc) • 5.14 kB
text/typescript
import * as EspeakTTS from '../synthesis/EspeakTTS.js'
import { logToStderr } from '../utilities/Utilities.js'
import * as Segmentation from './Segmentation.js'
const log = logToStderr
export async function phonemizeSentence(sentence: string, espeakVoice: string, substitutionMap?: Map<string, string[]>, useIpa = true) {
const ipaString = await EspeakTTS.textToPhonemes(sentence, espeakVoice, useIpa)
const phraseStrings = ipaString.split(' | ')
const phrases: string[][][] = []
for (let phraseIndex = 0; phraseIndex < phraseStrings.length; phraseIndex++) {
const phraseString = phraseStrings[phraseIndex]
const wordStrings = phraseString.trim().split(/ +/g)
const words: string[][] = []
for (let wordIndex = 0; wordIndex < wordStrings.length; wordIndex++) {
const word = wordStrings[wordIndex]
let wordPhonemes = word.split('_')
wordPhonemes = wordPhonemes.flatMap(phoneme => {
if (!phoneme || phoneme.startsWith('(')) {
return []
} else if (phoneme.startsWith(`ˈ`) || phoneme.startsWith(`ˌ`)) {
return [phoneme[0], phoneme.substring(1)]
} else if (phoneme.endsWith(`ˈ`) || phoneme.endsWith(`ˌ`)) {
return [phoneme.substring(0, phoneme.length - 1), phoneme[phoneme.length - 1]]
} else {
return substitutionMap?.get(phoneme) || [phoneme]
}
})
if (wordPhonemes.length > 0) {
words.push(wordPhonemes)
}
}
if (words.length > 0) {
phrases.push(words)
}
}
return phrases
}
export async function phonemizeText(text: string, voice: string, substitutionMap?: Map<string, string[]>) {
text = text
.replaceAll(',', ',')
.replaceAll('、', ',')
.replaceAll('。', '.')
.replaceAll('(', ', ')
.replaceAll(')', ', ')
.replaceAll('«', ', ')
.replaceAll('»', ', ')
const segmentedText = await Segmentation.parseTextAndConvertToFragmentObjects(text, voice)
const preparedPhrases: string[] = []
const phraseBreakers: string[] = []
for (const sentence of segmentedText) {
for (const phrase of sentence.phrases) {
const words = phrase.words.filter(wordObject => Segmentation.isWordOrSymbolWord(wordObject.text))
const preparedPhraseText = words.map(word => word.text.replace(/\./g, ' ')).join(' ')
preparedPhrases.push(preparedPhraseText)
const trimmedPhraseText = phrase.text.trim()
const lastChar = trimmedPhraseText[trimmedPhraseText.length - 1]
if (phrase.isSentenceFinalizer) {
if (trimmedPhraseText.endsWith('?') || trimmedPhraseText.endsWith(`?"`)) {
phraseBreakers.push('?')
} else if (trimmedPhraseText.endsWith('!') || trimmedPhraseText.endsWith(`!"`)) {
phraseBreakers.push('!')
} else {
phraseBreakers.push('.')
}
} else {
if (lastChar == ':' || lastChar == ';') {
phraseBreakers.push(lastChar)
} else {
phraseBreakers.push(',')
}
}
}
}
return phonemizePhrases(preparedPhrases, voice, phraseBreakers, substitutionMap)
}
export async function phonemizePhrases(phrases: string[], voice: string, phraseBreakers: string[], substitutionMap?: Map<string, string[]>) {
if (phrases.length == 0) {
return []
}
const preparedText = phrases.join('\n\n') // filter(phrase => phrase.trim().length > 0)
const ipaString = await EspeakTTS.textToIPA(preparedText, voice)
const ipaLines = ipaString.split('\n')
const phonemeLines = ipaLines.map(line => {
line = line.replace(/_+/g, '_').replace(/ +/g, ' ')
return line.split(' ').map(word => {
word = word.replaceAll('_', ' ').trim()
let wordPhonemes = word.split(' ')
wordPhonemes = wordPhonemes.flatMap(phoneme => {
if (!phoneme || phoneme.startsWith('(')) {
return []
} else if (phoneme.startsWith('ˈ') || phoneme.startsWith('ˌ')) {
return [phoneme[0], phoneme.substring(1)]
} else if (phoneme.endsWith('ˈ') || phoneme.endsWith('ˌ')) {
return [phoneme.substring(0, phoneme.length - 1), phoneme[phoneme.length - 1]]
} else {
return [phoneme]
}
})
if (substitutionMap) {
wordPhonemes = wordPhonemes.flatMap(phoneme => substitutionMap.get(phoneme) || [phoneme])
}
return wordPhonemes
})
})
if (ipaLines.length != phraseBreakers.length) {
log(phrases)
log(ipaLines)
log(phraseBreakers)
throw new Error(`Unexpected: IPA lines count (${ipaLines.length}) is not equal to phrase breakers count (${phraseBreakers.length})`)
}
for (let i = 0; i < phonemeLines.length; i++) {
const line = phonemeLines[i]
const lastWordInLine = line[line.length - 1]
lastWordInLine.push(phraseBreakers[i])
}
return phonemeLines
}
export function phonemizedPhrasesToSentences(phonemizedPhrases: string[][][]) {
let phonemizedSentences: string[][][] = [[]]
for (const phonemizedPhrase of phonemizedPhrases) {
phonemizedSentences[phonemizedSentences.length - 1].push(...phonemizedPhrase)
const lastWord = phonemizedPhrase[phonemizedPhrase.length - 1]
const lastPhoneme = lastWord[lastWord.length - 1]
if (['.', '?', '!'].includes(lastPhoneme)) {
phonemizedSentences.push([])
}
}
phonemizedSentences = phonemizedSentences.filter(entry => entry.length > 0)
return phonemizedSentences
}