echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
215 lines (157 loc) • 5.59 kB
text/typescript
import { TypedArray } from '../typings/TypedArray.js'
export function includesAnyOf(str: string, substrings: string[]) {
return indexOfAnyOf(str, substrings) >= 0
}
export function indexOfAnyOf(str: string, substrings: string[]) {
for (const substring of substrings) {
const index = str.indexOf(substring)
if (index >= 0) {
return index
}
}
return -1
}
export function startsWithAnyOf(str: string, prefixes: string[]) {
for (const prefix of prefixes) {
if (str.startsWith(prefix)) {
return true
}
}
return false
}
export function formatHMS(timeHMS: { hours: number, minutes: number, seconds: number, milliseconds: number }, decimalSeparator = '.') {
return `${formatIntegerWithLeadingZeros(timeHMS.hours, 2)}:${formatIntegerWithLeadingZeros(timeHMS.minutes, 2)}:${formatIntegerWithLeadingZeros(timeHMS.seconds, 2)}${decimalSeparator}${formatIntegerWithLeadingZeros(timeHMS.milliseconds, 3)}`
}
export function formatMS(timeMS: { minutes: number, seconds: number, milliseconds: number }, decimalSeparator = '.') {
return `${formatIntegerWithLeadingZeros(timeMS.minutes, 2)}:${formatIntegerWithLeadingZeros(timeMS.seconds, 2)}${decimalSeparator}${formatIntegerWithLeadingZeros(timeMS.milliseconds, 3)}`
}
export function formatIntegerWithLeadingZeros(num: number, minDigitCount: number) {
num = Math.floor(num)
let numAsString = `${num}`
while (numAsString.length < minDigitCount) {
numAsString = `0${numAsString}`
}
return numAsString
}
export function formatListWithQuotedElements(strings: string[], quoteSymbol = `'`) {
return strings.map(str => `${quoteSymbol}${str}${quoteSymbol}`).join(', ')
}
export function getUTF32Chars(str: string) {
const utf32chars: string[] = []
const utf16To32Mapping: number[] = []
let utf32Index = 0
for (const utf32char of str) {
utf32chars.push(utf32char)
for (let i = 0; i < utf32char.length; i++) {
utf16To32Mapping.push(utf32Index)
}
utf32Index += 1
}
utf16To32Mapping.push(utf32Index)
return { utf32chars, utf16To32Mapping }
}
export function containsInvalidCodepoint(str: string) {
for (const char of str) {
if (char.codePointAt(0) === 65533) {
return true
}
}
return false
}
export function splitAndPreserveSeparators(text: string, separatorRegex: RegExp): string[] {
if (!separatorRegex.flags.includes('g')) {
throw new Error('Separator regular expression must have a global flag')
}
// Use the match method to find all matches for the separators
const matches = text.match(separatorRegex)
// If no matches are found, return the original text as a single element array
if (!matches) {
return [text]
}
// Initialize the result array
const result: string[] = []
// Initialize the start position
let lastIndex = 0
// Iterate through the matches
matches.forEach(match => {
// Get the index of the current match
const matchIndex = text.indexOf(match, lastIndex)
// Add the substring before the match to the result, joined with the match itself
result.push(text.substring(lastIndex, matchIndex) + match)
// Update the last index to the end of the current match
lastIndex = matchIndex + match.length
})
// Add the remaining substring after the last match to the result
{
const remainingText = text.substring(lastIndex)
if (remainingText.length > 0) {
result.push(remainingText)
}
}
return result
}
export function getTokenRepetitionScore(tokens: string[] | number[]) {
const maxCycleLength = Math.floor(tokens.length / 2)
const matchLengthForCycleLength: number[] = [0]
for (let cycleLength = 1; cycleLength <= maxCycleLength; cycleLength++) {
let matchCount = 0
for (let leftIndex = cycleLength; leftIndex < tokens.length; leftIndex++) {
const referenceIndex = leftIndex - cycleLength
if (tokens[leftIndex] !== tokens[referenceIndex]) {
break
}
matchCount += 1
}
const score = matchCount
matchLengthForCycleLength.push(score)
}
let longestMatchLength = -Infinity
let longestCycleRepetitionCount = -Infinity
for (let i = 1; i <= matchLengthForCycleLength.length; i++) {
const matchLength = matchLengthForCycleLength[i]
if (matchLength > longestMatchLength) {
longestMatchLength = matchLength
}
const cycleCount = (matchLength / i) + 1
if (cycleCount > longestCycleRepetitionCount) {
longestCycleRepetitionCount = cycleCount
}
}
return { longestMatchLength, longestCycleRepetitionCount }
}
export async function convertHtmlToText(html: string) {
const { htmlToText } = await import('html-to-text')
const text = htmlToText(html, {
wordwrap: false,
selectors: [
{ selector: 'a', options: { ignoreHref: true } },
{ selector: 'img', format: 'skip' },
{ selector: 'h1', options: { uppercase: false } },
{ selector: 'h2', options: { uppercase: false } },
{ selector: 'h3', options: { uppercase: false } },
{ selector: 'h4', options: { uppercase: false } },
{ selector: 'table', options: { uppercaseHeaderCells: false } }
]
})
return text || ''
}
export function substituteCharactersUsingLookup(text: string, substitutionLookup: Record<string, string>) {
let resultText = ''
for (const char of text) {
const substitution = substitutionLookup[char]
if (substitution !== undefined) {
resultText += substitution
} else {
resultText += char
}
}
return resultText
}
export function substituteStringUsingLookup(text: string, substitutionLookup: Record<string, string>) {
const substitution = substitutionLookup[text]
if (substitution !== undefined) {
return substitution
} else {
return text
}
}