echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
222 lines • 9.6 kB
JavaScript
import { sumArray, logToStderr } from '../utilities/Utilities.js';
import { getShortLanguageCode } from '../utilities/Locale.js';
import { includesAnyOf, splitAndPreserveSeparators } from '../utilities/StringUtilities.js';
import * as TextSegmentation from '@echogarden/text-segmentation';
import { splitChineseTextToWords_Jieba } from './ChineseSegmentation.js';
import { splitJapaneseTextToWords_Kuromoji } from './JapaneseSegmentation.js';
// Short alias for the `logToStderr` utility imported from '../utilities/Utilities.js'.
const log = logToStderr;
/** Matches a single Unicode letter or number. Text containing such a character is treated as a word. */
export const wordCharacterRegExp = /[\p{Letter}\p{Number}]/u;
// See: https://mathiasbynens.be/notes/es-unicode-property-escapes
/**
 * Matches one emoji: a modifier base with an optional modifier, a character with
 * default emoji presentation, or an emoji character followed by the U+FE0F
 * variation selector.
 */
export const emojiSequenceRegExp = /\p{Emoji_Modifier_Base}\p{Emoji_Modifier}?|\p{Emoji_Presentation}|\p{Emoji}\uFE0F/u;
/** Matches a single Unicode punctuation character. */
export const punctuationRegExp = /[\p{Punctuation}]/u;
/**
 * Characters that separate phrases inside a sentence (consulted by `Word.isPhraseSeperator`).
 *
 * Each ASCII separator is followed by its counterpart from another script. The
 * fullwidth comma, semicolon and colon are written as escapes (U+FF0C, U+FF1B,
 * U+FF1A) so they cannot be confused with, or silently collapsed into, their
 * ASCII look-alikes, which would leave duplicate entries in the list.
 */
export const phraseSeparators = [',', '、', '\uFF0C', '،', ';', '\uFF1B', ':', '\uFF1A', '—'];
/** Standalone symbols that are treated as words rather than as punctuation. */
export const symbolWords = ['$', '€', '¢', '£', '¥', '©', '®', '™', '%', '&', '#', '~', '@', '+', '±', '÷', '/', '\\', '^', '*', '×', '=', '¼', '½', '¾'];
///////////////////////////////////////////////////////////////////////////////////////////////
// Predicates
///////////////////////////////////////////////////////////////////////////////////////////////
/**
 * Tests whether a string carries content of its own, i.e. it is a word,
 * contains an emoji, or is one of the standalone symbol words.
 *
 * @param {string | null | undefined} str - Text to classify.
 * @returns {boolean} `true` for words, emoji and symbol words; `false` for
 *   anything else, including `null` and `undefined`.
 */
export function isWordOrSymbolWord(str) {
    // A missing value is neither a word nor a symbol. Without this guard the
    // answer would depend on how the individual predicates coerce `undefined`.
    if (str == null) {
        return false;
    }
    // Delegate to isSymbolWord so that surrounding whitespace is ignored for
    // symbols in the same way it is for words and emoji.
    return isWord(str) || includesEmoji(str) || isSymbolWord(str);
}
/**
 * Tests whether a string, ignoring surrounding whitespace, is exactly one of
 * the entries of `symbolWords`.
 *
 * @param {string | null | undefined} str - Text to look up.
 * @returns {boolean} `true` when the trimmed text is a listed symbol.
 */
export function isSymbolWord(str) {
    const candidate = str?.trim();
    return symbolWords.includes(candidate);
}
/**
 * Tests whether a string contains at least one Unicode letter or number.
 *
 * @param {string | null | undefined} str - Text to classify.
 * @returns {boolean} `true` when a word character is present; `false`
 *   otherwise, including for `null` and `undefined`.
 */
export function isWord(str) {
    // RegExp.prototype.test coerces its argument to a string, so passing
    // `undefined` would test the text "undefined" and report a word.
    if (str == null) {
        return false;
    }
    return wordCharacterRegExp.test(str.trim());
}
/**
 * Tests whether a string contains at least one Unicode punctuation character.
 *
 * @param {string | null | undefined} str - Text to inspect.
 * @returns {boolean} `true` when `punctuationRegExp` matches the trimmed text.
 */
export function includesPunctuation(str) {
    const trimmedText = str?.trim();
    return punctuationRegExp.test(trimmedText);
}
/**
 * Tests whether a string contains at least one emoji, as recognized by
 * `emojiSequenceRegExp`.
 *
 * @param {string | null | undefined} str - Text to inspect.
 * @returns {boolean} `true` when the trimmed text contains an emoji.
 */
export function includesEmoji(str) {
    const trimmedText = str?.trim();
    return emojiSequenceRegExp.test(trimmedText);
}
/**
 * Tests whether a string is empty or consists only of whitespace.
 *
 * @param {string | null | undefined} str - Text to inspect.
 * @returns {boolean} `true` when nothing remains after trimming; `false` for
 *   any other text and for `null` / `undefined`.
 */
export function isWhitespace(str) {
    if (str == null) {
        return false;
    }
    const remaining = str.trim();
    return remaining.length === 0;
}
///////////////////////////////////////////////////////////////////////////////////////////////
// Paragraph, line, sentence, phrase, and word segmentation
///////////////////////////////////////////////////////////////////////////////////////////////
/**
 * Splits text into paragraphs and applies whitespace processing to each one.
 *
 * @param {string} text - Text to split.
 * @param {'single' | 'double'} paragraphBreakType - 'single' breaks on any run
 *   of line breaks; 'double' breaks only on two or more consecutive line breaks.
 * @param {string} whitespaceProcessingMethod - Forwarded to
 *   `applyWhitespaceProcessing` for every paragraph.
 * @returns {string[]} The processed paragraphs, with empty strings removed.
 * @throws {Error} When `paragraphBreakType` is not a recognized value.
 */
export function splitToParagraphs(text, paragraphBreakType, whitespaceProcessingMethod) {
    let paragraphBreakPattern;
    switch (paragraphBreakType) {
        case 'single':
            paragraphBreakPattern = /(\r?\n)+/g;
            break;
        case 'double':
            paragraphBreakPattern = /(\r?\n)(\r?\n)+/g;
            break;
        default:
            throw new Error(`Invalid paragraph break type: '${paragraphBreakType}'`);
    }
    return splitAndPreserveSeparators(text, paragraphBreakPattern)
        .map((paragraph) => applyWhitespaceProcessing(paragraph, whitespaceProcessingMethod))
        .filter((paragraph) => paragraph.length > 0);
}
/**
 * Splits text at line breaks (`\n` or `\r\n`) using `splitAndPreserveSeparators`.
 *
 * @param {string} text - Text to split.
 * @returns The result of `splitAndPreserveSeparators` for the line-break pattern.
 */
export function splitToLines(text) {
    const lineBreakPattern = /\r?\n/g;
    return splitAndPreserveSeparators(text, lineBreakPattern);
}
/**
 * Splits text to words and then segments the resulting word sequence with
 * `TextSegmentation.segmentWordSequence`.
 *
 * @param {string} text - Text to parse.
 * @param {string} [langCode] - Language code; reduced to its short form before
 *   use. A missing value is treated as an empty string.
 * @returns {Promise<*>} The value produced by `TextSegmentation.segmentWordSequence`.
 */
export async function parseText(text, langCode) {
    const normalizedLangCode = getShortLanguageCode(langCode || '');
    const words = await splitToWords(text, normalizedLangCode);
    return await TextSegmentation.segmentWordSequence(words);
}
/**
 * Splits text into a word sequence.
 *
 * Chinese ('zh', 'cmn') is tokenized with `splitChineseTextToWords_Jieba` and
 * Japanese ('ja') with `splitJapaneseTextToWords_Kuromoji`; every other
 * language is delegated to `TextSegmentation.splitToWords`.
 *
 * @param {string} text - Text to split.
 * @param {string} [langCode] - Language code. Its short form selects the
 *   tokenizer; the original value is what gets forwarded to `TextSegmentation`.
 * @returns {Promise<*>} A `TextSegmentation.WordSequence` for Chinese and
 *   Japanese, otherwise whatever `TextSegmentation.splitToWords` returns.
 */
export async function splitToWords(text, langCode) {
    const shortLangCode = getShortLanguageCode(langCode || '');
    const isChinese = shortLangCode === 'zh' || shortLangCode === 'cmn';
    const isJapanese = shortLangCode === 'ja';
    if (!isChinese && !isJapanese) {
        return TextSegmentation.splitToWords(text, { language: langCode });
    }
    const tokens = isChinese
        ? await splitChineseTextToWords_Jieba(text)
        : await splitJapaneseTextToWords_Kuromoji(text);
    const wordSequence = new TextSegmentation.WordSequence();
    // Offsets are accumulated from token lengths.
    // NOTE(review): this assumes the tokens concatenate back to `text` - confirm against the tokenizers.
    let position = 0;
    for (const token of tokens) {
        const nextPosition = position + token.length;
        // Anything that is not a word, emoji or symbol word is flagged as punctuation.
        const isPunctuation = !isWordOrSymbolWord(token);
        wordSequence.addWord(token, position, isPunctuation);
        position = nextPosition;
    }
    return wordSequence;
}
/**
 * Normalizes the whitespace of a piece of text.
 *
 * @param {string} text - Text to process.
 * @param {'removeLineBreaks' | 'collapse' | 'preserve'} whitespaceProcessingMethod
 *   - 'removeLineBreaks': trim, then replace each run of line breaks with one space.
 *   - 'collapse': trim, then replace each run of any whitespace with one space.
 *   - 'preserve': return the text unchanged.
 * @returns {string} The processed text.
 * @throws {Error} When the method is not one of the values above.
 */
export function applyWhitespaceProcessing(text, whitespaceProcessingMethod) {
    switch (whitespaceProcessingMethod) {
        case 'removeLineBreaks':
            return text.trim().replaceAll(/(\r?\n)+/g, ' ');
        case 'collapse':
            return text.trim().replaceAll(/\s+/g, ' ');
        case 'preserve':
            return text;
        default:
            throw new Error(`Invalid whitespace processing method: '${whitespaceProcessingMethod}'`);
    }
}
///////////////////////////////////////////////////////////////////////////////////////////////
// Fragment segmentation
//
// Used to split text to fragments, to fit particular size constraints.
///////////////////////////////////////////////////////////////////////////////////////////////
/**
 * Splits text into fragments whose length does not exceed `maxFragmentLength`.
 *
 * Segments are packed greedily in order. A sentence that does not fit in the
 * current fragment is broken down into phrases, and a phrase that does not fit
 * is broken down into words. Lengths are the `length` values of the segment
 * objects.
 *
 * @param {string} text - Text to split.
 * @param {number} maxFragmentLength - Maximum length of a single fragment.
 * @param {string} [langCode] - Language code forwarded to the text parser.
 * @param {boolean} [preserveSentences=true] - When set, a sentence that does
 *   not fit is first retried in a fresh fragment before it is broken down.
 * @param {boolean} [preservePhrases=true] - Same as above, for phrases.
 * @returns {Promise<Fragment[]>} The non-empty fragments, in text order.
 * @throws {Error} When a single word is longer than `maxFragmentLength`.
 */
export async function splitToFragments(text, maxFragmentLength, langCode, preserveSentences = true, preservePhrases = true) {
    const sentences = await parseTextAndConvertToFragmentObjects(text, langCode);
    const fragments = [];
    let currentFragment = new Fragment();

    // Closes the current fragment and opens a new one, unless it is still empty.
    const startNewFragmentIfNeeded = () => {
        if (currentFragment.isNonempty) {
            fragments.push(currentFragment);
            currentFragment = new Fragment();
        }
    };

    const fitsCurrentFragment = (segment) => segment.length <= maxFragmentLength - currentFragment.length;

    // Adds the segment to the current fragment if it fits. When it does not and
    // `mayStartNewFragment` is set, retries once in a fresh fragment.
    // Returns whether the segment was placed.
    const tryAppend = (segment, mayStartNewFragment) => {
        if (!fitsCurrentFragment(segment)) {
            if (!mayStartNewFragment) {
                return false;
            }
            startNewFragmentIfNeeded();
            if (!fitsCurrentFragment(segment)) {
                return false;
            }
        }
        currentFragment.segments.push(segment);
        return true;
    };

    for (const sentence of sentences) {
        if (tryAppend(sentence, preserveSentences)) {
            continue;
        }
        for (const phrase of sentence.phrases) {
            if (tryAppend(phrase, preservePhrases)) {
                continue;
            }
            for (const word of phrase.words) {
                // Words are the smallest unit and are never broken down further.
                if (!tryAppend(word, true)) {
                    throw new Error(`Encountered a word of length ${word.length}, which exceeds the maximum fragment length of ${maxFragmentLength}`);
                }
            }
        }
    }
    startNewFragmentIfNeeded();
    return fragments;
}
/**
 * Parses text and converts the segmentation result into `Sentence`, `Phrase`
 * and `Word` objects.
 *
 * Phrases without words are dropped. A sentence is always kept, even when it
 * ends up with no phrases.
 *
 * @param {string} text - Text to parse.
 * @param {string} [langCode] - Language code forwarded to `parseText`.
 * @returns {Promise<Sentence[]>} One `Sentence` per parsed sentence entry.
 */
export async function parseTextAndConvertToFragmentObjects(text, langCode) {
    const { sentences: sentenceEntries } = await parseText(text, langCode);
    const convertedSentences = [];
    for (const sentenceEntry of sentenceEntries) {
        const sentence = new Sentence();
        for (const phraseEntry of sentenceEntry.phrases) {
            const phrase = new Phrase();
            for (const wordEntry of phraseEntry.words.entries) {
                // The word that closes the sentence is identified by object identity.
                const closesSentence = wordEntry === sentenceEntry.words.lastEntry;
                phrase.words.push(new Word(wordEntry.text, closesSentence));
            }
            const hasWords = phrase.words.length > 0;
            if (hasWords) {
                sentence.phrases.push(phrase);
            }
        }
        convertedSentences.push(sentence);
    }
    return convertedSentences;
}
/** A sentence, made up of an ordered list of phrases. */
export class Sentence {
    /** Phrases of the sentence, in text order. */
    phrases = [];
    /** A whole sentence always ends a sentence. */
    isSentenceFinalizer = true;

    /** Combined length of all phrases. */
    get length() {
        return sumArray(this.phrases, (item) => item.length);
    }

    /** Concatenated text of all phrases. */
    get text() {
        let combined = '';
        for (const phrase of this.phrases) {
            combined += phrase.text;
        }
        return combined;
    }
}
/** A phrase, made up of an ordered list of words. */
export class Phrase {
    /** Words of the phrase, in text order. */
    words = [];

    /** Combined length of all words. */
    get length() {
        return sumArray(this.words, (item) => item.length);
    }

    /** Concatenated text of all words. */
    get text() {
        let combined = '';
        for (const word of this.words) {
            combined += word.text;
        }
        return combined;
    }

    /** The final word, or `undefined` when the phrase has no words. */
    get lastWord() {
        const wordCount = this.words.length;
        return wordCount > 0 ? this.words[wordCount - 1] : undefined;
    }

    /** Whether the final word ends a sentence; `false` for a phrase without words. */
    get isSentenceFinalizer() {
        const finalWord = this.lastWord;
        if (finalWord == null) {
            return false;
        }
        return finalWord.isSentenceFinalizer;
    }
}
/** A single word-level token of the text: a word, a symbol, or punctuation. */
export class Word {
    /** Text of the token. */
    text;
    /** Whether this token is the last one of its sentence. */
    isSentenceFinalizer;

    /**
     * @param {string} text - Text of the token.
     * @param {boolean} isSentenceFinalizer - Whether the token ends its sentence.
     */
    constructor(text, isSentenceFinalizer) {
        this.text = text;
        this.isSentenceFinalizer = isSentenceFinalizer;
    }

    /**
     * Whether the text has no letter or number and is not a symbol word.
     * The classification is by exclusion, so text made up only of whitespace
     * or emoji also counts.
     */
    get containsOnlyPunctuation() {
        return !wordCharacterRegExp.test(this.text) && !this.isSymbolWord;
    }

    /** Whether the text is exactly one of the entries of `symbolWords`. */
    get isSymbolWord() {
        return symbolWords.includes(this.text);
    }

    /** Whether the token is punctuation that contains one of the `phraseSeparators`. */
    get isPhraseSeparator() {
        return this.containsOnlyPunctuation && includesAnyOf(this.text, phraseSeparators);
    }

    /**
     * Misspelled original name of `isPhraseSeparator`, kept so existing callers
     * continue to work.
     *
     * @deprecated Use `isPhraseSeparator`.
     */
    get isPhraseSeperator() {
        return this.isPhraseSeparator;
    }

    /** Length of the text, as reported by the string's `length`. */
    get length() {
        return this.text.length;
    }
}
/** A run of consecutive segments that are kept together as one fragment. */
export class Fragment {
    /** Segments of the fragment, in text order. */
    segments = [];

    /** Combined length of all segments. */
    get length() {
        return sumArray(this.segments, (segment) => segment.length);
    }

    /** Concatenated text of all segments. */
    get text() {
        let combined = '';
        for (const segment of this.segments) {
            combined += segment.text;
        }
        return combined;
    }

    /** Whether the combined length of the segments is zero. */
    get isEmpty() {
        return this.length === 0;
    }

    /** Negation of `isEmpty`. */
    get isNonempty() {
        return !this.isEmpty;
    }

    /** The final segment, or `undefined` when the fragment is empty. */
    get lastSegment() {
        return this.isEmpty ? undefined : this.segments[this.segments.length - 1];
    }
}
//# sourceMappingURL=Segmentation.js.map