echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
71 lines (51 loc) • 1.75 kB
text/typescript
export async function splitChineseTextToWords_Jieba(text: string, fineGrained = false, useHMM = true) {
const jieba = await getJiebaWasmInstance()
if (!fineGrained) {
return jieba.cut(text, useHMM)
} else {
const results = jieba.tokenize(text, 'search', useHMM)
const startOffsetsSet = new Set<number>()
const endOffsetsSet = new Set<number>()
for (const result of results) {
startOffsetsSet.add(result.start)
endOffsetsSet.add(result.end)
}
const startOffsets = Array.from(startOffsetsSet)
startOffsets.sort((a, b) => a - b)
const endOffsets = Array.from(endOffsetsSet)
endOffsets.sort((a, b) => a - b)
const words: string[] = []
for (let i = 0; i < startOffsets.length; i++) {
const wordStartOffset = startOffsets[i]
function getWordEndOffset() {
if (i < startOffsets.length - 1) {
const nextWordStartOffset = startOffsets[i + 1]
for (let j = 0; j < endOffsets.length - 1; j++) {
const currentEndOffset = endOffsets[j]
const nextEndOffset = endOffsets[j + 1]
if (currentEndOffset >= nextWordStartOffset) {
return nextWordStartOffset
} else if (
currentEndOffset > wordStartOffset &&
currentEndOffset < nextWordStartOffset &&
nextEndOffset > nextWordStartOffset) {
return currentEndOffset
}
}
}
return endOffsets[endOffsets.length - 1]
}
const wordEndOffset = getWordEndOffset()
words.push(text.substring(wordStartOffset, wordEndOffset))
}
return words
}
}
let JiebaWasmInstance: typeof import('jieba-wasm')
async function getJiebaWasmInstance() {
if (!JiebaWasmInstance) {
const JiebaWasm = await import('jieba-wasm')
JiebaWasmInstance = JiebaWasm
}
return JiebaWasmInstance
}