@lobehub/tts
Version:
A high-quality & reliable TTS React Hooks library
36 lines (34 loc) • 1.69 kB
JavaScript
import { markdownToTxt } from "markdown-to-txt";
//#region src/core/utils/splitTextIntoSegments.ts
/**
 * Distance between a full-width form (U+FF01..U+FF5E) and its ASCII
 * counterpart (U+0021..U+007E).
 */
const FULL_WIDTH_TO_ASCII_OFFSET = 65248;
/**
 * Punctuation that lies outside the contiguous full-width range and therefore
 * needs an explicit mapping. Keys are written as escapes so the table survives
 * tools that normalise or re-encode source text.
 */
const WIDE_PUNCTUATION_TO_ASCII = {
	"\u2018": "'", // left single quotation mark
	"\u2019": "'", // right single quotation mark
	"\u201C": "\"", // left double quotation mark
	"\u201D": "\"", // right double quotation mark
	"\u3002": ".", // ideographic full stop
	"\u300A": "<", // left double angle bracket
	"\u300B": ">", // right double angle bracket
	"\u3010": "[", // left black lenticular bracket
	"\u3011": "]" // right black lenticular bracket
};
/**
 * Normalises text before it is split into segments.
 *
 * Steps, in order:
 *  1. Pass the input through `markdownToTxt` (presumably strips Markdown
 *     markup; see the `markdown-to-txt` package for the exact behaviour).
 *  2. Fold full-width forms U+FF01..U+FF5E to their ASCII equivalents.
 *  3. Map the remaining wide quotes, brackets and the ideographic full stop
 *     to ASCII.
 *  4. Turn every run of line breaks into a sentence boundary: a period is
 *     added only when the line does not already end in punctuation, so
 *     "Hello.\nWorld" becomes "Hello. World" rather than "Hello.. World".
 *  5. Collapse every whitespace run (including U+3000) to a single space.
 *
 * @param str Raw input text.
 * @returns A single-line string; it is not trimmed.
 */
const toHalfWidthAndCleanSpace = (str) => {
	return markdownToTxt(str)
		.replaceAll(/[\uFF01-\uFF5E]/g, (ch) => String.fromCharCode(ch.charCodeAt(0) - FULL_WIDTH_TO_ASCII_OFFSET))
		.replaceAll(/[\u2018\u2019\u201C\u201D\u3002\u300A\u300B\u3010\u3011]/g, (ch) => WIDE_PUNCTUATION_TO_ASCII[ch])
		// Runs after the folding above so wide punctuation at a line end is seen as ASCII.
		.replaceAll(/([!,.:;?])?\s*\n\s*/g, (_match, punctuation) => punctuation ? `${punctuation} ` : ". ")
		.replaceAll(/\s+/g, " ");
};
/**
 * Matches one sentence: a run of non-terminator characters followed either by
 * one or more terminators (`!`, `.`, `?`) or by the end of the input. The
 * end-of-input alternative keeps a final, unterminated fragment instead of
 * dropping it.
 */
const SENTENCE_PATTERN = /[^!.?]+(?:[!.?]+|$)/g;
/**
 * Splits text into chunks of roughly `chunkSize` characters.
 *
 * The text is first normalised with `toHalfWidthAndCleanSpace`, then packed
 * paragraph by paragraph. A paragraph longer than `chunkSize` is broken into
 * sentences, which are packed greedily. `chunkSize` is a soft limit: a single
 * sentence longer than `chunkSize` is emitted as one oversized chunk rather
 * than being cut mid-sentence.
 *
 * @param text Raw input text.
 * @param chunkSize Target maximum chunk length in characters (default 100).
 * @returns Trimmed, non-empty chunks in their original order.
 */
const splitTextIntoSegments = (text, chunkSize = 100) => {
	const normalized = toHalfWidthAndCleanSpace(text);
	const chunks = [];
	let currentChunk = "";
	// Emits the pending chunk (if it has visible content) and starts a new one.
	const flush = () => {
		const trimmed = currentChunk.trim();
		if (trimmed) chunks.push(trimmed);
		currentChunk = "";
	};
	// The normaliser collapses line breaks, so this usually yields a single
	// paragraph; the split is kept so raw newlines are still handled.
	for (const paragraph of normalized.split("\n")) {
		if (currentChunk.length > 0 && currentChunk.length + paragraph.length + 1 > chunkSize) flush();
		if (paragraph.length <= chunkSize) {
			currentChunk += (currentChunk ? "\n" : "") + paragraph;
			continue;
		}
		// No match means the paragraph has no sentence body (e.g. only
		// punctuation); fall back to treating it as one sentence.
		const sentences = paragraph.match(SENTENCE_PATTERN) ?? [paragraph];
		for (const sentence of sentences) {
			const trimmedSentence = sentence.trim();
			// A whitespace-only tail can match at end of input; nothing to speak.
			if (!trimmedSentence) continue;
			// The untrimmed length is used on purpose so chunk boundaries for
			// fully terminated input stay exactly as before.
			if (currentChunk.length > 0 && currentChunk.length + sentence.length + 1 > chunkSize) flush();
			currentChunk += (currentChunk ? " " : "") + trimmedSentence;
		}
	}
	flush();
	return chunks;
};
//#endregion
export { splitTextIntoSegments };