UNPKG

fonema

Version:

Ultra-minimal Spanish text cleaning library for TTS - 100% Effect-TS & ESM

896 lines (895 loc) 29.8 kB
import { Effect } from "effect";

// Error types

/**
 * Error produced by any stage of the text-cleaning pipeline.
 *
 * `code` is a machine-readable identifier for the failing step and `stage`
 * names the pipeline phase ("PRE_PROCESSING", "NORMALIZATION",
 * "POST_PROCESSING") in which it happened.
 */
export class TextCleaningError extends Error {
  code;
  stage;
  constructor(message, code, stage) {
    super(message);
    this.code = code;
    this.stage = stage;
    this.name = "TextCleaningError";
  }
}

// Spanish number conversion maps
const CARDINAL_NUMBERS = new Map([
  [0, "cero"],
  [1, "uno"],
  [2, "dos"],
  [3, "tres"],
  [4, "cuatro"],
  [5, "cinco"],
  [6, "seis"],
  [7, "siete"],
  [8, "ocho"],
  [9, "nueve"],
  [10, "diez"],
  [11, "once"],
  [12, "doce"],
  [13, "trece"],
  [14, "catorce"],
  [15, "quince"],
  [16, "dieciséis"],
  [17, "diecisiete"],
  [18, "dieciocho"],
  [19, "diecinueve"],
  [20, "veinte"],
  [21, "veintiuno"],
  [22, "veintidós"],
  [23, "veintitrés"],
  [24, "veinticuatro"],
  [25, "veinticinco"],
  [26, "veintiséis"],
  [27, "veintisiete"],
  [28, "veintiocho"],
  [29, "veintinueve"],
  [30, "treinta"],
  [40, "cuarenta"],
  [50, "cincuenta"],
  [60, "sesenta"],
  [70, "setenta"],
  [80, "ochenta"],
  [90, "noventa"],
  [100, "cien"],
  [200, "doscientos"],
  [300, "trescientos"],
  [400, "cuatrocientos"],
  [500, "quinientos"],
  [600, "seiscientos"],
  [700, "setecientos"],
  [800, "ochocientos"],
  [900, "novecientos"],
  [1000, "mil"],
]);

const ORDINAL_NUMBERS = new Map([
  [1, "primero"],
  [2, "segundo"],
  [3, "tercero"],
  [4, "cuarto"],
  [5, "quinto"],
  [6, "sexto"],
  [7, "séptimo"],
  [8, "octavo"],
  [9, "noveno"],
  [10, "décimo"],
]);

// Largest integer that convertNumberToSpanish spells out; anything above it
// is returned as digits.
const MAX_SPELLED_NUMBER = 999999999999;

// Spanish abbreviations
const ABBREVIATIONS = new Map([
  ["Dr.", "Doctor"],
  ["Dra.", "Doctora"],
  ["Sr.", "Señor"],
  ["Sra.", "Señora"],
  ["Srta.", "Señorita"],
  ["Prof.", "Profesor"],
  ["Profa.", "Profesora"],
  ["Ing.", "Ingeniero"],
  ["Lic.", "Licenciado"],
  ["etc.", "etcétera"],
  ["S.A.", "Sociedad Anónima"],
  ["Ltda.", "Limitada"],
  ["Cía.", "Compañía"],
  ["Av.", "Avenida"],
  ["C/", "Calle"],
  ["Pza.", "Plaza"],
  ["Dpto.", "Departamento"],
]);

// Month names for date conversion
const MONTHS = [
  "enero",
  "febrero",
  "marzo",
  "abril",
  "mayo",
  "junio",
  "julio",
  "agosto",
  "septiembre",
  "octubre",
  "noviembre",
  "diciembre",
];

// Emoji descriptions in Spanish (each emoji is listed exactly once)
const EMOJI_DESCRIPTIONS = new Map([
  // Faces and emotions
  ["😀", "cara sonriente"],
  ["😃", "cara sonriente con ojos grandes"],
  ["😄", "cara sonriente con ojos sonrientes"],
  ["😁", "cara radiante con ojos sonrientes"],
  ["😆", "cara sonriente con ojos cerrados"],
  ["😅", "cara sonriente con sudor"],
  ["🤣", "cara rodando de risa"],
  ["😂", "cara con lágrimas de alegría"],
  ["🙂", "cara ligeramente sonriente"],
  ["🙃", "cara al revés"],
  ["😉", "cara guiñando"],
  ["😊", "cara sonriente con ojos sonrientes"],
  ["😇", "cara sonriente con aureola"],
  ["🥰", "cara sonriente con corazones"],
  ["😍", "cara sonriente con ojos de corazón"],
  ["🤩", "cara con ojos de estrella"],
  ["😘", "cara mandando un beso"],
  ["😗", "cara besando"],
  ["☺️", "cara sonriente"],
  ["😚", "cara besando con ojos cerrados"],
  ["😙", "cara besando con ojos sonrientes"],
  ["🥲", "cara sonriente con lágrima"],
  ["😋", "cara saboreando comida"],
  ["😛", "cara sacando la lengua"],
  ["😜", "cara guiñando y sacando la lengua"],
  ["🤪", "cara loca"],
  ["😝", "cara sacando la lengua con ojos cerrados"],
  ["🤑", "cara con ojos de dinero"],
  ["🤗", "cara abrazando"],
  ["🤭", "cara con mano sobre la boca"],
  ["🤫", "cara haciendo silencio"],
  ["🤔", "cara pensativa"],
  ["🤐", "cara con cremallera"],
  ["🤨", "cara con ceja alzada"],
  ["😐", "cara neutral"],
  ["😑", "cara sin expresión"],
  ["😶", "cara sin boca"],
  ["😏", "cara sonriendo con malicia"],
  ["😒", "cara sin gracia"],
  ["🙄", "cara poniendo los ojos en blanco"],
  ["😬", "cara haciendo mueca"],
  ["🤥", "cara mentirosa"],
  ["😔", "cara pensativa"],
  ["😕", "cara confundida"],
  ["🙁", "cara ligeramente fruncida"],
  ["☹️", "cara fruncida"],
  ["😣", "cara perseverante"],
  ["😖", "cara confundida"],
  ["😫", "cara cansada"],
  ["😩", "cara llorosa"],
  ["🥺", "cara suplicante"],
  ["😢", "cara llorando"],
  ["😭", "cara llorando a mares"],
  ["😤", "cara resoplando"],
  ["😠", "cara enojada"],
  ["😡", "cara muy enojada"],
  ["🤬", "cara con símbolos sobre la boca"],
  ["🤯", "cabeza explotando"],
  ["😳", "cara sonrojada"],
  ["🥵", "cara con calor"],
  ["🥶", "cara con frío"],
  ["😱", "cara gritando de miedo"],
  ["😨", "cara temerosa"],
  ["😰", "cara ansiosa con sudor"],
  ["😥", "cara triste pero aliviada"],
  ["😓", "cara con sudor frío"],
  ["😴", "cara durmiendo"],
  ["💤", "símbolo de sueño"],
  ["😪", "cara somnolienta"],
  ["😵", "cara mareada"],
  ["🥴", "cara mareada"],
  ["🤢", "cara con náuseas"],
  ["🤮", "cara vomitando"],
  ["🤧", "cara estornudando"],
  ["😷", "cara con mascarilla médica"],
  ["🤒", "cara con termómetro"],
  ["🤕", "cara con vendaje"],
  ["🤓", "geek"],
  // Hearts and love
  ["❤️", "corazón rojo"],
  ["🧡", "corazón naranja"],
  ["💛", "corazón amarillo"],
  ["💚", "corazón verde"],
  ["💙", "corazón azul"],
  ["💜", "corazón morado"],
  ["🖤", "corazón negro"],
  ["🤍", "corazón blanco"],
  ["🤎", "corazón marrón"],
  ["💔", "corazón roto"],
  ["❣️", "exclamación de corazón"],
  ["💕", "dos corazones"],
  ["💞", "corazones giratorios"],
  ["💓", "corazón latiendo"],
  ["💗", "corazón creciendo"],
  ["💖", "corazón brillante"],
  ["💘", "corazón con flecha"],
  ["💝", "corazón con lazo"],
  ["💟", "decoración de corazón"],
  // Hands and gestures
  ["👀", "ojos"],
  ["👍", "pulgar arriba"],
  ["👎", "pulgar abajo"],
  ["👌", "señal de ok"],
  ["✌️", "señal de victoria"],
  ["🤞", "dedos cruzados"],
  ["🤟", "gesto de te amo"],
  ["🤘", "cuernos"],
  ["🤙", "llamar"],
  ["👈", "dedo apuntando a la izquierda"],
  ["👉", "dedo apuntando a la derecha"],
  ["👆", "dedo apuntando arriba"],
  ["👇", "dedo apuntando abajo"],
  ["☝️", "dedo índice arriba"],
  ["✋", "mano alzada"],
  ["🤚", "dorso de la mano alzada"],
  ["🖐️", "mano con dedos separados"],
  ["🖖", "saludo vulcano"],
  ["👋", "mano saludando"],
  ["🤝", "apretón de manos"],
  ["🙏", "manos en oración"],
  ["✍️", "mano escribiendo"],
  ["👏", "manos aplaudiendo"],
  ["🙌", "manos celebrando"],
  ["👐", "manos abiertas"],
  ["🤲", "palmas hacia arriba"],
  ["🤜", "puño hacia la derecha"],
  ["🤛", "puño hacia la izquierda"],
  ["✊", "puño alzado"],
  ["👊", "puño"],
  ["🫶", "manos formando corazón"],
  // Common objects and symbols
  ["🔥", "fuego"],
  ["💯", "cien puntos"],
  ["🎉", "fiesta"],
  ["🎊", "confeti"],
  ["🪅", "piñata"],
  ["🥂", "brindis"],
  ["🍾", "champán"],
  ["💫", "estrella mareada"],
  ["⭐", "estrella"],
  ["🌟", "estrella brillante"],
  ["✨", "destellos"],
  ["⚡", "rayo"],
  ["💥", "explosión"],
  ["💢", "símbolo de enojo"],
  ["💨", "corriendo"],
  ["💦", "gotas de sudor"],
  ["💧", "gota"],
  ["🌈", "arcoíris"],
  ["☀️", "sol"],
  ["⛅", "sol parcialmente nublado"],
  ["☁️", "nube"],
  ["🌧️", "nube con lluvia"],
  ["⛈️", "nube con rayo"],
  ["🌩️", "nube con rayo"],
  ["❄️", "copo de nieve"],
  ["☃️", "muñeco de nieve"],
  ["⛄", "muñeco de nieve"],
  ["🌪️", "tornado"],
  ["🌊", "ola"],
  // Food and drinks
  ["🍎", "manzana"],
  ["🍌", "plátano"],
  ["🍓", "fresa"],
  ["🍇", "uvas"],
  ["🍉", "sandía"],
  ["🍊", "naranja"],
  ["🥑", "aguacate"],
  ["🍅", "tomate"],
  ["🥕", "zanahoria"],
  ["🌽", "maíz"],
  ["🥖", "baguette"],
  ["🍞", "pan"],
  ["🧀", "queso"],
  ["🥓", "tocino"],
  ["🍖", "carne"],
  ["🍗", "muslo de pollo"],
  ["🍕", "pizza"],
  ["🍔", "hamburguesa"],
  ["🌭", "hot dog"],
  ["🥪", "sándwich"],
  ["🌮", "taco"],
  ["🌯", "burrito"],
  ["🍜", "sopa"],
  ["🍝", "espaguetis"],
  ["🍚", "arroz"],
  ["🍛", "curry"],
  ["🍤", "camarón frito"],
  ["🍣", "sushi"],
  ["🍦", "helado"],
  ["🍰", "pastel"],
  ["🎂", "pastel de cumpleaños"],
  ["🍪", "galleta"],
  ["🍫", "chocolate"],
  ["🍬", "dulce"],
  ["🍭", "paleta"],
  ["☕", "café"],
  ["🍵", "té"],
  ["🥤", "bebida"],
  ["🍺", "cerveza"],
  ["🍷", "vino"],
  // Animals
  ["🐶", "cara de perro"],
  ["🐱", "cara de gato"],
  ["🐭", "cara de ratón"],
  ["🐹", "cara de hámster"],
  ["🐰", "cara de conejo"],
  ["🦊", "cara de zorro"],
  ["🐻", "cara de oso"],
  ["🐼", "cara de panda"],
  ["🐨", "koala"],
  ["🐯", "cara de tigre"],
  ["🦁", "cara de león"],
  ["🐮", "cara de vaca"],
  ["🐷", "cara de cerdo"],
  ["🐸", "cara de rana"],
  ["🐵", "cara de mono"],
  ["🙈", "mono que no ve"],
  ["🙉", "mono que no oye"],
  ["🙊", "mono que no habla"],
  ["🐒", "mono"],
  ["🐔", "pollo"],
  ["🐧", "pingüino"],
  ["🐦", "pájaro"],
  ["🐤", "pollito"],
  ["🐣", "pollito saliendo del huevo"],
  ["🐥", "pollito de frente"],
  ["🦆", "pato"],
  ["🦅", "águila"],
  ["🦉", "búho"],
  ["🦇", "murciélago"],
  ["🐺", "lobo"],
  ["🐗", "jabalí"],
  ["🐴", "cara de caballo"],
  ["🦄", "unicornio"],
  ["🐝", "abeja"],
  ["🐛", "gusano"],
  ["🦋", "mariposa"],
  ["🐌", "caracol"],
  ["🐞", "mariquita"],
  ["🐜", "hormiga"],
  ["🦗", "grillo"],
  ["🕷️", "araña"],
  ["🦂", "escorpión"],
  ["🐢", "tortuga"],
  ["🐍", "serpiente"],
  ["🦎", "lagarto"],
  ["🐙", "pulpo"],
  ["🦑", "calamar"],
  ["🦐", "camarón"],
  ["🦀", "cangrejo"],
  ["🐡", "pez globo"],
  ["🐠", "pez tropical"],
  ["🐟", "pez"],
  ["🐬", "delfín"],
  ["🐳", "ballena"],
  ["🐋", "ballena"],
  ["🦈", "tiburón"],
  // Activities and sports
  ["⚽", "balón de fútbol"],
  ["🏀", "balón de baloncesto"],
  ["🏈", "balón de fútbol americano"],
  ["⚾", "béisbol"],
  ["🥎", "softball"],
  ["🎾", "tenis"],
  ["🏐", "voleibol"],
  ["🏉", "rugby"],
  ["🥏", "frisbee"],
  ["🎱", "bola ocho"],
  ["🪀", "yoyo"],
  ["🏓", "ping pong"],
  ["🏸", "bádminton"],
  ["🥅", "portería"],
  ["⛳", "golf"],
  ["🪁", "cometa"],
  ["🏹", "arco y flecha"],
  ["🎣", "pesca"],
  ["🤿", "buceo"],
  ["🥊", "boxeo"],
  ["🥋", "artes marciales"],
  ["🎽", "camiseta de correr"],
  ["🛹", "patineta"],
  ["🛷", "trineo"],
  ["⛸️", "patín de hielo"],
  ["🥌", "curling"],
  ["🎿", "esquí"],
  ["⛷️", "esquiador"],
  ["🏂", "snowboard"],
  ["🪂", "paracaídas"],
  ["🏋️", "levantamiento de pesas"],
  ["🤸", "voltereta"],
  ["🤼", "lucha"],
  ["🤽", "waterpolo"],
  ["🤾", "balonmano"],
  ["🤹", "malabarismo"],
  ["🧘", "meditación"],
  ["🛀", "baño"],
  ["🛌", "durmiendo"],
  // Travel and places
  ["🚗", "coche"],
  ["🚕", "taxi"],
  ["🚙", "SUV"],
  ["🚌", "autobús"],
  ["🚎", "trolebús"],
  ["🏎️", "coche de carreras"],
  ["🚓", "coche de policía"],
  ["🚑", "ambulancia"],
  ["🚒", "camión de bomberos"],
  ["🚐", "minibús"],
  ["🛻", "camioneta"],
  ["🚚", "camión"],
  ["🚛", "camión articulado"],
  ["🚜", "tractor"],
  ["🏍️", "motocicleta"],
  ["🛵", "scooter"],
  ["🚲", "bicicleta"],
  ["🛴", "patinete"],
  ["🚁", "helicóptero"],
  ["✈️", "avión"],
  ["🛩️", "avión pequeño"],
  ["🚀", "cohete"],
  ["🛸", "platillo volador"],
  ["🚢", "barco"],
  ["⛵", "velero"],
  ["🚤", "lancha"],
  ["⛴️", "ferry"],
  ["🛥️", "lancha motora"],
  ["🚂", "locomotora"],
  ["🚃", "vagón de tren"],
  ["🚄", "tren bala"],
  ["🚅", "tren bala con nariz"],
  ["🚆", "tren"],
  ["🚇", "metro"],
  ["🚈", "tren ligero"],
  ["🚉", "estación"],
  ["🚊", "tranvía"],
  ["🚝", "monorraíl"],
  ["🚞", "tren de montaña"],
  ["🚟", "tren suspendido"],
  ["🚠", "teleférico"],
  ["🚡", "tranvía aéreo"],
  ["🛰️", "satélite"],
  // Objects and tools
  ["📱", "teléfono móvil"],
  ["💻", "portátil"],
  ["🖥️", "ordenador de escritorio"],
  ["⌨️", "teclado"],
  ["🖱️", "ratón de ordenador"],
  ["🖲️", "trackball"],
  ["💽", "minidisc"],
  ["💾", "disquete"],
  ["💿", "CD"],
  ["📀", "DVD"],
  ["🧮", "ábaco"],
  ["🎥", "cámara de cine"],
  ["📹", "videocámara"],
  ["📷", "cámara"],
  ["📸", "cámara con flash"],
  ["📼", "videocasete"],
  ["🔍", "lupa"],
  ["🔎", "lupa hacia la derecha"],
  ["🕯️", "vela"],
  ["💡", "bombilla"],
  ["🔦", "linterna"],
  ["🏮", "farol rojo"],
  ["🪔", "lámpara de aceite"],
  ["📔", "cuaderno"],
  ["📕", "libro cerrado"],
  ["📖", "libro abierto"],
  ["📗", "libro verde"],
  ["📘", "libro azul"],
  ["📙", "libro naranja"],
  ["📚", "libros"],
  ["📓", "cuaderno"],
  ["📒", "libro de contabilidad"],
  ["📃", "página curvada"],
  ["📜", "pergamino"],
  ["📄", "página"],
  ["📰", "periódico"],
  ["🗞️", "periódico enrollado"],
  ["📑", "marcadores"],
  ["🔖", "marcapáginas"],
  ["🏷️", "etiqueta"],
  ["💰", "bolsa de dinero"],
  ["🪙", "moneda"],
  ["💴", "yen"],
  ["💵", "dólar"],
  ["💶", "euro"],
  ["💷", "libra"],
  ["💸", "dinero con alas"],
  ["💳", "tarjeta de crédito"],
  ["🧾", "recibo"],
  ["💎", "diamante"],
  ["⚖️", "balanza"],
  ["🪜", "escalera"],
  ["🧰", "caja de herramientas"],
  ["🔧", "llave inglesa"],
  ["🔨", "martillo"],
  ["⚒️", "martillo y pico"],
  ["🛠️", "martillo y llave"],
  ["⛏️", "pico"],
  ["🪓", "hacha"],
  ["🪚", "sierra"],
  ["🔩", "tuerca y tornillo"],
  ["⚙️", "engranaje"],
  ["🪤", "trampa para ratones"],
  ["🧲", "imán"],
  ["🪣", "cubo"],
  ["🧽", "esponja"],
  ["🧴", "botella de loción"],
  ["🧷", "imperdible"],
  ["🧹", "escoba"],
  ["🧺", "cesta"],
  ["🪑", "silla"],
  ["🚪", "puerta"],
  ["🪟", "ventana"],
  ["🛏️", "cama"],
  ["🛋️", "sofá"],
  ["🚿", "ducha"],
  ["🛁", "bañera"],
  ["🚽", "inodoro"],
  ["🪠", "desatascador"],
  ["🧻", "papel higiénico"],
  ["🪥", "cepillo de dientes"],
  ["🧼", "jabón"],
  ["🪒", "maquinilla de afeitar"],
  ["🧯", "extintor"],
  ["🛒", "carrito de compras"],
  // Symbols and signs
  ["❗", "exclamación"],
  ["❓", "interrogación"],
  ["❔", "interrogación blanca"],
  ["❕", "exclamación blanca"],
  ["❌", "cruz"],
  ["⭕", "círculo rojo"],
  ["🚫", "prohibido"],
  ["🕳️", "agujero"],
  ["💣", "bomba"],
  ["💬", "globo de diálogo"],
  ["👁️‍🗨️", "ojo en globo de diálogo"],
  ["🗨️", "globo de diálogo izquierdo"],
  ["🗯️", "globo de diálogo de enfado"],
  ["💭", "globo de pensamiento"],
  // Flags (some common ones)
  ["🏳️", "bandera blanca"],
  ["🏴", "bandera negra"],
  ["🏁", "bandera a cuadros"],
  ["🚩", "bandera triangular"],
  ["🏳️‍🌈", "bandera del arcoíris"],
  ["🏳️‍⚧️", "bandera transgénero"],
  // Time and calendar
  ["⏰", "despertador"],
  ["⏱️", "cronómetro"],
  ["⏲️", "temporizador"],
  ["🕐", "una en punto"],
  ["🕑", "dos en punto"],
  ["🕒", "tres en punto"],
  ["🕓", "cuatro en punto"],
  ["🕔", "cinco en punto"],
  ["🕕", "seis en punto"],
  ["🕖", "siete en punto"],
  ["🕗", "ocho en punto"],
  ["🕘", "nueve en punto"],
  ["🕙", "diez en punto"],
  ["🕚", "once en punto"],
  ["🕛", "doce en punto"],
  ["📅", "calendario"],
  ["📆", "calendario de mesa"],
  ["🗓️", "calendario espiral"],
  ["📇", "fichero"],
  ["🗃️", "archivador"],
  ["🗄️", "archivador"],
  ["🗂️", "separadores"],
  // Music and entertainment
  ["🎵", "nota musical"],
  ["🎶", "notas musicales"],
  ["🎼", "partitura"],
  ["🎹", "piano"],
  ["🥁", "tambor"],
  ["🎷", "saxofón"],
  ["🎺", "trompeta"],
  ["🎸", "guitarra"],
  ["🪕", "banjo"],
  ["🎻", "violín"],
  ["🎤", "micrófono"],
  ["🎧", "auriculares"],
  ["📻", "radio"],
  ["📺", "televisión"],
  ["🎬", "claqueta"],
  ["🎭", "máscaras de teatro"],
  ["🎪", "circo"],
  ["🎨", "paleta de pintor"],
  ["🎯", "diana"],
  ["🎲", "dado"],
  ["🎮", "videojuego"],
  ["🕹️", "joystick"],
  ["🎰", "máquina tragaperras"],
  ["🎳", "bolos"],
  // Magic and fantasy
  ["🪄", "varita mágica"],
  ["🔮", "bola de cristal"],
  ["🧿", "ojo turco"],
  ["🪬", "mano de Fátima"],
  ["🎃", "calabaza de Halloween"],
  ["👻", "fantasma"],
  ["💀", "calavera"],
  ["☠️", "calavera y huesos"],
  ["👽", "alienígena"],
  ["👾", "monstruo de videojuego"],
  ["🤖", "robot"],
  ["🎅", "Papá Noel"],
  ["🤶", "Mamá Noel"],
  ["🧙", "mago"],
  ["🧚", "hada"],
  ["🧛", "vampiro"],
  ["🧜", "sirena"],
  ["🧝", "elfo"],
  ["🧞", "genio"],
  ["🧟", "zombi"],
  ["🦸", "superhéroe"],
  ["🦹", "supervillano"],
  ["🤺", "esgrima"],
  ["🏇", "carrera de caballos"],
  ["🏌️", "golf"],
  ["🏄", "surf"],
  ["🚣", "remo"],
  ["🏊", "natación"],
  ["⛹️", "baloncesto"],
  ["🚴", "ciclismo"],
  ["🚵", "ciclismo de montaña"],
]);

/** Escape every regular-expression metacharacter in `value`. */
const escapeRegExp = (value) => value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

/**
 * Drop U+FE0F (emoji variation selector) so that "❤" and "❤️" are treated
 * as the same emoji.
 */
const stripVariationSelectors = (value) => value.replace(/\uFE0F/g, "");

// Emoji table keyed without variation selectors; used for all lookups.
const EMOJI_LOOKUP = new Map(
  [...EMOJI_DESCRIPTIONS].map(([emoji, description]) => [
    stripVariationSelectors(emoji),
    description,
  ]),
);

// Code-point ranges treated as "some emoji" when no description is known.
const EMOJI_BASE_CLASS = [
  "[\\u{1F300}-\\u{1F5FF}",
  "\\u{1F600}-\\u{1F64F}",
  "\\u{1F680}-\\u{1F6FF}",
  "\\u{1F900}-\\u{1F9FF}",
  "\\u{1FA00}-\\u{1FAFF}",
  "\\u{2600}-\\u{26FF}",
  "\\u{2700}-\\u{27BF}]",
].join("");
const SKIN_TONE_CLASS = "[\\u{1F3FB}-\\u{1F3FF}]";

// Longest keys first so ZWJ sequences such as the rainbow flag win over
// their first component (the white flag).
const KNOWN_EMOJI_ALTERNATION = [...EMOJI_LOOKUP.keys()]
  .filter((key) => key !== "")
  .sort((a, b) => b.length - a.length)
  .map(escapeRegExp)
  .join("|");

// One pattern for the whole emoji pass:
//   1. a known emoji (captured), with optional skin tone / gender suffix;
//   2. one or two regional-indicator letters (country flags);
//   3. any other emoji, including a whole ZWJ-joined sequence.
const EMOJI_PATTERN = new RegExp(
  [
    `(${KNOWN_EMOJI_ALTERNATION})${SKIN_TONE_CLASS}?(?:\\u200D[\\u2640\\u2642])?`,
    "[\\u{1F1E0}-\\u{1F1FF}]{1,2}",
    `${EMOJI_BASE_CLASS}${SKIN_TONE_CLASS}?(?:\\u200D${EMOJI_BASE_CLASS}${SKIN_TONE_CLASS}?)*`,
  ].join("|"),
  "gu",
);

// Zero-width joiners and flag tag characters left over after replacement.
const STRAY_EMOJI_JOINERS = /[\u200D\u{E0020}-\u{E007F}]/gu;

/**
 * Apocopate a trailing "uno" so the number can precede "mil"/"millones":
 * "veintiuno" → "veintiún", "treinta y uno" → "treinta y un".
 */
const apocopate = (words) =>
  words.replace(/veintiuno$/, "veintiún").replace(/uno$/, "un");

/**
 * Convert an integer to Spanish words.
 *
 * @param {number} num - Integer to spell out. Negative values are prefixed
 *   with "menos".
 * @returns {string} The Spanish cardinal, or `String(num)` when `num` is not
 *   an integer or its magnitude exceeds MAX_SPELLED_NUMBER.
 */
const convertNumberToSpanish = (num) => {
  if (!Number.isInteger(num)) return String(num);
  if (num === 0) return "cero";
  if (num < 0) return `menos ${convertNumberToSpanish(-num)}`;
  if (CARDINAL_NUMBERS.has(num)) return CARDINAL_NUMBERS.get(num);

  if (num < 100) {
    // Exact tens are in the table, so the unit digit is never zero here.
    const tens = Math.floor(num / 10) * 10;
    const units = num % 10;
    return `${CARDINAL_NUMBERS.get(tens)} y ${CARDINAL_NUMBERS.get(units)}`;
  }

  if (num < 1000) {
    // Exact hundreds are in the table, so the remainder is never zero here.
    const hundreds = Math.floor(num / 100) * 100;
    const remainder = num % 100;
    // "cien" is only used for exactly 100; 101-199 take "ciento".
    const hundredsText =
      hundreds === 100 ? "ciento" : CARDINAL_NUMBERS.get(hundreds);
    return `${hundredsText} ${convertNumberToSpanish(remainder)}`;
  }

  if (num < 1000000) {
    const thousands = Math.floor(num / 1000);
    const remainder = num % 1000;
    const thousandsText =
      thousands === 1
        ? "mil"
        : `${apocopate(convertNumberToSpanish(thousands))} mil`;
    return remainder === 0
      ? thousandsText
      : `${thousandsText} ${convertNumberToSpanish(remainder)}`;
  }

  if (num <= MAX_SPELLED_NUMBER) {
    const millions = Math.floor(num / 1000000);
    const remainder = num % 1000000;
    const millionsText =
      millions === 1
        ? "un millón"
        : `${apocopate(convertNumberToSpanish(millions))} millones`;
    return remainder === 0
      ? millionsText
      : `${millionsText} ${convertNumberToSpanish(remainder)}`;
  }

  // Beyond the supported range: leave the digits for the TTS engine.
  return num.toString();
};

/**
 * Spell the digits after a decimal point: leading zeros are read one by one
 * ("05" → "cero cinco"), the rest as a single number ("25" → "veinticinco").
 */
const spellDecimalDigits = (digits) => {
  const leadingZeros = digits.match(/^0*/)[0].length;
  const rest = digits.slice(leadingZeros);
  const words = Array.from({ length: leadingZeros }, () => "cero");
  if (rest !== "") {
    words.push(convertNumberToSpanish(Number.parseInt(rest, 10)));
  }
  return words.join(" ");
};

/**
 * Remove fenced and indented code blocks; inline code keeps its content but
 * loses the backticks.
 *
 * @param {string} text
 * @returns Effect yielding the cleaned string, failing with TextCleaningError.
 */
const cleanCodeContent = (text) =>
  Effect.try({
    try: () =>
      text
        // Triple-backtick code blocks
        .replace(/```[\s\S]*?```/g, "")
        // Indented code blocks (4+ spaces or tabs)
        .replace(/^[ \t]{4,}.*$/gm, "")
        // Inline code: keep content, drop backticks
        .replace(/`([^`]+)`/g, "$1"),
    catch: (error) =>
      new TextCleaningError(
        `Code cleaning failed: ${error}`,
        "CODE_CLEANING_FAILED",
        "PRE_PROCESSING",
      ),
  });

// URLs, emails, @handles and #hashtags. The markdown stage must not insert
// spaces into these, otherwise the later digital-content stage removes only
// part of the token.
const DIGITAL_TOKEN = /((?:https?:\/\/|www\.)\S+|\S*@\S+|#[\p{L}\p{N}_]+)/gu;

/**
 * Turn leftover formatting underscores into spaces everywhere except inside
 * URL-like tokens.
 */
const cleanUnderscoreArtifacts = (text) =>
  text
    // split() with one capture group puts the matched tokens at odd indices.
    .split(DIGITAL_TOKEN)
    .map((part, index) =>
      index % 2 === 1
        ? part
        : part
            .replace(/_{2,}/g, " ") // Multiple underscores
            .replace(/\b_+\b/g, " ") // Standalone underscores
            .replace(/(\w)_+(\w)/g, "$1 $2"), // Underscores between words
    )
    .join("");

/**
 * Strip markdown and HTML formatting while keeping the readable text.
 *
 * @param {string} text
 * @returns Effect yielding the cleaned string, failing with TextCleaningError.
 */
const cleanMarkdown = (text) =>
  Effect.try({
    try: () => {
      let cleaned = text;
      // Remove headers but keep text
      cleaned = cleaned.replace(/^#{1,6}\s+/gm, "");
      cleaned = cleaned.replace(/#{1,6}\s+/g, ""); // Headers in middle of text
      // Remove bold/italic formatting but keep text
      cleaned = cleaned.replace(/\*\*([^*]+)\*\*/g, "$1"); // **bold**
      cleaned = cleaned.replace(/\*([^*]+)\*/g, "$1"); // *italic*
      cleaned = cleaned.replace(/__([^_]+)__/g, "$1"); // __bold__
      cleaned = cleaned.replace(/_([^_]+)_/g, "$1"); // _italic_
      // Remove strikethrough but keep text
      cleaned = cleaned.replace(/~~([^~]+)~~/g, "$1");
      // Remove images completely. This must run before link handling:
      // the link pattern also matches the "[alt](url)" part of an image
      // and would leave "!alt" behind.
      cleaned = cleaned.replace(/!\[([^\]]*)\]\([^)]+\)/g, "");
      // Remove links but keep text
      cleaned = cleaned.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1");
      // Remove HTML tags but keep content
      cleaned = cleaned.replace(/<[^>]*>/g, "");
      // Remove blockquotes
      cleaned = cleaned.replace(/^>\s*/gm, "");
      // Remove list markers
      cleaned = cleaned.replace(/^[-*+]\s+/gm, "");
      cleaned = cleaned.replace(/^\d+\.\s+/gm, "");
      // Remove horizontal rules
      cleaned = cleaned.replace(/^[-*_]{3,}$/gm, "");
      // Remove remaining underscores that might be formatting artifacts
      cleaned = cleanUnderscoreArtifacts(cleaned);
      // Remove table formatting
      cleaned = cleaned.replace(/\|/g, " "); // Table separators
      // Remove footnote references
      cleaned = cleaned.replace(/\[\^[^\]]+\]/g, "");
      // Remove reference-style links
      cleaned = cleaned.replace(/^\[[^\]]+\]:\s*.+$/gm, "");
      return cleaned;
    },
    catch: (error) =>
      new TextCleaningError(
        `Markdown cleaning failed: ${error}`,
        "MARKDOWN_CLEANING_FAILED",
        "PRE_PROCESSING",
      ),
  });

/**
 * Replace digits in `text` with Spanish words.
 *
 * Percentages and ordinals are handled before plain integers; otherwise the
 * integer pass would consume their digits and leave a dangling "%" or "º".
 *
 * @param {string} text
 * @returns Effect yielding the converted string, failing with TextCleaningError.
 */
const convertNumbers = (text) =>
  Effect.try({
    try: () => {
      let result = text;

      // Percentages: "50%", "50 %", "12.5%", "1,500%"
      result = result.replace(
        /(?<![\p{L}\p{N}.,])(\d{1,3}(?:,\d{3})+|\d+)(?:\.(\d+))?\s?%/gu,
        (_match, integerPart, decimalPart) => {
          const whole = convertNumberToSpanish(
            Number.parseInt(integerPart.replace(/,/g, ""), 10),
          );
          const spoken =
            decimalPart === undefined
              ? whole
              : `${whole} punto ${spellDecimalDigits(decimalPart)}`;
          return `${spoken} por ciento`;
        },
      );

      // Ordinals: "1º", "2ª", "3.º"
      result = result.replace(
        /(?<![\p{L}\p{N}])(\d+)\.?([ºª])/gu,
        (_match, digits, indicator) => {
          const num = Number.parseInt(digits, 10);
          const ordinal = ORDINAL_NUMBERS.get(num);
          // No ordinal word in the table: read the number as a cardinal.
          if (ordinal === undefined) return convertNumberToSpanish(num);
          // Every table entry ends in "o"; "ª" asks for the feminine form.
          return indicator === "ª" ? ordinal.replace(/o$/, "a") : ordinal;
        },
      );

      // Integers with comma grouping (1,234)
      result = result.replace(/\b\d{1,3}(?:,\d{3})+\b/g, (match) =>
        convertNumberToSpanish(Number.parseInt(match.replace(/,/g, ""), 10)),
      );

      // Simple integers
      result = result.replace(/\b\d+\b/g, (match) => {
        const num = Number.parseInt(match, 10);
        return num >= 0 && num <= MAX_SPELLED_NUMBER
          ? convertNumberToSpanish(num)
          : match;
      });

      return result;
    },
    catch: (error) =>
      new TextCleaningError(
        `Number conversion failed: ${error}`,
        "NUMBER_CONVERSION_FAILED",
        "NORMALIZATION",
      ),
  });

// One precompiled pattern per abbreviation, built once at module load.
const ABBREVIATION_PATTERNS = [...ABBREVIATIONS].map(([abbrev, expansion]) => ({
  pattern: new RegExp(`\\b${escapeRegExp(abbrev)}`, "g"),
  expansion,
}));

/**
 * Expand the known Spanish abbreviations ("Dr." → "Doctor", …).
 *
 * @param {string} text
 * @returns Effect yielding the expanded string, failing with TextCleaningError.
 */
const expandAbbreviations = (text) =>
  Effect.try({
    try: () =>
      ABBREVIATION_PATTERNS.reduce(
        // A replacer function keeps any "$" in an expansion literal.
        (result, { pattern, expansion }) =>
          result.replace(pattern, () => expansion),
        text,
      ),
    catch: (error) =>
      new TextCleaningError(
        `Abbreviation expansion failed: ${error}`,
        "ABBREVIATION_EXPANSION_FAILED",
        "NORMALIZATION",
      ),
  });

/**
 * Convert dd/mm/yyyy dates to words
 * (15/03/2024 → "quince de marzo de dos mil veinticuatro").
 * Matches whose day is not 1-31 or whose month is not 1-12 are left untouched.
 *
 * @param {string} text
 * @returns Effect yielding the converted string, failing with TextCleaningError.
 */
const convertDates = (text) =>
  Effect.try({
    try: () =>
      text.replace(
        // The look-arounds stop the pattern matching inside longer digit runs.
        /(?<!\d)(\d{1,2})\/(\d{1,2})\/(\d{4})(?!\d)/g,
        (match, day, month, year) => {
          const dayNum = Number.parseInt(day, 10);
          const monthIndex = Number.parseInt(month, 10) - 1; // 0-indexed
          const yearNum = Number.parseInt(year, 10);
          const isValidDay = dayNum >= 1 && dayNum <= 31;
          const isValidMonth = monthIndex >= 0 && monthIndex < MONTHS.length;
          if (!isValidDay || !isValidMonth) return match;
          const dayText = convertNumberToSpanish(dayNum);
          const yearText = convertNumberToSpanish(yearNum);
          return `${dayText} de ${MONTHS[monthIndex]} de ${yearText}`;
        },
      ),
    catch: (error) =>
      new TextCleaningError(
        `Date conversion failed: ${error}`,
        "DATE_CONVERSION_FAILED",
        "NORMALIZATION",
      ),
  });

/**
 * Remove URLs, email addresses, @handles and #hashtags.
 *
 * @param {string} text
 * @returns Effect yielding the cleaned string, failing with TextCleaningError.
 */
const removeDigitalContent = (text) =>
  Effect.try({
    try: () => {
      let cleaned = text;
      // Remove URLs. The lazy match plus look-ahead leaves trailing sentence
      // punctuation ("… https://example.com.") in the text.
      cleaned = cleaned.replace(
        /https?:\/\/\S+?(?=[.,;:!?)]*(?:\s|$))/g,
        "",
      );
      cleaned = cleaned.replace(/www\.\S+?(?=[.,;:!?)]*(?:\s|$))/g, "");
      // Remove email addresses
      cleaned = cleaned.replace(
        /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,
        "",
      );
      // Remove social media handles and hashtags. Unicode-aware so that
      // accented Spanish words ("#día") are removed whole.
      cleaned = cleaned.replace(/@[\p{L}\p{N}_]+/gu, "");
      cleaned = cleaned.replace(/#[\p{L}\p{N}_]+/gu, "");
      return cleaned;
    },
    catch: (error) =>
      new TextCleaningError(
        `Digital content removal failed: ${error}`,
        "DIGITAL_CONTENT_REMOVAL_FAILED",
        "PRE_PROCESSING",
      ),
  });

/**
 * Replace emojis with Spanish descriptions ("emoji de …"); emojis without a
 * description become the bare word "emoji". Whitespace is collapsed to single
 * spaces and the result is trimmed.
 *
 * @param {string} text
 * @returns Effect yielding the converted string, failing with TextCleaningError.
 */
const convertEmojis = (text) =>
  Effect.try({
    try: () =>
      stripVariationSelectors(text)
        .replace(EMOJI_PATTERN, (_match, known) =>
          known === undefined
            ? " emoji "
            : ` emoji de ${EMOJI_LOOKUP.get(known)} `,
        )
        .replace(STRAY_EMOJI_JOINERS, "")
        // Clean up multiple spaces that might have been created
        .replace(/\s+/g, " ")
        .trim(),
    catch: (error) =>
      new TextCleaningError(
        `Emoji conversion failed: ${error}`,
        "EMOJI_CONVERSION_FAILED",
        "NORMALIZATION",
      ),
  });

/**
 * Normalize quotes, spacing around punctuation, ellipses and dashes.
 *
 * @param {string} text
 * @returns Effect yielding the normalized, trimmed string, failing with
 *   TextCleaningError.
 */
const normalizePunctuation = (text) =>
  Effect.try({
    try: () => {
      let normalized = text;
      // Spanish quotation marks
      normalized = normalized.replace(/"([^"]+)"/g, "«$1»");
      // Ellipsis normalization. Must run before the spacing rule, which
      // would otherwise split "..." into ". . .".
      normalized = normalized.replace(/\.{3,}/g, "…");
      // Proper spacing around punctuation; a run such as "?!" stays together
      normalized = normalized.replace(/\s*([.,;:!?]+)\s*/g, "$1 ");
      // No space between punctuation and a closing quote or bracket
      normalized = normalized.replace(/([.,;:!?…]) ([»)\]])/g, "$1$2");
      // Em-dash handling
      normalized = normalized.replace(/--/g, "—");
      normalized = normalized.replace(/\s*—\s*/g, " — ");
      // Clean up multiple spaces
      normalized = normalized.replace(/\s+/g, " ");
      return normalized.trim();
    },
    catch: (error) =>
      new TextCleaningError(
        `Punctuation normalization failed: ${error}`,
        "PUNCTUATION_NORMALIZATION_FAILED",
        "POST_PROCESSING",
      ),
  });

/**
 * Main text cleaning function: runs the full pipeline over `text`.
 *
 * @param {string} text - Raw text, possibly containing markdown, code, URLs
 *   and emojis.
 * @returns Effect yielding the cleaned string; fails with the
 *   TextCleaningError of the first stage that throws.
 */
export const cleanTextForTTS = (text) =>
  Effect.gen(function* () {
    // Pipeline: each stage processes the output of the previous one.
    // Dates run before numbers so their digits are still intact.
    const afterCodeCleaning = yield* cleanCodeContent(text);
    const afterMarkdownCleaning = yield* cleanMarkdown(afterCodeCleaning);
    const afterDigitalRemoval = yield* removeDigitalContent(
      afterMarkdownCleaning,
    );
    const afterEmojis = yield* convertEmojis(afterDigitalRemoval);
    const afterAbbreviations = yield* expandAbbreviations(afterEmojis);
    const afterDates = yield* convertDates(afterAbbreviations);
    const afterNumbers = yield* convertNumbers(afterDates);
    const final = yield* normalizePunctuation(afterNumbers);
    return final;
  });

/**
 * Utility function for standalone number conversion.
 *
 * @param {number} num
 * @returns {string} See convertNumberToSpanish.
 */
export const convertSpanishNumber = (num) => convertNumberToSpanish(num);

/**
 * Utility function for standalone abbreviation expansion.
 *
 * @param {string} abbrev - Abbreviation exactly as listed, e.g. "Dr.".
 * @returns {string} The expansion, or `abbrev` unchanged when unknown.
 */
export const expandSpanishAbbreviation = (abbrev) =>
  ABBREVIATIONS.get(abbrev) ?? abbrev;

/**
 * Utility function for standalone emoji conversion.
 *
 * @param {string} emoji - A single emoji, with or without U+FE0F.
 * @returns {string} "emoji de <description>", or "emoji" when unknown.
 */
export const convertEmojiToSpanish = (emoji) => {
  const description =
    typeof emoji === "string"
      ? EMOJI_LOOKUP.get(stripVariationSelectors(emoji))
      : undefined;
  return description ? `emoji de ${description}` : "emoji";
};