UNPKG

paragrafs

Version:

A lightweight TypeScript library designed to reconstruct paragraphs from AI transcriptions.

1 lines 69 kB
{"version":3,"file":"index.mjs","names":["DEFAULT_HINT_NORMALIZATION: Required<ArabicNormalizationOptions>","map: HintMap","result: string[]","table: LCSTable","result: GroundedToken[]","marked: MarkedToken[]","prevEnd: null | number","segments: MarkedSegment[]","currentSegment: MarkedToken[]","segmentStart: null | number","segmentEnd: null | number","result: MarkedSegment[]","lines: string[]","buffer: Token[]","bufferStart: null | number","textParts: string[]","flattenedTokens: Token[]","result: MarkedToken[]","DEFAULT_NORMALIZATION: Required<NonNullable<GenerateHintsOptions['normalization']>>","DEFAULTS: Required<Pick<GenerateHintsOptions, 'dedupe' | 'maxN' | 'minCount' | 'minN'>>","results: GeneratedHint[]","combined"],"sources":["../src/utils/constants.ts","../src/utils/textUtils.ts","../src/utils/lcs.ts","../src/utils/transcriptUtils.ts","../src/transcript.ts","../src/utils/hints.ts"],"sourcesContent":["/**\n * Constant used to mark segment breaks during processing.\n */\nexport const SEGMENT_BREAK = 'SEGMENT_BREAK';\nexport type SegmentBreakMarker = typeof SEGMENT_BREAK;\n\n/**\n * Constant used to mark that we should always start a break when encountering this.\n */\nexport const ALWAYS_BREAK = 'ALWAYS_BREAK';\nexport type AlwaysBreakMarker = typeof ALWAYS_BREAK;\n","import type { ArabicNormalizationOptions, HintMap, Hints } from '../types';\n\n/**\n * Checks if a text string ends with sentence-ending punctuation.\n * Supports: period (.), question mark (? / ؟), exclamation (!), Arabic semicolon (؛), ellipsis (…).\n *\n * @param {string} text - The text to check for ending punctuation\n * @returns {boolean} True if the text ends with punctuation, false otherwise\n */\nexport const isEndingWithPunctuation = (text: string): boolean => /[.؟!?؛…]$/.test(text);\n\n/**\n * Formats seconds into a human-readable timestamp.\n * For durations less than an hour: m:ss (e.g., \"1:05\")\n * For durations an hour or longer: h:mm:ss (e.g., \"1:02:05\")\n *\n * @param {number} seconds - The time duration in seconds\n * @returns {string} Formatted timestamp string\n */\nexport const formatSecondsToTimestamp = (seconds: number): string => {\n const hrs = Math.floor(seconds / 3600);\n const mins = Math.floor((seconds % 3600) / 60);\n const secs = Math.floor(seconds % 60);\n return hrs > 0\n ? `${hrs}:${mins.toString().padStart(2, '0')}:${secs.toString().padStart(2, '0')}`\n : `${mins}:${secs.toString().padStart(2, '0')}`;\n};\n\n/**\n * Strip leading/trailing punctuation/symbols, remove Arabic diacritics, NFC-normalize.\n * Normalizes a word by removing diacritics and punctuation.\n *\n * This function:\n * 1. Decomposes Unicode characters (NFD normalization)\n * 2. Removes Arabic diacritics\n * 3. Strips leading and trailing punctuation or symbols\n * 4. Recomposes Unicode characters (NFC normalization)\n *\n * @param {string} w - The word to normalize\n * @returns {string} The normalized word\n */\nexport const normalizeWord = (w: string) => {\n return (\n w\n // Decompose to strip diacritics\n .normalize('NFD')\n // Remove common zero-width / format characters that can sneak into Arabic text.\n .replace(/[\\u200B-\\u200D\\uFEFF]/g, '')\n // Remove Arabic diacritic marks and other common combining marks\n .replace(/\\p{Mn}/gu, '')\n .replace(/[\\u064B-\\u065F]/g, '')\n // Strip any punctuation, symbol, or format char at start/end (Unicode property escapes)\n .replace(/^[\\p{P}\\p{S}\\p{Cf}]+|[\\p{P}\\p{S}\\p{Cf}]+$/gu, '')\n // Recompose\n .normalize('NFC')\n );\n};\n\n/**\n * Normalizes token text for Arabic-first matching and mining.\n *\n * This builds on `normalizeWord` (diacritics + trim punctuation) and adds optional\n * Arabic-specific normalizations. Use the same normalization for:\n * - mining repeated sequences\n * - matching hints against tokens\n *\n * @param text The token text to normalize\n * @param options Optional Arabic-specific normalizations\n * @returns A normalized token string suitable for comparisons\n */\nexport const normalizeTokenText = (text: string, options?: ArabicNormalizationOptions): string => {\n let input = text;\n\n // Preserve hamza information before we strip combining marks.\n // In NFD, ؤ/ئ decompose into base letter + U+0654 (hamza above).\n // We collapse waw/ya hamza seats to a standalone hamza, while leaving alef hamza\n // to be handled by normalizeAlef (or dropped if normalizeAlef is enabled).\n if (options?.normalizeHamza) {\n input = input\n .normalize('NFD')\n // ya/waw seats can have additional vowel marks between the base letter and hamza above in NFD.\n .replace(/\\u064A\\p{Mn}*\\u0654/gu, 'ء') // ي + Mn* + ٔ\n .replace(/\\u0648\\p{Mn}*\\u0654/gu, 'ء') // و + Mn* + ٔ\n .replace(/[\\u0654\\u0655]/g, '') // drop remaining hamza combining marks (e.g., أ/إ)\n .normalize('NFC');\n }\n\n let normalized = normalizeWord(input);\n\n if (options?.removeTatweel) {\n normalized = normalized.replace(/\\u0640/g, '');\n }\n\n if (options?.normalizeAlef) {\n normalized = normalized.replace(/[أإآ]/g, 'ا');\n }\n\n if (options?.normalizeYa) {\n normalized = normalized.replace(/ى/g, 'ي');\n }\n\n return normalized;\n};\n\n/**\n * Creates a map of hints organized by their first word.\n *\n * Takes multiple hint strings, splits each into words, and organizes them into\n * a map where the keys are the first words and values are arrays of word arrays.\n *\n * @param {...string} hints - One or more hint strings to process\n * @returns {Hints} A map of hints organized by their first word\n */\nconst DEFAULT_HINT_NORMALIZATION: Required<ArabicNormalizationOptions> = {\n normalizeAlef: true,\n normalizeHamza: false,\n normalizeYa: true,\n removeTatweel: true,\n};\n\n/**\n * Creates normalized hints for robust Arabic matching (diacritics/punctuation tolerant).\n *\n * Breaking change: hints are now normalized by default. This is intended for Arabic ASR.\n *\n * @param first Either the first hint string, or an options object overriding the default normalization.\n * @param restHints Remaining hint strings, if the first argument was an options object.\n * @returns A normalized hint map plus the normalization settings used for matching.\n */\nexport const createHints = (first: ArabicNormalizationOptions | string, ...restHints: string[]): Hints => {\n const map: HintMap = {};\n\n const [options, hints] =\n typeof first === 'string'\n ? [DEFAULT_HINT_NORMALIZATION, [first, ...restHints]]\n : [{ ...DEFAULT_HINT_NORMALIZATION, ...first }, restHints];\n\n for (const hint of hints) {\n const words = hint\n .split(/\\s+/)\n .map((w) => normalizeTokenText(w, options))\n .filter(Boolean);\n\n if (words.length === 0) {\n continue;\n }\n\n const firstWord = words[0];\n if (!map[firstWord]) {\n map[firstWord] = [];\n }\n map[firstWord].push(words);\n }\n\n return { map, normalization: options };\n};\n\n/**\n * Tokenizes ground truth text properly, ensuring punctuation is attached to words\n * rather than creating separate tokens.\n * @param groundTruth The ground truth to tokenize.\n * @returns The tokenized ground truth with the punctuations properly attached.\n */\nexport const tokenizeGroundTruth = (groundTruth: string): string[] => {\n // First, let's split on whitespace and newlines to get word candidates\n const rawTokens = groundTruth\n .trim()\n .split(/\\s+/)\n .map((t) => t.trim())\n .filter(Boolean);\n const result: string[] = [];\n\n for (const token of rawTokens) {\n // Check if this token is just punctuation that should be attached to the previous word\n // Updated regex to properly handle Arabic punctuation and other punctuation marks\n if (result.length > 0 && /^[\\p{P}\\p{S}]+$/u.test(token)) {\n // Attach punctuation to the previous word\n result[result.length - 1] += token;\n } else {\n result.push(token);\n }\n }\n\n return result;\n};\n","type LCSTable = number[][];\n\n/**\n * Builds a dynamic programming table for Longest Common Subsequence (LCS).\n *\n * @param a - Normalized list of original token strings\n * @param b - Normalized list of ground truth words\n * @returns 2D array representing the LCS table (dimensions: (a.length + 1) x (b.length + 1))\n *\n * @complexity O(m * n) where m and n are lengths of `a` and `b`\n */\nexport const buildLcsTable = (a: string[], b: string[]) => {\n const m = a.length;\n const n = b.length;\n const table: LCSTable = Array.from({ length: m + 1 }, () => Array(n + 1).fill(0));\n\n for (let i = 0; i < m; i++) {\n for (let j = 0; j < n; j++) {\n if (a[i] === b[j]) {\n table[i + 1][j + 1] = table[i][j] + 1;\n } else {\n table[i + 1][j + 1] = Math.max(table[i][j + 1], table[i + 1][j]);\n }\n }\n }\n return table;\n};\n\n/**\n * Extracts index pairs of matched words from the LCS table.\n *\n * Backtracks through the LCS table to find all aligned index pairs\n * between the original and ground truth arrays.\n *\n * @param table - LCS dynamic programming table\n * @param original - Normalized original token texts\n * @param ground - Normalized ground truth words\n * @returns Array of match objects with `gtIndex` and `origIndex` pairs\n */\nexport const extractLcsMatches = (table: LCSTable, original: string[], ground: string[]) => {\n const matches = new Map<number, number>();\n let i = original.length;\n let j = ground.length;\n\n while (i > 0 && j > 0) {\n if (original[i - 1] === ground[j - 1]) {\n matches.set(i - 1, j - 1);\n i--;\n j--;\n } else if (table[i - 1][j] >= table[i][j - 1]) {\n i--;\n } else {\n j--;\n }\n }\n return matches;\n};\n","import type { GroundedToken, Hints, Token } from '@/types';\n\nimport { buildLcsTable, extractLcsMatches } from './lcs';\nimport { normalizeWord, tokenizeGroundTruth } from './textUtils';\n\n/**\n * Determines whether any hint phrase in `hints` matches the sequence of normalized token texts\n * starting at the given index.\n *\n * Looks up candidate word arrays under the key `normalizedTokens[index]` in the `hints` map,\n * then for each candidate phrase checks if every word matches the corresponding\n * normalized token text at successive positions.\n *\n * @param normalizedTokens\n * The full array of normalized token text strings being scanned.\n * @param hints\n * A `Hints` map (from first word to arrays of word arrays), as produced by `createHints`.\n * @param index\n * The position in `normalizedTokens` at which to try matching each hint phrase.\n * @returns\n * `true` if at least one hint phrase completely matches the normalized token text starting at `index`;\n * otherwise `false`.\n *\n * @example\n * ```ts\n * const normalizedTokens = ['jump', 'over', 'the', 'moon'];\n * const hints = createHints({ normalizeAlef: false, normalizeYa: false, normalizeHamza: false, removeTatweel: false }, 'jump over', 'the moon');\n *\n * isHintMatched(normalizedTokens, hints, 0);\n * // → true (matches ['jump','over'])\n *\n * isHintMatched(normalizedTokens, hints, 2);\n * // → true (matches ['the','moon'])\n *\n * isHintMatched(normalizedTokens, hints, 1);\n * // → false (no hint starts with 'over')\n * ```\n */\nconst isHintSequenceMatchedAtIndex = (normalizedTokens: string[], words: string[], index: number): boolean => {\n if (index + words.length > normalizedTokens.length) {\n return false;\n }\n\n for (let k = 0; k < words.length; k++) {\n if (normalizedTokens[index + k] !== words[k]) {\n return false;\n }\n }\n\n return true;\n};\n\nexport const isHintMatched = (normalizedTokens: string[], hints: Hints, index: number) => {\n const key = normalizedTokens[index];\n const candidates = hints.map[key];\n\n if (!candidates) {\n return false;\n }\n\n for (const words of candidates) {\n if (isHintSequenceMatchedAtIndex(normalizedTokens, words, index)) {\n return true;\n }\n }\n\n return false;\n};\n\n/**\n * @typedef {object} CreateInsertionTokenProps\n * @property {string[]} gtGap - The list of ground truth words in the current gap.\n * @property {number} gtGapIndex - The index of the current word within the `gtGap`.\n * @property {Token[]} tokenGap - The list of original tokens in the current gap.\n * @property {Token | null} prevToken - The last processed token before the gap.\n * @property {Token} nextToken - The next anchor token that defines the end of the gap.\n */\ntype CreateInsertionTokenProps = {\n gtGap: string[];\n gtGapIndex: number;\n nextToken: Token;\n prevToken: null | Token;\n tokenGap: Token[];\n};\n\n/**\n * Creates a new token for an inserted ground truth word.\n * It estimates the start and end times by distributing the available time\n * within the gap between the previous and next anchor tokens.\n *\n * @param {string} text - The text of the token to be inserted.\n * @param {CreateInsertionTokenProps} props - The contextual information for the insertion.\n * @returns {Token} A new token with estimated timing.\n */\nconst createInsertionToken = (\n text: string,\n { gtGap, gtGapIndex, nextToken, prevToken, tokenGap }: CreateInsertionTokenProps,\n): Token => {\n const gapStartTime = prevToken?.end ?? 0;\n const gapEndTime = nextToken.start;\n const timeAvailable = Math.max(0, gapEndTime - gapStartTime);\n\n // Distribute the available time amongst all words that need to be inserted in this gap.\n const itemsToInsert = gtGap.length - tokenGap.length;\n const timePerItem = itemsToInsert > 0 ? timeAvailable / itemsToInsert : 0;\n\n // Calculate the position of *this specific word* within the set of insertions.\n const insertionIndex = gtGapIndex - tokenGap.length;\n const start = gapStartTime + insertionIndex * timePerItem;\n const end = start + timePerItem;\n\n return { end, start, text };\n};\n\n/**\n * Identifies and returns a sorted list of reliable alignment points (anchors)\n * between the token and ground truth sequences.\n * @returns An array of [tokenIndex, gtIndex] pairs.\n */\nconst findAnchors = (tokens: Token[], groundTruthWords: string[]): [number, number][] => {\n const normalizedTokens = tokens.map((t) => normalizeWord(t.text));\n const normalizedGTWords = groundTruthWords.map(normalizeWord);\n\n const lcsTable = buildLcsTable(normalizedTokens, normalizedGTWords);\n const lcsMatches = extractLcsMatches(lcsTable, normalizedTokens, normalizedGTWords);\n\n // Enforce hard constraints for first and last tokens.\n lcsMatches.set(0, 0);\n if (tokens.length > 1 && groundTruthWords.length > 1) {\n lcsMatches.set(tokens.length - 1, groundTruthWords.length - 1);\n }\n\n // Sort and filter to ensure anchors are strictly increasing.\n return Array.from(lcsMatches.entries())\n .sort((a, b) => a[0] - b[0])\n .filter((v, i, a) => !i || v[1] > a[i - 1][1]);\n};\n\n/**\n * Processes the segments (gaps) between a set of anchor points.\n * @returns An object containing the aligned tokens and the last processed indices.\n */\nconst processGaps = (\n tokens: Token[],\n groundTruthWords: string[],\n anchors: [number, number][],\n): {\n lastGtIndex: number;\n lastTokenIndex: number;\n result: GroundedToken[];\n} => {\n const result: GroundedToken[] = [];\n let lastTokenIndex = -1;\n let lastGtIndex = -1;\n\n const pushInsertion = (gtGap: string[], gtGapIndex: number, tokenGap: Token[], currentTokenIndex: number) => {\n return createInsertionToken(gtGap[gtGapIndex], {\n gtGap,\n gtGapIndex,\n nextToken: tokens[currentTokenIndex],\n prevToken: lastTokenIndex === -1 ? null : tokens[lastTokenIndex],\n tokenGap,\n });\n };\n\n for (const [currentTokenIndex, currentGtIndex] of anchors) {\n const tokenGap = tokens.slice(lastTokenIndex + 1, currentTokenIndex);\n const gtGap = groundTruthWords.slice(lastGtIndex + 1, currentGtIndex);\n\n let tokenGapIndex = 0;\n let gtGapIndex = 0;\n\n while (tokenGapIndex < tokenGap.length || gtGapIndex < gtGap.length) {\n if (tokenGapIndex >= tokenGap.length) {\n result.push(pushInsertion(gtGap, gtGapIndex, tokenGap, currentTokenIndex));\n gtGapIndex++;\n continue;\n }\n\n if (gtGapIndex >= gtGap.length) {\n result.push({ ...tokenGap[tokenGapIndex], isUnknown: true });\n tokenGapIndex++;\n continue;\n }\n\n result.push({ ...tokenGap[tokenGapIndex], text: gtGap[gtGapIndex] });\n tokenGapIndex++;\n gtGapIndex++;\n }\n\n result.push({\n ...tokens[currentTokenIndex],\n text: groundTruthWords[currentGtIndex],\n });\n\n lastTokenIndex = currentTokenIndex;\n lastGtIndex = currentGtIndex;\n }\n\n return { lastGtIndex, lastTokenIndex, result };\n};\n\n/**\n * Processes any remaining tokens and ground truth words after the last anchor.\n * This function mutates the `result` array by appending the final tokens.\n */\nconst processFinalTail = (\n result: GroundedToken[],\n tokens: Token[],\n groundTruthWords: string[],\n lastTokenIndex: number,\n lastGtIndex: number,\n): void => {\n const finalTokenGap = tokens.slice(lastTokenIndex + 1);\n const finalGtGap = groundTruthWords.slice(lastGtIndex + 1);\n\n // With enforced first/last anchors (when both sequences have length > 1), we should\n // not have any remaining ground-truth words after the last anchor. The only realistic\n // tail we can handle is extra tokens (mark as unknown).\n if (finalGtGap.length > 0) {\n return;\n }\n\n for (const token of finalTokenGap) {\n result.push({ ...token, isUnknown: true });\n }\n};\n\n/**\n * Distributes the words from the ground truth into their matching indices in the tokens.\n * If a token cannot be matched, it is marked with an `isUnknown` flag.\n * This function orchestrates the alignment process through a series of helper functions.\n *\n * @param tokens The word-by-word tokens from the AI.\n * @param groundTruth The human-agent verified text for the transcription.\n * @returns The corrected tokens with a best-effort of the ground truth values applied.\n */\nexport const syncTokensWithGroundTruth = (tokens: Token[], groundTruth: string): GroundedToken[] => {\n if (tokens.length === 0) {\n return [];\n }\n\n const groundTruthWords = tokenizeGroundTruth(groundTruth);\n if (groundTruthWords.length === 0) {\n return tokens.map((token) => ({ ...token, isUnknown: true }));\n }\n\n // 1. Find reliable alignment points (anchors).\n const anchors = findAnchors(tokens, groundTruthWords);\n\n // 2. Process the segments between the anchors.\n const { lastGtIndex, lastTokenIndex, result } = processGaps(tokens, groundTruthWords, anchors);\n\n // 3. Process any remaining tokens after the last anchor.\n processFinalTail(result, tokens, groundTruthWords, lastTokenIndex, lastGtIndex);\n\n return result;\n};\n","import type {\n GroundedSegment,\n MarkAndCombineSegmentsOptions,\n MarkedSegment,\n MarkedToken,\n MarkTokensWithDividersOptions,\n Segment,\n Token,\n} from './types';\n\nimport { ALWAYS_BREAK, SEGMENT_BREAK } from './utils/constants';\nimport { createHints, formatSecondsToTimestamp, isEndingWithPunctuation, normalizeTokenText } from './utils/textUtils';\nimport { isHintMatched, syncTokensWithGroundTruth } from './utils/transcriptUtils';\n\n/**\n * Estimates a segment with word-level tokens from a single token with multi-word text.\n * Splits the text by whitespace and calculates approximate timing for each word.\n *\n * @param {Token} param0 - The source token containing text with multiple words\n * @param {number} param0.end - End time of the token in seconds\n * @param {number} param0.start - Start time of the token in seconds\n * @param {string} param0.text - The multi-word text content\n * @returns {Segment} A segment with the original text and estimated word-level tokens\n */\nexport const estimateSegmentFromToken = ({ end, start, text }: Token): Segment => {\n const words = text.split(/\\s+/);\n const totalTokens = words.length;\n const segmentDuration = end - start;\n const tokenDuration = segmentDuration / totalTokens;\n\n const tokens = words.map((word, i) => ({\n end: start + (i + 1) * tokenDuration,\n start: start + i * tokenDuration,\n text: word,\n }));\n\n return { end, start, text, tokens };\n};\n\n/**\n * Marks tokens with segment dividers based on various criteria including:\n * - Filler words (uh, umm, etc.)\n * - Explicit multi-word hints\n * - Significant time gaps between tokens\n * - Punctuation at the end of tokens\n *\n * @param {Token[]} tokens - Array of tokens to process\n * @param {Object} options - Configuration options\n * @param {string[]} [options.fillers] - Optional array of filler words to mark as segment breaks\n * @param {number} options.gapThreshold - Minimum time gap (in seconds) to consider a segment break\n * @param {Hints} [options.hints] - Hints created with the createHints() function to indicate when to insert a new segment break.\n * @returns {MarkedToken[]} Tokens with segment break markers inserted\n */\nexport const markTokensWithDividers = (\n tokens: Token[],\n { fillers = [], gapThreshold, hints }: MarkTokensWithDividersOptions,\n): MarkedToken[] => {\n const marked: MarkedToken[] = [];\n let prevEnd: null | number = null;\n const normalizedTexts = hints ? tokens.map((t) => normalizeTokenText(t.text, hints.normalization)) : null;\n\n for (let idx = 0; idx < tokens.length; idx++) {\n const token = tokens[idx];\n\n // Filler words always break\n if (fillers.includes(token.text)) {\n marked.push(SEGMENT_BREAK);\n continue;\n }\n\n if (hints && normalizedTexts && isHintMatched(normalizedTexts, hints, idx)) {\n marked.push(ALWAYS_BREAK);\n }\n\n // Large time gap triggers a break\n if (prevEnd !== null && token.start - prevEnd > gapThreshold) {\n marked.push(SEGMENT_BREAK);\n }\n\n // Push the token itself\n marked.push(token);\n\n // Punctuation at end triggers a break\n if (isEndingWithPunctuation(token.text)) {\n marked.push(SEGMENT_BREAK);\n }\n\n prevEnd = token.end;\n }\n\n return marked;\n};\n\n/**\n * Groups marked tokens into segments based on maximum segment duration.\n * Creates segments from tokens, splitting when the duration exceeds the specified maximum.\n *\n * @param {MarkedToken[]} markedTokens - Array of tokens with segment break markers\n * @param {number} maxSecondsPerSegment - Maximum duration (in seconds) for a segment\n * @returns {MarkedSegment[]} Array of marked segments\n */\nexport const groupMarkedTokensIntoSegments = (\n markedTokens: MarkedToken[],\n maxSecondsPerSegment: number,\n): MarkedSegment[] => {\n const segments: MarkedSegment[] = [];\n let currentSegment: MarkedToken[] = [];\n let segmentStart: null | number = null;\n let segmentEnd: null | number = null;\n\n const flush = () => {\n if (currentSegment.length === 0) {\n return;\n }\n if (segmentStart === null || segmentEnd === null) {\n return;\n }\n segments.push({ end: segmentEnd, start: segmentStart, tokens: currentSegment });\n };\n\n const reset = () => {\n currentSegment = [];\n segmentStart = null;\n segmentEnd = null;\n };\n\n const durationExceeded = () => {\n if (segmentStart === null || segmentEnd === null) {\n return false;\n }\n return segmentEnd - segmentStart > maxSecondsPerSegment;\n };\n\n for (let i = 0; i < markedTokens.length; i++) {\n const token = markedTokens[i];\n const next = markedTokens[i + 1];\n const nextIsDivider = next === SEGMENT_BREAK || next === ALWAYS_BREAK;\n\n if (token === ALWAYS_BREAK) {\n flush();\n reset();\n currentSegment = [ALWAYS_BREAK];\n continue;\n }\n\n if (token !== SEGMENT_BREAK) {\n if (segmentStart === null) {\n segmentStart = token.start;\n }\n\n segmentEnd = token.end;\n }\n\n currentSegment.push(token);\n\n if (nextIsDivider && durationExceeded()) {\n flush();\n reset();\n }\n }\n\n flush();\n\n return segments;\n};\n\n/**\n * Merges segments with fewer than the specified minimum words into the previous segment.\n * This helps avoid very short segments that might break the flow of text.\n *\n * @param {MarkedSegment[]} segments - Array of marked segments to process\n * @param {number} minWordsPerSegment - Minimum number of words required for a segment to stand alone\n * @returns {MarkedSegment[]} Array of merged segments\n */\nexport const mergeShortSegmentsWithPrevious = (\n segments: MarkedSegment[],\n minWordsPerSegment: number,\n): MarkedSegment[] => {\n const result: MarkedSegment[] = [];\n\n for (const segment of segments) {\n const wordTokens = segment.tokens.filter((t) => t !== SEGMENT_BREAK && t !== ALWAYS_BREAK);\n\n const isHardBoundary = segment.tokens.includes(ALWAYS_BREAK);\n\n if (!isHardBoundary && wordTokens.length < minWordsPerSegment && result.length > 0) {\n const prev = result[result.length - 1];\n prev.tokens.push(...segment.tokens);\n prev.end = segment.end;\n } else {\n result.push({ ...segment });\n }\n }\n\n return result;\n};\n\nconst formatMarkedSegmentToLines = (\n segment: MarkedSegment,\n maxSecondsPerLine: number,\n formatTokens?: (buffer: Token) => string,\n): string[] => {\n const lines: string[] = [];\n let buffer: Token[] = [];\n let bufferStart: null | number = null;\n\n const pushBufferAsLine = () => {\n if (buffer.length === 0) {\n return;\n }\n\n const text = buffer.map((t) => t.text).join(' ');\n lines.push(\n formatTokens\n ? formatTokens({\n end: buffer.at(-1)!.end,\n start: buffer[0].start,\n text,\n })\n : `${formatSecondsToTimestamp(buffer[0].start)}: ${text}`,\n );\n\n buffer = [];\n bufferStart = null;\n };\n\n const shouldFlushOnSoftBreak = () => {\n if (buffer.length === 0) {\n return false;\n }\n const bufferEnd = buffer[buffer.length - 1].end;\n const duration = bufferStart !== null ? bufferEnd - bufferStart : 0;\n if (duration < maxSecondsPerLine) {\n return false;\n }\n return isEndingWithPunctuation(buffer[buffer.length - 1].text);\n };\n\n for (const token of segment.tokens) {\n if (token === ALWAYS_BREAK) {\n pushBufferAsLine();\n continue;\n }\n\n if (token === SEGMENT_BREAK) {\n if (shouldFlushOnSoftBreak()) {\n pushBufferAsLine();\n }\n continue;\n }\n\n if (bufferStart === null) {\n bufferStart = token.start;\n }\n buffer.push(token);\n }\n\n pushBufferAsLine();\n return lines;\n};\n\n/**\n * Formats segments into a timestamped transcript with timestamps at the beginning of each line.\n * Lines are split based on segment breaks and maximum line duration.\n *\n * @param {MarkedSegment[]} segments - Array of marked segments to format\n * @param {number} maxSecondsPerLine - Maximum duration (in seconds) for a single line\n * @param {(buffer: Token) => string} [formatTokens] - Optional formatter that receives the buffered token range\n * and returns the formatted line. When omitted the function emits timestamp-prefixed strings.\n * @returns {string} Formatted transcript with timestamps\n */\nexport const formatSegmentsToTimestampedTranscript = (\n segments: MarkedSegment[],\n maxSecondsPerLine: number,\n formatTokens?: (buffer: Token) => string,\n): string => {\n return segments\n .flatMap((segment) => formatMarkedSegmentToLines(segment, maxSecondsPerLine, formatTokens))\n .join('\\n');\n};\n\n/**\n * Maps marked segments into formatted segments with clean text representation.\n * Combines the tokens into properly formatted text, respecting segment breaks\n * and optional maximum line duration.\n *\n * @param {MarkedSegment[]} segments - Array of marked segments to format\n * @param {number} [maxSecondsPerLine] - Optional maximum duration (in seconds) for a single line\n * @returns {Segment[]} Array of formatted segments with clean text\n */\nexport const mapSegmentsIntoFormattedSegments = (segments: MarkedSegment[], maxSecondsPerLine?: number): Segment[] => {\n return segments.map((segment) => {\n const textParts: string[] = [];\n const flattenedTokens: Token[] = [];\n let buffer: Token[] = [];\n let bufferStart: null | number = null;\n\n const pushBufferAsLine = () => {\n if (buffer.length > 0) {\n textParts.push(buffer.map((t) => t.text).join(' '));\n buffer = [];\n bufferStart = null;\n }\n };\n\n const shouldFlushOnSoftBreak = () => {\n if (!maxSecondsPerLine) {\n return true;\n }\n if (buffer.length === 0) {\n return false;\n }\n const bufferEnd = buffer[buffer.length - 1].end;\n const duration = bufferStart !== null ? bufferEnd - bufferStart : 0;\n return duration > maxSecondsPerLine;\n };\n\n for (const token of segment.tokens) {\n if (token === ALWAYS_BREAK) {\n pushBufferAsLine();\n continue;\n }\n\n if (token === SEGMENT_BREAK) {\n if (shouldFlushOnSoftBreak()) {\n pushBufferAsLine();\n }\n continue;\n }\n\n if (bufferStart === null) {\n bufferStart = token.start;\n }\n buffer.push(token);\n flattenedTokens.push(token);\n }\n\n pushBufferAsLine();\n\n return {\n end: segment.end,\n start: segment.start,\n text: textParts.join('\\n'),\n tokens: flattenedTokens,\n };\n });\n};\n\n/**\n * Convenience function that processes segments through all steps:\n * marking tokens with dividers, grouping into segments, and merging short segments.\n *\n * @param {Segment[]} segments - Array of input segments to process\n * @param {Object} options - Configuration options\n * @param {string[]} options.fillers - Array of filler words to mark as segment breaks\n * @param {number} options.gapThreshold - Minimum time gap (in seconds) to consider a segment break\n * @param {number} options.maxSecondsPerSegment - Maximum duration (in seconds) for a segment\n * @param {number} options.minWordsPerSegment - Minimum number of words required for a segment to stand alone\n * @returns {MarkedSegment[]} Array of processed and marked segments\n */\nexport const markAndCombineSegments = (segments: Segment[], options: MarkAndCombineSegmentsOptions) => {\n const tokens = segments.flatMap((segment) => segment.tokens!);\n let markedTokens = markTokensWithDividers(tokens, {\n fillers: options.fillers,\n gapThreshold: options.gapThreshold,\n ...(options.hints && { hints: options.hints }),\n });\n markedTokens = cleanupIsolatedTokens(markedTokens);\n const markedSegments = groupMarkedTokensIntoSegments(markedTokens, options.maxSecondsPerSegment);\n const combinedSegments = mergeShortSegmentsWithPrevious(markedSegments, options.minWordsPerSegment);\n\n return combinedSegments;\n};\n\n/**\n * Cleans up marked tokens by removing unnecessary segment breaks that would\n * cause individual tokens to appear on their own lines.\n *\n * @param {MarkedToken[]} markedTokens - The array of marked tokens to clean up\n * @returns {MarkedToken[]} A new array with unnecessary breaks removed\n */\nexport const cleanupIsolatedTokens = (markedTokens: MarkedToken[]): MarkedToken[] => {\n const result: MarkedToken[] = [];\n\n for (let i = 0; i < markedTokens.length; i++) {\n const current = markedTokens[i];\n const next = markedTokens[i + 1];\n const future = markedTokens[i + 2];\n\n if (current === SEGMENT_BREAK && (next === ALWAYS_BREAK || next === SEGMENT_BREAK)) {\n // skip current break since we're placing a break anyways\n } else if (current === SEGMENT_BREAK && (future === SEGMENT_BREAK || future === ALWAYS_BREAK || !future)) {\n // skip current break since we don't want to put a word by itself\n } else if (current === SEGMENT_BREAK && result.at(-1) === SEGMENT_BREAK) {\n // skip duplicate break\n } else {\n result.push(current);\n }\n }\n\n return result;\n};\n\n/**\n * Aligns AI-generated tokens to a ground truth human-edited segment text.\n *\n * Uses Longest Common Subsequence (LCS) to identify anchor matches between\n * tokenized output and ground truth. Where no matches exist, it interpolates\n * timestamped tokens for unmatched words.\n *\n * @param segment - A `Segment` object with ground truth `text` and AI-generated `tokens`\n * @param groundTruth - The ground truth text to apply to the segment's text and its tokens.\n * @returns A new `GroundedSegment` with the `tokens` adjusted to match the ground truth `text`\n * along with any unmatched tokens flagged.\n */\nexport const updateSegmentWithGroundTruth = (segment: Segment, groundTruth: string): GroundedSegment => {\n return {\n end: segment.end,\n start: segment.start,\n text: groundTruth,\n tokens: syncTokensWithGroundTruth(segment.tokens, groundTruth),\n };\n};\n\n/**\n * Produces a segment with the ground truth replacing the text and its respective tokens.\n * @param segment The segment to replace the ground truth with.\n * @param groundTruth The human verified transcription of the segment.\n * @returns A segment with the ground truth applies to the segment text and its tokens.\n */\nexport const applyGroundTruthToSegment = (segment: Segment, groundTruth: string): Segment => {\n const result = updateSegmentWithGroundTruth(segment, groundTruth);\n return { ...result, tokens: result.tokens.filter((t) => !t.isUnknown) };\n};\n\n/**\n * Merges multiple segments into a single segment.\n *\n * @param segments - Array of segments to merge into one\n * @param delimiter - Optional string to join segment texts (defaults to space)\n * @returns A single merged segment containing all tokens\n */\nexport const mergeSegments = (segments: Segment[], delimiter = ' '): Segment => {\n const text = segments.map((segment) => segment.text).join(delimiter);\n const tokens = segments.flatMap((segment) => segment.tokens);\n\n return {\n end: segments.at(-1)!.end,\n start: segments[0].start,\n text,\n tokens,\n };\n};\n\n/**\n * Splits a segment at a specific time point into exactly two segments.\n *\n * This function does the opposite of mergeSegments, taking a single segment\n * and dividing it into two segments at the specified split time.\n *\n * @param segment - The segment to split\n * @param splitTime - The time (in seconds) at which to split the segment\n * @returns An array containing exactly two segments\n */\nexport const splitSegment = (segment: Segment, splitTime: number): Segment[] => {\n const firstTokens = segment.tokens.filter((token) => token.start < splitTime);\n const secondTokens = segment.tokens.filter((token) => token.start >= splitTime);\n\n const firstText = firstTokens.map((token) => token.text).join(' ');\n const secondText = secondTokens.map((token) => token.text).join(' ');\n\n return [\n {\n end: firstTokens.at(-1)!.end,\n start: segment.start,\n text: firstText,\n tokens: firstTokens,\n },\n {\n end: segment.end,\n start: secondTokens[0].start,\n text: secondText,\n tokens: secondTokens,\n },\n ];\n};\n\n/**\n * Searches through an array of tokens and returns the first one whose text sequence\n * matches the given query string.\n *\n * This function will split the `query` into one or more hint phrases (via `createHints`),\n * then scan `tokens` in order, returning the first token at which any hint sequence\n * fully matches the subsequent tokens.\n *\n * @param tokens\n * An ordered array of `Token` objects to search.\n * @param query\n * A string containing one or more words to match. If you pass multiple words\n * (e.g. `\"hello world\"`), it will only match if `\"hello\"` at position `i` is\n * immediately followed by `\"world\"` at position `i+1`.\n * @returns\n * The first `Token` in the array where the hint sequence matches, or `null`\n * if no matching sequence is found.\n *\n * @example\n * ```ts\n * const tokens: Token[] = [\n * { start: 0, end: 1, text: 'the' },\n * { start: 1, end: 2, text: 'quick' },\n * { start: 2, end: 3, text: 'brown' },\n * { start: 3, end: 4, text: 'fox' },\n * ];\n *\n * getFirstMatchingToken(tokens, 'quick brown');\n * // → { start: 1, end: 2, text: 'quick' }\n *\n * getFirstMatchingToken(tokens, 'lazy dog');\n * // → null\n * ```\n */\nexport const getFirstMatchingToken = (tokens: Token[], query: string): null | Token => {\n const hints = createHints(query);\n const normalizedTexts = tokens.map((t) => normalizeTokenText(t.text, hints.normalization));\n\n for (let i = 0; i < tokens.length; i++) {\n if (isHintMatched(normalizedTexts, hints, i)) {\n return tokens[i];\n }\n }\n\n return null;\n};\n\n/**\n * Finds and returns the first token in a segment whose character‐range fully contains\n * the given [selectionStart, selectionEnd) range.\n *\n * This is useful when you have a selection in the raw `segment.text` (for example, from\n * an <input>’s `selectionStart` and `selectionEnd`) and you want to map that back to the\n * corresponding timed `Token`.\n *\n * @param segment The Segment object containing the full `text` and an ordered list of `tokens`.\n * @param selectionStart\n * The zero‐based index into `segment.text` where the selection begins (inclusive).\n * @param selectionEnd\n * The zero‐based index into `segment.text` where the selection ends (exclusive).\n * @returns\n * The first `Token` whose span in `segment.text` covers the entire selection range or `null` if it is not found.\n *\n * @example\n * ```ts\n * const segment: Segment = {\n * text: 'the fox and the rabbit',\n * start: 0,\n * end: 6,\n * tokens: [\n * { start: 0, end: 1, text: 'the' },\n * { start: 2, end: 3, text: 'fox' },\n * { start: 3, end: 4, text: 'and' },\n * { start: 4, end: 5, text: 'the' },\n * { start: 5, end: 6, text: 'rabbit' },\n * ],\n * };\n *\n * // Suppose the user selected the second \"the\" in an <input>,\n * // which corresponds to characters 12–15 (exclusive end):\n * const tok = getFirstTokenForSelection(segment, 12, 15);\n * // tok === { start: 4, end: 5, text: 'the' }\n * ```\n */\nexport const getFirstTokenForSelection = (\n segment: Segment,\n selectionStart: number,\n selectionEnd: number, // exclusive\n): null | Token => {\n const { text, tokens } = segment;\n\n // Keep track of where we last matched, so duplicate words\n // resolve to the correct occurrence.\n let searchPos = 0;\n\n for (const token of tokens) {\n // Find the next occurrence of this token in the text\n const charStart = text.indexOf(token.text, searchPos);\n\n if (charStart === -1) {\n continue; // mismatch guard\n }\n\n const charEnd = charStart + token.text.length; // exclusive\n\n // Advance past this token (plus one for the space separator)\n searchPos = charEnd + 1;\n\n // Because selectionEnd is exclusive, we can test containment simply:\n if (selectionStart >= charStart && selectionEnd <= charEnd) {\n return token;\n }\n }\n\n return null;\n};\n","import type { GeneratedHint, GenerateHintsOptions, Segment, Token } from '@/types';\n\nimport { normalizeTokenText } from './textUtils';\n\ntype CandidateStats = {\n count: number;\n firstOccurrenceIndex: number;\n // Tracking occurrences is needed for safe closed-dedupe; we cap and refuse to dedupe if truncated.\n occurrenceIndices: number[];\n occurrencesTruncated: boolean;\n surfaceCounts: Map<string, number>;\n};\n\ntype InternalOptions = Required<Pick<GenerateHintsOptions, 'dedupe' | 'maxN' | 'minCount' | 'minN' | 'topK'>> & {\n normalization: Required<NonNullable<GenerateHintsOptions['normalization']>>;\n stopwords: string[];\n};\n\nconst DEFAULT_NORMALIZATION: Required<NonNullable<GenerateHintsOptions['normalization']>> = {\n normalizeAlef: true,\n normalizeHamza: false,\n normalizeYa: true,\n removeTatweel: true,\n};\n\nconst DEFAULTS: Required<Pick<GenerateHintsOptions, 'dedupe' | 'maxN' | 'minCount' | 'minN'>> = {\n dedupe: 'closed',\n maxN: 6,\n minCount: 2,\n minN: 2,\n};\n\nconst OCCURRENCE_CAP_FOR_DEDUPE = 5000;\nconst SURFACE_VARIANTS_CAP = 5;\n\nconst makeKey = (normalizedWords: string[]): string => JSON.stringify(normalizedWords);\n\nconst parseKey = (key: string): string[] => JSON.parse(key) as string[];\n\nconst isAllStopwords = (words: string[], stopwords: string[]) => {\n if (stopwords.length === 0) {\n return false;\n }\n return words.every((w) => stopwords.includes(w));\n};\n\nconst pickTopSurfaces = (surfaceCounts: Map<string, number>, max = 3): string[] => {\n return Array.from(surfaceCounts.entries())\n .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]))\n .slice(0, max)\n .map(([surface]) => surface);\n};\n\nconst addSurfaceVariant = (stats: CandidateStats, surface: string) => {\n const current = stats.surfaceCounts.get(surface);\n if (current !== undefined) {\n stats.surfaceCounts.set(surface, current + 1);\n return;\n }\n\n // Keep surfaceCounts bounded to avoid unbounded growth from punctuation variance.\n if (stats.surfaceCounts.size >= SURFACE_VARIANTS_CAP) {\n // If full, replace the lowest-frequency entry only if this one is likely to be common.\n // We don't know that yet, so just ignore new variants when full.\n return;\n }\n\n stats.surfaceCounts.set(surface, 1);\n};\n\nconst recordOccurrence = (stats: CandidateStats, index: number) => {\n if (stats.occurrenceIndices.length < OCCURRENCE_CAP_FOR_DEDUPE) {\n stats.occurrenceIndices.push(index);\n } else {\n stats.occurrencesTruncated = true;\n }\n};\n\nconst canClosedDedupe = (stats: CandidateStats) => !stats.occurrencesTruncated;\n\nconst arrayToSet = (items: number[]) => new Set(items);\n\nconst setEquals = (a: Set<number>, b: Set<number>) => a.size === b.size && Array.from(a).every((item) => b.has(item));\n\nconst getKeyLength = (key: string, cache: Map<string, number>): number => {\n const existing = cache.get(key);\n if (existing !== undefined) {\n return existing;\n }\n const len = parseKey(key).length;\n cache.set(key, len);\n return len;\n};\n\nconst getSortedCandidatesForDedupe = (\n candidates: Map<string, CandidateStats>,\n keyLengthCache: Map<string, number>,\n): [string, CandidateStats][] => {\n return Array.from(candidates.entries()).sort((a, b) => {\n const lenA = getKeyLength(a[0], keyLengthCache);\n const lenB = getKeyLength(b[0], keyLengthCache);\n return lenB - lenA || b[1].count - a[1].count;\n });\n};\n\nconst derivedStartSetForOffset = (longStarts: Set<number>, offset: number): Set<number> => {\n const derived = new Set<number>();\n for (const start of longStarts) {\n derived.add(start + offset);\n }\n return derived;\n};\n\nconst isSubphraseRemovableAtOffset = (\n candidates: Map<string, CandidateStats>,\n longStats: CandidateStats,\n longWords: string[],\n offset: number,\n subLen: number,\n): string | null => {\n const subKey = makeKey(longWords.slice(offset, offset + subLen));\n const subStats = candidates.get(subKey);\n if (!subStats) {\n return null;\n }\n if (subStats.count !== longStats.count) {\n return null;\n }\n\n const longStarts = arrayToSet(longStats.occurrenceIndices);\n const derived = derivedStartSetForOffset(longStarts, offset);\n const subStarts = arrayToSet(subStats.occurrenceIndices);\n\n return setEquals(derived, subStarts) ? subKey : null;\n};\n\nconst applyClosedDedupSafe = (candidates: Map<string, CandidateStats>): Set<string> => {\n const removable = new Set<string>();\n const keyLengthCache = new Map<string, number>();\n const sorted = getSortedCandidatesForDedupe(candidates, keyLengthCache);\n\n for (const [longKey, longStats] of sorted) {\n if (!canClosedDedupe(longStats)) {\n continue;\n }\n\n const longWords = parseKey(longKey);\n const longLen = longWords.length;\n\n for (let subLen = 2; subLen < longLen; subLen++) {\n for (let offset = 0; offset + subLen <= longLen; offset++) {\n const subKey = isSubphraseRemovableAtOffset(candidates, longStats, longWords, offset, subLen);\n if (subKey) {\n removable.add(subKey);\n }\n }\n }\n }\n\n return removable;\n};\n\nconst resolveOptions = (options?: GenerateHintsOptions): InternalOptions => {\n return {\n dedupe: options?.dedupe ?? DEFAULTS.dedupe,\n maxN: options?.maxN ?? DEFAULTS.maxN,\n minCount: options?.minCount ?? DEFAULTS.minCount,\n minN: options?.minN ?? DEFAULTS.minN,\n normalization: { ...DEFAULT_NORMALIZATION, ...(options?.normalization ?? {}) },\n stopwords: options?.stopwords ?? [],\n topK: options?.topK ?? Number.POSITIVE_INFINITY,\n };\n};\n\nconst normalizeTokenStream = (tokens: Token[], options: InternalOptions) =>\n tokens.map((t) => normalizeTokenText(t.text, options.normalization));\n\nconst countNgrams = (normalizedTokens: string[], options: InternalOptions): Map<string, number> => {\n const counts = new Map<string, number>();\n\n for (let i = 0; i < normalizedTokens.length; i++) {\n for (let n = options.minN; n <= options.maxN; n++) {\n if (i + n > normalizedTokens.length) {\n break;\n }\n const slice = normalizedTokens.slice(i, i + n);\n if (slice.some((s) => !s)) {\n continue;\n }\n if (isAllStopwords(slice, options.stopwords)) {\n continue;\n }\n const key = makeKey(slice);\n counts.set(key, (counts.get(key) ?? 0) + 1);\n }\n }\n\n return counts;\n};\n\nconst selectCandidateKeys = (counts: Map<string, number>, minCount: number): Set<string> => {\n const candidateKeys = new Set<string>();\n for (const [key, count] of counts) {\n if (count >= minCount) {\n candidateKeys.add(key);\n }\n }\n return candidateKeys;\n};\n\nconst initCandidateStats = (counts: Map<string, number>, candidateKeys: Set<string>): Map<string, CandidateStats> => {\n const candidates = new Map<string, CandidateStats>();\n for (const key of candidateKeys) {\n candidates.set(key, {\n count: counts.get(key)!,\n firstOccurrenceIndex: Number.POSITIVE_INFINITY,\n occurrenceIndices: [],\n occurrencesTruncated: false,\n surfaceCounts: new Map(),\n });\n }\n return candidates;\n};\n\nconst collectCandidateStats = (\n tokens: Token[],\n normalizedTokens: string[],\n options: InternalOptions,\n candidateKeys: Set<string>,\n counts: Map<string, number>,\n): Map<string, CandidateStats> => {\n const candidates = initCandidateStats(counts, candidateKeys);\n\n const collectAt = (startIndex: number, n: number) => {\n const slice = normalizedTokens.slice(startIndex, startIndex + n);\n if (slice.some((s) => !s)) {\n return;\n }\n\n const key = makeKey(slice);\n if (!candidateKeys.has(key)) {\n return;\n }\n\n const stats = candidates.get(key)!;\n\n stats.firstOccurrenceIndex = Math.min(stats.firstOccurrenceIndex, startIndex);\n recordOccurrence(stats, startIndex);\n\n const surface = tokens\n .slice(startIn