UNPKG

paragrafs

Version:

A lightweight TypeScript library designed to reconstruct paragraphs from AI transcriptions.

406 lines 17.3 kB
//#region src/utils/constants.d.ts
/**
 * Constant used to mark segment breaks during processing.
 */
declare const SEGMENT_BREAK = "SEGMENT_BREAK";
type SegmentBreakMarker = typeof SEGMENT_BREAK;
/**
 * Constant used to mark that we should always start a break when encountering this.
 */
declare const ALWAYS_BREAK = "ALWAYS_BREAK";
type AlwaysBreakMarker = typeof ALWAYS_BREAK;
//#endregion
//#region src/types.d.ts
/**
 * Represents a segment that was updated with the ground truth values.
 * Identical to `Segment` except that its tokens are `GroundedToken`s.
 */
type GroundedSegment = Omit<Segment, 'tokens'> & {
  tokens: GroundedToken[];
};
/**
 * Represents a token that was matched or unmatched during sync with the ground truth value.
 */
type GroundedToken = Token & {
  /** If this is true it means this token was not matched during the ground truth syncing */
  isUnknown?: boolean;
};
/**
 * Lookup table of hints: each string key maps to a list of string sequences.
 * NOTE(review): presumably the key is the first normalized word of a hint and each
 * sequence is the full multi-word hint — confirm against the `createHints` implementation.
 */
type HintMap = Record<string, string[][]>;
/**
 * Optional Arabic-specific normalizations that can be applied on top of `normalizeWord`.
 * Every flag is optional; the defaults are decided by the consuming function.
 */
type ArabicNormalizationOptions = {
  normalizeAlef?: boolean;
  normalizeHamza?: boolean;
  normalizeYa?: boolean;
  removeTatweel?: boolean;
};
/**
 * Options accepted by `generateHintsFromTokens` and `generateHintsFromSegments`.
 */
type GenerateHintsOptions = {
  /**
   * With `'segment'`, mined phrases cannot cross segment boundaries
   * (see `generateHintsFromSegments`).
   */
  boundaryStrategy?: 'none' | 'segment';
  dedupe?: 'closed' | 'none';
  /** NOTE(review): presumably the largest n-gram length to mine — confirm. */
  maxN?: number;
  /** NOTE(review): presumably the minimum number of occurrences for a phrase to be reported — confirm. */
  minCount?: number;
  /** NOTE(review): presumably the smallest n-gram length to mine — confirm. */
  minN?: number;
  /** Normalization applied to token text before mining. */
  normalization?: ArabicNormalizationOptions;
  stopwords?: string[];
  /** NOTE(review): presumably caps the number of hints returned — confirm. */
  topK?: number;
};
/**
 * A normalized hint map together with the normalization settings used for matching.
 * Produced by `createHints`.
 */
type Hints = {
  map: HintMap;
  normalization: Required<ArabicNormalizationOptions>;
};
/**
 * A hint candidate produced by mining frequent n-grams from a token stream.
 */
type GeneratedHint = {
  /** Frequency of the phrase; results are sorted by frequency. */
  count: number;
  firstOccurrenceIndex?: number;
  length: number;
  /** The phrase in normalized form (mining is performed on normalized token text). */
  normalizedPhrase: string;
  /** The most common surface form observed for the normalized phrase. */
  phrase: string;
  topSurfaceForms?: string[];
};
/**
 * Represents a segment during the marking and processing stage.
 * Contains an array of tokens that may include segment break markers.
 */
type MarkedSegment = {
  /**
   * End time of the segment in seconds
   */
  end: number;
  /**
   * Start time of the segment in seconds
   */
  start: number;
  /**
   * Array of tokens and segment break markers that make up this segment
   */
  tokens: MarkedToken[];
};
/**
 * Represents either a token or a segment break marker.
 * Used during the processing of text to identify natural break points.
 */
type MarkedToken = Token | AlwaysBreakMarker | SegmentBreakMarker;
/**
 * Options for `markTokensWithDividers`.
 */
type MarkTokensWithDividersOptions = {
  /** Optional array of filler words to mark as segment breaks */
  fillers?: string[];
  /** Minimum time gap (in seconds) to consider a segment break */
  gapThreshold: number;
  /** Hints created with `createHints()` indicating where to insert a new segment break */
  hints?: Hints;
};
/**
 * Options for `markAndCombineSegments`: everything `markTokensWithDividers` accepts,
 * plus the limits used when grouping and merging segments.
 */
type MarkAndCombineSegmentsOptions = MarkTokensWithDividersOptions & {
  /** Maximum duration (in seconds) for a segment */
  maxSecondsPerSegment: number;
  /** Minimum number of words required for a segment to stand alone */
  minWordsPerSegment: number;
};
/**
 * Represents a segment of text with timing information and optional word-level tokens.
 * A segment is a higher-level structure that contains a sequence of related tokens.
 */
type Segment = Token & {
  /**
   * Word-by-word breakdown of the transcription with individual timings
   */
  tokens: Token[];
};
/**
 * Represents a single token (word or phrase) with timing information.
 * This is the basic unit of transcribed text.
 */
type Token = {
  /**
   * End time in seconds.
   */
  end: number;
  /**
   * Start time in seconds.
   */
  start: number;
  /**
   * The transcribed text
   */
  text: string;
};
//#endregion
//#region src/transcript.d.ts
/**
 * Estimates a segment with word-level tokens from a single token with multi-word text.
 * Splits the text by whitespace and calculates approximate timing for each word.
 *
 * @param {Token} param0 - The source token containing text with multiple words
 * @param {number} param0.end - End time of the token in seconds
 * @param {number} param0.start - Start time of the token in seconds
 * @param {string} param0.text - The multi-word text content
 * @returns {Segment} A segment with the original text and estimated word-level tokens
 */
declare const estimateSegmentFromToken: ({ end, start, text }: Token) => Segment;
/**
 * Marks tokens with segment dividers based on various criteria including:
 * - Filler words (uh, umm, etc.)
 * - Explicit multi-word hints
 * - Significant time gaps between tokens
 * - Punctuation at the end of tokens
 *
 * @param {Token[]} tokens - Array of tokens to process
 * @param {Object} options - Configuration options
 * @param {string[]} [options.fillers] - Optional array of filler words to mark as segment breaks
 * @param {number} options.gapThreshold - Minimum time gap (in seconds) to consider a segment break
 * @param {Hints} [options.hints] - Hints created with the createHints() function to indicate when to insert a new segment break.
 * @returns {MarkedToken[]} Tokens with segment break markers inserted
 */
declare const markTokensWithDividers: (tokens: Token[], { fillers, gapThreshold, hints }: MarkTokensWithDividersOptions) => MarkedToken[];
/**
 * Groups marked tokens into segments based on maximum segment duration.
 * Creates segments from tokens, splitting when the duration exceeds the specified maximum.
 *
 * @param {MarkedToken[]} markedTokens - Array of tokens with segment break markers
 * @param {number} maxSecondsPerSegment - Maximum duration (in seconds) for a segment
 * @returns {MarkedSegment[]} Array of marked segments
 */
declare const groupMarkedTokensIntoSegments: (markedTokens: MarkedToken[], maxSecondsPerSegment: number) => MarkedSegment[];
/**
 * Merges segments with fewer than the specified minimum words into the previous segment.
 * This helps avoid very short segments that might break the flow of text.
 *
 * @param {MarkedSegment[]} segments - Array of marked segments to process
 * @param {number} minWordsPerSegment - Minimum number of words required for a segment to stand alone
 * @returns {MarkedSegment[]} Array of merged segments
 */
declare const mergeShortSegmentsWithPrevious: (segments: MarkedSegment[], minWordsPerSegment: number) => MarkedSegment[];
/**
 * Formats segments into a timestamped transcript with timestamps at the beginning of each line.
 * Lines are split based on segment breaks and maximum line duration.
 *
 * @param {MarkedSegment[]} segments - Array of marked segments to format
 * @param {number} maxSecondsPerLine - Maximum duration (in seconds) for a single line
 * @param {(buffer: Token) => string} [formatTokens] - Optional formatter that receives the buffered token range
 * and returns the formatted line. When omitted the function emits timestamp-prefixed strings.
 * @returns {string} Formatted transcript with timestamps
 */
declare const formatSegmentsToTimestampedTranscript: (segments: MarkedSegment[], maxSecondsPerLine: number, formatTokens?: (buffer: Token) => string) => string;
/**
 * Maps marked segments into formatted segments with clean text representation.
 * Combines the tokens into properly formatted text, respecting segment breaks
 * and optional maximum line duration.
 *
 * @param {MarkedSegment[]} segments - Array of marked segments to format
 * @param {number} [maxSecondsPerLine] - Optional maximum duration (in seconds) for a single line
 * @returns {Segment[]} Array of formatted segments with clean text
 */
declare const mapSegmentsIntoFormattedSegments: (segments: MarkedSegment[], maxSecondsPerLine?: number) => Segment[];
/**
 * Convenience function that processes segments through all steps:
 * marking tokens with dividers, grouping into segments, and merging short segments.
 *
 * @param {Segment[]} segments - Array of input segments to process
 * @param {Object} options - Configuration options
 * @param {string[]} [options.fillers] - Optional array of filler words to mark as segment breaks
 * @param {number} options.gapThreshold - Minimum time gap (in seconds) to consider a segment break
 * @param {Hints} [options.hints] - Optional hints created with the createHints() function to indicate when to insert a new segment break
 * @param {number} options.maxSecondsPerSegment - Maximum duration (in seconds) for a segment
 * @param {number} options.minWordsPerSegment - Minimum number of words required for a segment to stand alone
 * @returns {MarkedSegment[]} Array of processed and marked segments
 */
declare const markAndCombineSegments: (segments: Segment[], options: MarkAndCombineSegmentsOptions) => MarkedSegment[];
/**
 * Cleans up marked tokens by removing unnecessary segment breaks that would
 * cause individual tokens to appear on their own lines.
 *
 * @param {MarkedToken[]} markedTokens - The array of marked tokens to clean up
 * @returns {MarkedToken[]} A new array with unnecessary breaks removed
 */
declare const cleanupIsolatedTokens: (markedTokens: MarkedToken[]) => MarkedToken[];
/**
 * Aligns AI-generated tokens to a ground truth human-edited segment text.
 *
 * Uses Longest Common Subsequence (LCS) to identify anchor matches between
 * tokenized output and ground truth. Where no matches exist, it interpolates
 * timestamped tokens for unmatched words.
 *
 * @param segment - A `Segment` object with ground truth `text` and AI-generated `tokens`
 * @param groundTruth - The ground truth text to apply to the segment's text and its tokens.
 * @returns A new `GroundedSegment` with the `tokens` adjusted to match the ground truth `text`
 * along with any unmatched tokens flagged.
 */
declare const updateSegmentWithGroundTruth: (segment: Segment, groundTruth: string) => GroundedSegment;
/**
 * Produces a segment with the ground truth replacing the text and its respective tokens.
 * @param segment The segment to replace the ground truth with.
 * @param groundTruth The human verified transcription of the segment.
 * @returns A segment with the ground truth applied to the segment text and its tokens.
 */
declare const applyGroundTruthToSegment: (segment: Segment, groundTruth: string) => Segment;
/**
 * Merges multiple segments into a single segment.
 *
 * @param segments - Array of segments to merge into one
 * @param delimiter - Optional string to join segment texts (defaults to space)
 * @returns A single merged segment containing all tokens
 */
declare const mergeSegments: (segments: Segment[], delimiter?: string) => Segment;
/**
 * Splits a segment at a specific time point into exactly two segments.
 *
 * This function does the opposite of mergeSegments, taking a single segment
 * and dividing it into two segments at the specified split time.
 *
 * @param segment - The segment to split
 * @param splitTime - The time (in seconds) at which to split the segment
 * @returns An array containing exactly two segments
 */
declare const splitSegment: (segment: Segment, splitTime: number) => Segment[];
/**
 * Searches through an array of tokens and returns the first one whose text sequence
 * matches the given query string.
 *
 * This function will split the `query` into one or more hint phrases (via `createHints`),
 * then scan `tokens` in order, returning the first token at which any hint sequence
 * fully matches the subsequent tokens.
 *
 * @param tokens
 *   An ordered array of `Token` objects to search.
 * @param query
 *   A string containing one or more words to match. If you pass multiple words
 *   (e.g. `"hello world"`), it will only match if `"hello"` at position `i` is
 *   immediately followed by `"world"` at position `i+1`.
 * @returns
 *   The first `Token` in the array where the hint sequence matches, or `null`
 *   if no matching sequence is found.
 *
 * @example
 * ```ts
 * const tokens: Token[] = [
 *   { start: 0, end: 1, text: 'the' },
 *   { start: 1, end: 2, text: 'quick' },
 *   { start: 2, end: 3, text: 'brown' },
 *   { start: 3, end: 4, text: 'fox' },
 * ];
 *
 * getFirstMatchingToken(tokens, 'quick brown');
 * // → { start: 1, end: 2, text: 'quick' }
 *
 * getFirstMatchingToken(tokens, 'lazy dog');
 * // → null
 * ```
 */
declare const getFirstMatchingToken: (tokens: Token[], query: string) => null | Token;
/**
 * Finds and returns the first token in a segment whose character-range fully contains
 * the given [selectionStart, selectionEnd) range.
 *
 * This is useful when you have a selection in the raw `segment.text` (for example, from
 * an <input>'s `selectionStart` and `selectionEnd`) and you want to map that back to the
 * corresponding timed `Token`.
 *
 * @param segment The Segment object containing the full `text` and an ordered list of `tokens`.
 * @param selectionStart
 *   The zero-based index into `segment.text` where the selection begins (inclusive).
 * @param selectionEnd
 *   The zero-based index into `segment.text` where the selection ends (exclusive).
 * @returns
 *   The first `Token` whose span in `segment.text` covers the entire selection range or `null` if it is not found.
 *
 * @example
 * ```ts
 * const segment: Segment = {
 *   text: 'the fox and the rabbit',
 *   start: 0,
 *   end: 6,
 *   tokens: [
 *     { start: 0, end: 1, text: 'the' },
 *     { start: 2, end: 3, text: 'fox' },
 *     { start: 3, end: 4, text: 'and' },
 *     { start: 4, end: 5, text: 'the' },
 *     { start: 5, end: 6, text: 'rabbit' },
 *   ],
 * };
 *
 * // Suppose the user selected the second "the" in an <input>,
 * // which corresponds to characters 12–15 (exclusive end):
 * const tok = getFirstTokenForSelection(segment, 12, 15);
 * // tok === { start: 4, end: 5, text: 'the' }
 * ```
 */
declare const getFirstTokenForSelection: (segment: Segment, selectionStart: number, selectionEnd: number) => null | Token;
//#endregion
//#region src/utils/hints.d.ts
/**
 * Mine frequent n-grams from a token stream and return hint candidates sorted by frequency.
 *
 * This is Arabic-first: mining is performed on normalized token text.
 * The returned `phrase` is the most common surface form observed for that normalized phrase.
 *
 * Breaking-change note: defaults favor Arabic ASR robustness (alef/ya normalization + tatweel stripping).
 */
declare const generateHintsFromTokens: (tokens: Token[], options?: GenerateHintsOptions) => GeneratedHint[];
/**
 * Mine frequent n-grams from segments.
 *
 * If `boundaryStrategy` is `'segment'` (default), phrases cannot cross segment boundaries.
 */
declare const generateHintsFromSegments: (segments: Segment[], options?: GenerateHintsOptions) => GeneratedHint[];
//#endregion
//#region src/utils/textUtils.d.ts
/**
 * Checks if a text string ends with sentence-ending punctuation.
 * Supports: period (.), question mark (? / ؟), exclamation (!), Arabic semicolon (؛), ellipsis (…).
 *
 * @param {string} text - The text to check for ending punctuation
 * @returns {boolean} True if the text ends with punctuation, false otherwise
 */
declare const isEndingWithPunctuation: (text: string) => boolean;
/**
 * Formats seconds into a human-readable timestamp.
 * For durations less than an hour: m:ss (e.g., "1:05")
 * For durations an hour or longer: h:mm:ss (e.g., "1:02:05")
 *
 * @param {number} seconds - The time duration in seconds
 * @returns {string} Formatted timestamp string
 */
declare const formatSecondsToTimestamp: (seconds: number) => string;
/**
 * Normalizes a word by removing Arabic diacritics and stripping leading/trailing
 * punctuation or symbols, returning the result in NFC form.
 *
 * This function:
 * 1. Decomposes Unicode characters (NFD normalization)
 * 2. Removes Arabic diacritics
 * 3. Strips leading and trailing punctuation or symbols
 * 4. Recomposes Unicode characters (NFC normalization)
 *
 * @param {string} w - The word to normalize
 * @returns {string} The normalized word
 */
declare const normalizeWord: (w: string) => string;
/**
 * Normalizes token text for Arabic-first matching and mining.
 *
 * This builds on `normalizeWord` (diacritics + trim punctuation) and adds optional
 * Arabic-specific normalizations. Use the same normalization for:
 * - mining repeated sequences
 * - matching hints against tokens
 *
 * @param text The token text to normalize
 * @param options Optional Arabic-specific normalizations
 * @returns A normalized token string suitable for comparisons
 */
declare const normalizeTokenText: (text: string, options?: ArabicNormalizationOptions) => string;
/**
 * Creates normalized hints for robust Arabic matching (diacritics/punctuation tolerant).
 *
 * Breaking change: hints are now normalized by default. This is intended for Arabic ASR.
 *
 * @param first Either the first hint string, or an options object overriding the default normalization.
 * @param restHints Remaining hint strings, if the first argument was an options object.
 * @returns A normalized hint map plus the normalization settings used for matching.
 */
declare const createHints: (first: ArabicNormalizationOptions | string, ...restHints: string[]) => Hints;
/**
 * Tokenizes ground truth text properly, ensuring punctuation is attached to words
 * rather than creating separate tokens.
 * @param groundTruth The ground truth to tokenize.
 * @returns The tokenized ground truth with the punctuation properly attached.
 */
declare const tokenizeGroundTruth: (groundTruth: string) => string[];
//#endregion
export {
  ArabicNormalizationOptions,
  GenerateHintsOptions,
  GeneratedHint,
  GroundedSegment,
  GroundedToken,
  HintMap,
  Hints,
  MarkAndCombineSegmentsOptions,
  MarkTokensWithDividersOptions,
  MarkedSegment,
  MarkedToken,
  Segment,
  Token,
  applyGroundTruthToSegment,
  cleanupIsolatedTokens,
  createHints,
  estimateSegmentFromToken,
  formatSecondsToTimestamp,
  formatSegmentsToTimestampedTranscript,
  generateHintsFromSegments,
  generateHintsFromTokens,
  getFirstMatchingToken,
  getFirstTokenForSelection,
  groupMarkedTokensIntoSegments,
  isEndingWithPunctuation,
  mapSegmentsIntoFormattedSegments,
  markAndCombineSegments,
  markTokensWithDividers,
  mergeSegments,
  mergeShortSegmentsWithPrevious,
  normalizeTokenText,
  normalizeWord,
  splitSegment,
  tokenizeGroundTruth,
  updateSegmentWithGroundTruth,
};
//# sourceMappingURL=index.d.mts.map