paragrafs
Version:
A lightweight TypeScript library designed to reconstruct paragraphs from AI transcriptions.
406 lines • 17.3 kB
text/typescript
//#region src/utils/constants.d.ts
/**
* Sentinel string used to mark a segment break among tokens during processing.
*/
declare const SEGMENT_BREAK = "SEGMENT_BREAK";
/**
* Literal type of `SEGMENT_BREAK`; one of the members of the `MarkedToken` union.
*/
type SegmentBreakMarker = typeof SEGMENT_BREAK;
/**
* Sentinel string marking a position where a break must always be started when it is encountered.
*/
declare const ALWAYS_BREAK = "ALWAYS_BREAK";
/**
* Literal type of `ALWAYS_BREAK`; one of the members of the `MarkedToken` union.
*/
type AlwaysBreakMarker = typeof ALWAYS_BREAK;
//#endregion
//#region src/types.d.ts
/**
* Represents a segment that was updated with the ground truth values.
* Same shape as `Segment`, except that `tokens` holds `GroundedToken`s,
* which may additionally be flagged as unmatched.
*/
type GroundedSegment = Omit<Segment, 'tokens'> & {
/** Tokens of the segment after syncing with the ground truth. */
tokens: GroundedToken[];
};
/**
* Represents a token that was either matched or left unmatched while syncing with the ground truth value.
*/
type GroundedToken = Token & {
/** When `true`, this token was not matched during the ground truth syncing. */
isUnknown?: boolean;
};
/**
* Lookup table of hint phrases, stored in `Hints.map`.
* Each string key maps to a list of string sequences.
*
* NOTE(review): presumably keyed by the first (normalized) word of a hint, with each
* inner array holding the words of one hint phrase — confirm against the implementation.
*/
type HintMap = Record<string, string[][]>;
/**
* Optional Arabic-specific normalization switches accepted by `normalizeTokenText`,
* `createHints` and (via `GenerateHintsOptions.normalization`) the hint generators.
*
* NOTE(review): the default value of each flag is not visible in this declaration file;
* `generateHintsFromTokens` documents its defaults as alef/ya normalization plus tatweel
* stripping — confirm per call site.
*/
type ArabicNormalizationOptions = {
/** Toggle alef normalization. NOTE(review): presumably unifies alef variants — confirm. */
normalizeAlef?: boolean;
/** Toggle hamza normalization. NOTE(review): exact mapping not visible here — confirm. */
normalizeHamza?: boolean;
/** Toggle ya normalization. NOTE(review): presumably unifies ya variants — confirm. */
normalizeYa?: boolean;
/** Toggle tatweel stripping. */
removeTatweel?: boolean;
};
/**
* Options accepted by `generateHintsFromTokens` and `generateHintsFromSegments`
* when mining frequent n-grams. Every field is optional.
*
* NOTE(review): default values are not visible in this declaration file.
*/
type GenerateHintsOptions = {
/**
* `'segment'` prevents phrases from crossing segment boundaries.
* NOTE(review): `'none'` presumably lifts that restriction — confirm.
*/
boundaryStrategy?: 'none' | 'segment';
/** De-duplication mode. NOTE(review): the meaning of `'closed'` is not visible here — confirm. */
dedupe?: 'closed' | 'none';
/** NOTE(review): presumably the largest n-gram length to mine — confirm. */
maxN?: number;
/** NOTE(review): presumably the minimum frequency a phrase needs to be reported — confirm. */
minCount?: number;
/** NOTE(review): presumably the smallest n-gram length to mine — confirm. */
minN?: number;
/** Arabic-specific normalization settings; mining is performed on normalized token text. */
normalization?: ArabicNormalizationOptions;
/** List of stopwords. NOTE(review): how they constrain mined phrases is not visible here — confirm. */
stopwords?: string[];
/** NOTE(review): presumably caps the number of returned candidates — confirm. */
topK?: number;
};
/**
* Value returned by `createHints`: a normalized hint map together with the
* normalization settings used for matching. Passed to `markTokensWithDividers`
* through its `hints` option.
*/
type Hints = {
/** The normalized hint map. */
map: HintMap;
/** The normalization settings used for matching, with every flag resolved (no optional fields). */
normalization: Required<ArabicNormalizationOptions>;
};
/**
* A hint candidate returned by `generateHintsFromTokens` / `generateHintsFromSegments`.
*/
type GeneratedHint = {
/** NOTE(review): presumably how many times the normalized phrase occurred — confirm. */
count: number;
/** NOTE(review): presumably the index of the first occurrence in the input token stream — confirm. */
firstOccurrenceIndex?: number;
/** NOTE(review): presumably the number of tokens (n) in the phrase rather than its character length — confirm. */
length: number;
/** The phrase in normalized form (mining is performed on normalized token text). */
normalizedPhrase: string;
/** The most common surface form observed for the normalized phrase. */
phrase: string;
/** NOTE(review): presumably the most frequent original spellings of the phrase — confirm. */
topSurfaceForms?: string[];
};
/**
* Represents a segment during the marking and processing stage.
* Unlike `Segment` it carries no `text`; its `tokens` array may include
* segment break markers alongside regular tokens.
*/
type MarkedSegment = {
/**
* End time of the segment in seconds.
*/
end: number;
/**
* Start time of the segment in seconds.
*/
start: number;
/**
* Array of tokens and break markers (see `MarkedToken`) that make up this segment.
*/
tokens: MarkedToken[];
};
/**
* Represents either a token, an always-break marker, or a segment break marker.
* Used during the processing of text to identify natural break points.
*/
type MarkedToken = Token | AlwaysBreakMarker | SegmentBreakMarker;
/**
* Options accepted by `markTokensWithDividers`.
*/
type MarkTokensWithDividersOptions = {
/** Optional array of filler words to mark as segment breaks. */
fillers?: string[];
/** Minimum time gap (in seconds) to consider a segment break. */
gapThreshold: number;
/** Optional hints created with `createHints()` that indicate when to insert a new segment break. */
hints?: Hints;
};
/**
* Options accepted by `markAndCombineSegments`: everything from
* `MarkTokensWithDividersOptions` plus the grouping and merging limits.
*/
type MarkAndCombineSegmentsOptions = MarkTokensWithDividersOptions & {
/** Maximum duration (in seconds) for a segment. */
maxSecondsPerSegment: number;
/** Minimum number of words required for a segment to stand alone. */
minWordsPerSegment: number;
};
/**
* Represents a segment of text with timing information and word-level tokens.
* A segment is a higher-level structure that contains a sequence of related tokens;
* because it extends `Token`, it also carries its own `start`, `end` and `text`.
*/
type Segment = Token & {
/**
* Word-by-word breakdown of the transcription with individual timings.
*/
tokens: Token[];
};
/**
* Represents a single token (word or phrase) with timing information.
* This is the basic unit of transcribed text and the base shape of `Segment`.
*/
type Token = {
/**
* End time in seconds.
*/
end: number;
/**
* Start time in seconds.
*/
start: number;
/**
* The transcribed text.
*/
text: string;
};
//#endregion
//#region src/transcript.d.ts
/**
* Estimates a segment with word-level tokens from a single token with multi-word text.
* Splits the text by whitespace and calculates approximate timing for each word.
*
* NOTE(review): how the token's duration is distributed across the words is not
* visible from this declaration — confirm against the implementation.
*
* @param {Token} param0 - The source token containing text with multiple words
* @param {number} param0.end - End time of the token in seconds
* @param {number} param0.start - Start time of the token in seconds
* @param {string} param0.text - The multi-word text content
* @returns {Segment} A segment with the original text and estimated word-level tokens
*/
declare const estimateSegmentFromToken: ({
end,
start,
text
}: Token) => Segment;
/**
* Marks tokens with segment dividers based on various criteria including:
* - Filler words (uh, umm, etc.)
* - Explicit multi-word hints
* - Significant time gaps between tokens
* - Punctuation at the end of tokens
*
* @param {Token[]} tokens - Array of tokens to process
* @param {MarkTokensWithDividersOptions} options - Configuration options
* @param {string[]} [options.fillers] - Optional array of filler words to mark as segment breaks
* @param {number} options.gapThreshold - Minimum time gap (in seconds) to consider a segment break
* @param {Hints} [options.hints] - Hints created with the createHints() function to indicate when to insert a new segment break.
* @returns {MarkedToken[]} Tokens with segment break markers inserted
*/
declare const markTokensWithDividers: (tokens: Token[], {
fillers,
gapThreshold,
hints
}: MarkTokensWithDividersOptions) => MarkedToken[];
/**
* Groups marked tokens into segments based on maximum segment duration.
* Creates segments from tokens, splitting when the duration exceeds the specified maximum.
*
* @param {MarkedToken[]} markedTokens - Array of tokens with segment break markers,
* such as the output of `markTokensWithDividers`
* @param {number} maxSecondsPerSegment - Maximum duration (in seconds) for a segment
* @returns {MarkedSegment[]} Array of marked segments
*/
declare const groupMarkedTokensIntoSegments: (markedTokens: MarkedToken[], maxSecondsPerSegment: number) => MarkedSegment[];
/**
* Merges segments with fewer than the specified minimum words into the previous segment.
* This helps avoid very short segments that might break the flow of text.
*
* NOTE(review): what happens when the first segment is itself too short (it has no
* previous segment) is not visible from this declaration — confirm.
*
* @param {MarkedSegment[]} segments - Array of marked segments to process
* @param {number} minWordsPerSegment - Minimum number of words required for a segment to stand alone
* @returns {MarkedSegment[]} Array of merged segments
*/
declare const mergeShortSegmentsWithPrevious: (segments: MarkedSegment[], minWordsPerSegment: number) => MarkedSegment[];
/**
* Formats segments into a timestamped transcript with timestamps at the beginning of each line.
* Lines are split based on segment breaks and maximum line duration.
*
* @param {MarkedSegment[]} segments - Array of marked segments to format
* @param {number} maxSecondsPerLine - Maximum duration (in seconds) for a single line
* @param {(buffer: Token) => string} [formatTokens] - Optional formatter that receives the buffered token range
* (a `Token` carrying the line's `start`, `end` and `text`) and returns the formatted line.
* When omitted the function emits timestamp-prefixed strings.
* @returns {string} Formatted transcript with timestamps
*/
declare const formatSegmentsToTimestampedTranscript: (segments: MarkedSegment[], maxSecondsPerLine: number, formatTokens?: (buffer: Token) => string) => string;
/**
* Maps marked segments into formatted segments with clean text representation.
* Combines the tokens into properly formatted text, respecting segment breaks
* and an optional maximum line duration.
*
* @param {MarkedSegment[]} segments - Array of marked segments to format
* @param {number} [maxSecondsPerLine] - Optional maximum duration (in seconds) for a single line
* @returns {Segment[]} Array of formatted segments with clean text
*/
declare const mapSegmentsIntoFormattedSegments: (segments: MarkedSegment[], maxSecondsPerLine?: number) => Segment[];
/**
* Convenience function that processes segments through all steps:
* marking tokens with dividers, grouping into segments, and merging short segments.
*
* @param {Segment[]} segments - Array of input segments to process
* @param {MarkAndCombineSegmentsOptions} options - Configuration options
* @param {string[]} [options.fillers] - Optional array of filler words to mark as segment breaks
* @param {number} options.gapThreshold - Minimum time gap (in seconds) to consider a segment break
* @param {Hints} [options.hints] - Optional hints created with the createHints() function to indicate when to insert a new segment break
* @param {number} options.maxSecondsPerSegment - Maximum duration (in seconds) for a segment
* @param {number} options.minWordsPerSegment - Minimum number of words required for a segment to stand alone
* @returns {MarkedSegment[]} Array of processed and marked segments
*/
declare const markAndCombineSegments: (segments: Segment[], options: MarkAndCombineSegmentsOptions) => MarkedSegment[];
/**
* Cleans up marked tokens by removing unnecessary segment breaks that would
* cause individual tokens to appear on their own lines.
* The result is returned as a new array.
*
* @param {MarkedToken[]} markedTokens - The array of marked tokens to clean up
* @returns {MarkedToken[]} A new array with unnecessary breaks removed
*/
declare const cleanupIsolatedTokens: (markedTokens: MarkedToken[]) => MarkedToken[];
/**
* Aligns AI-generated tokens to a ground truth human-edited segment text.
*
* Uses Longest Common Subsequence (LCS) to identify anchor matches between
* tokenized output and ground truth. Where no matches exist, it interpolates
* timestamped tokens for unmatched words.
*
* @param segment - A `Segment` object with ground truth `text` and AI-generated `tokens`
* @param groundTruth - The ground truth text to apply to the segment's text and its tokens.
* @returns A new `GroundedSegment` with the `tokens` adjusted to match the ground truth `text`,
* with any unmatched tokens flagged through `GroundedToken.isUnknown`.
*/
declare const updateSegmentWithGroundTruth: (segment: Segment, groundTruth: string) => GroundedSegment;
/**
* Produces a segment with the ground truth replacing the text and its respective tokens.
* The result is typed as a plain `Segment` (not a `GroundedSegment`).
*
* @param segment The segment whose text and tokens are to be replaced by the ground truth.
* @param groundTruth The human-verified transcription of the segment.
* @returns A segment with the ground truth applied to the segment text and its tokens.
*/
declare const applyGroundTruthToSegment: (segment: Segment, groundTruth: string) => Segment;
/**
* Merges multiple segments into a single segment.
*
* NOTE(review): behaviour for an empty `segments` array is not visible from this
* declaration — confirm before relying on it.
*
* @param segments - Array of segments to merge into one
* @param delimiter - Optional string used to join the segment texts (defaults to a space)
* @returns A single merged segment containing all tokens
*/
declare const mergeSegments: (segments: Segment[], delimiter?: string) => Segment;
/**
* Splits a segment at a specific time point into exactly two segments.
*
* This function does the opposite of mergeSegments, taking a single segment
* and dividing it into two segments at the specified split time.
*
* @param segment - The segment to split
* @param splitTime - The time (in seconds) at which to split the segment
* @returns An array containing exactly two segments (the declared type is the wider `Segment[]`)
*/
declare const splitSegment: (segment: Segment, splitTime: number) => Segment[];
/**
* Searches through an array of tokens and returns the first one at which the token
* text sequence matches the given query string.
*
* This function will split the `query` into one or more hint phrases (via `createHints`),
* then scan `tokens` in order, returning the first token at which any hint sequence
* fully matches the subsequent tokens.
*
* @param tokens
* An ordered array of `Token` objects to search.
* @param query
* A string containing one or more words to match. If you pass multiple words
* (e.g. `"hello world"`), it will only match if `"hello"` at position `i` is
* immediately followed by `"world"` at position `i+1`.
* @returns
* The first `Token` in the array where the hint sequence matches (i.e. the token
* that starts the match), or `null` if no matching sequence is found.
*
* @example
* ```ts
* const tokens: Token[] = [
* { start: 0, end: 1, text: 'the' },
* { start: 1, end: 2, text: 'quick' },
* { start: 2, end: 3, text: 'brown' },
* { start: 3, end: 4, text: 'fox' },
* ];
*
* getFirstMatchingToken(tokens, 'quick brown');
* // → { start: 1, end: 2, text: 'quick' }
*
* getFirstMatchingToken(tokens, 'lazy dog');
* // → null
* ```
*/
declare const getFirstMatchingToken: (tokens: Token[], query: string) => null | Token;
/**
* Finds and returns the first token in a segment whose character range fully contains
* the given [selectionStart, selectionEnd) range.
*
* This is useful when you have a selection in the raw `segment.text` (for example, from
* an <input>'s `selectionStart` and `selectionEnd`) and you want to map that back to the
* corresponding timed `Token`.
*
* @param segment The Segment object containing the full `text` and an ordered list of `tokens`.
* @param selectionStart
* The zero-based index into `segment.text` where the selection begins (inclusive).
* @param selectionEnd
* The zero-based index into `segment.text` where the selection ends (exclusive).
* @returns
* The first `Token` whose span in `segment.text` covers the entire selection range, or `null` if none is found.
*
* @example
* ```ts
* const segment: Segment = {
* text: 'the fox and the rabbit',
* start: 0,
* end: 6,
* tokens: [
* { start: 0, end: 1, text: 'the' },
* { start: 2, end: 3, text: 'fox' },
* { start: 3, end: 4, text: 'and' },
* { start: 4, end: 5, text: 'the' },
* { start: 5, end: 6, text: 'rabbit' },
* ],
* };
*
* // Suppose the user selected the second "the" in an <input>,
* // which corresponds to characters 12-15 (exclusive end):
* const tok = getFirstTokenForSelection(segment, 12, 15);
* // tok === { start: 4, end: 5, text: 'the' }
* ```
*/
declare const getFirstTokenForSelection: (segment: Segment, selectionStart: number, selectionEnd: number) => null | Token;
//#endregion
//#region src/utils/hints.d.ts
/**
* Mine frequent n-grams from a token stream and return hint candidates sorted by frequency.
*
* This is Arabic-first: mining is performed on normalized token text.
* The returned `phrase` is the most common surface form observed for that normalized phrase.
*
* Breaking-change note: defaults favor Arabic ASR robustness (alef/ya normalization + tatweel stripping).
*
* @param tokens The token stream to mine.
* @param options Optional mining settings (see `GenerateHintsOptions`).
* @returns Hint candidates sorted by frequency.
*/
declare const generateHintsFromTokens: (tokens: Token[], options?: GenerateHintsOptions) => GeneratedHint[];
/**
* Mine frequent n-grams from segments.
*
* If `boundaryStrategy` is `'segment'` (default), phrases cannot cross segment boundaries.
*
* @param segments The segments to mine.
* @param options Optional mining settings (see `GenerateHintsOptions`).
* @returns The mined hint candidates.
*/
declare const generateHintsFromSegments: (segments: Segment[], options?: GenerateHintsOptions) => GeneratedHint[];
//#endregion
//#region src/utils/textUtils.d.ts
/**
* Checks if a text string ends with sentence-ending punctuation.
* Supports: period (.), question mark (? / ؟), exclamation (!), Arabic semicolon (؛), ellipsis (…).
*
* @param {string} text - The text to check for ending punctuation
* @returns {boolean} True if the text ends with one of the supported punctuation marks, false otherwise
*/
declare const isEndingWithPunctuation: (text: string) => boolean;
/**
* Formats seconds into a human-readable timestamp.
* For durations less than an hour: m:ss (e.g., "1:05")
* For durations an hour or longer: h:mm:ss (e.g., "1:02:05")
*
* NOTE(review): handling of fractional or negative input is not visible from this
* declaration — confirm against the implementation.
*
* @param {number} seconds - The time duration in seconds
* @returns {string} Formatted timestamp string
*/
declare const formatSecondsToTimestamp: (seconds: number) => string;
/**
* Normalizes a word by removing Arabic diacritics and stripping leading/trailing
* punctuation or symbols.
*
* This function:
* 1. Decomposes Unicode characters (NFD normalization)
* 2. Removes Arabic diacritics
* 3. Strips leading and trailing punctuation or symbols
* 4. Recomposes Unicode characters (NFC normalization)
*
* @param {string} w - The word to normalize
* @returns {string} The normalized word
*/
declare const normalizeWord: (w: string) => string;
/**
* Normalizes token text for Arabic-first matching and mining.
*
* This builds on `normalizeWord` (diacritics + trim punctuation) and adds optional
* Arabic-specific normalizations. Use the same normalization for both of:
* - mining repeated sequences
* - matching hints against tokens
*
* @param text The token text to normalize
* @param options Optional Arabic-specific normalizations (see `ArabicNormalizationOptions`)
* @returns A normalized token string suitable for comparisons
*/
declare const normalizeTokenText: (text: string, options?: ArabicNormalizationOptions) => string;
/**
* Creates normalized hints for robust Arabic matching (diacritics/punctuation tolerant).
*
* Breaking change: hints are now normalized by default. This is intended for Arabic ASR.
*
* @param first Either the first hint string, or an options object overriding the default normalization.
* @param restHints The remaining hint strings. When `first` is an options object these are
* all of the hints; when `first` is a hint string these are the hints that follow it.
* @returns A normalized hint map plus the normalization settings used for matching.
*/
declare const createHints: (first: ArabicNormalizationOptions | string, ...restHints: string[]) => Hints;
/**
* Tokenizes ground truth text, ensuring punctuation is attached to words
* rather than creating separate tokens.
*
* @param groundTruth The ground truth text to tokenize.
* @returns The tokenized ground truth, with punctuation attached to the adjacent words.
*/
declare const tokenizeGroundTruth: (groundTruth: string) => string[];
//#endregion
export { ArabicNormalizationOptions, GenerateHintsOptions, GeneratedHint, GroundedSegment, GroundedToken, HintMap, Hints, MarkAndCombineSegmentsOptions, MarkTokensWithDividersOptions, MarkedSegment, MarkedToken, Segment, Token, applyGroundTruthToSegment, cleanupIsolatedTokens, createHints, estimateSegmentFromToken, formatSecondsToTimestamp, formatSegmentsToTimestampedTranscript, generateHintsFromSegments, generateHintsFromTokens, getFirstMatchingToken, getFirstTokenForSelection, groupMarkedTokensIntoSegments, isEndingWithPunctuation, mapSegmentsIntoFormattedSegments, markAndCombineSegments, markTokensWithDividers, mergeSegments, mergeShortSegmentsWithPrevious, normalizeTokenText, normalizeWord, splitSegment, tokenizeGroundTruth, updateSegmentWithGroundTruth };
//# sourceMappingURL=index.d.mts.map