UNPKG

kokokor

Version:

A lightweight TypeScript library designed to reconstruct paragraphs from OCRed inputs.

github.com/ragaeeb/kokokor

ragaeeb/kokokor

1,113 lines (1,106 loc) • 56.4 kB

JavaScript

//#region src/utils/constants.ts /** * Conversion factor from points to inches. * In typography, there are 72 points in one inch. */ const PTS_TO_INCHES = 72; /** * Default options for poetry detection, providing a balanced starting point. * These values have been tuned to work well across various document types * while minimizing false positives and negatives in poetry identification. */ const DEFAULT_POETRY_OPTIONS = { centerToleranceRatio: .05, maxVerticalGapRatio: 2, minMarginRatio: .1, minWidthRatioForMerged: .6, minWordCount: 2, pairWidthSimilarityRatio: .4, pairWordCountSimilarityRatio: .5, wordDensityComparisonRatio: .95 }; /** * Default configuration options for mapping OCR observations to text lines. * These settings provide reasonable defaults for most document processing scenarios. */ const DEFAULT_OBSERVATIONS_TO_TEXT_LINES_OPTIONS = { centerToleranceRatio: .05, horizontalLines: [], isRTL: true, minMarginRatio: .2, pixelTolerance: 5, poetryDetectionOptions: DEFAULT_POETRY_OPTIONS, poetryPairDelimiter: " ", rectangles: [] }; /** * Maximum number of words expected in a typical prose line. * Lines exceeding this count may indicate formatting issues or merged content * that should be split during text processing. */ const MAX_PROSE_WORD_COUNT = 25; /** * A regex pattern to detect common prose punctuation. * This includes: * - Arabic and English commas (، ,) * - Arabic and English semicolons (؛ ;) * - Arabic and English question marks (؟ ?) * - Arabic and English periods/full stops (۔ .) * - Colons (:) * - Parentheses (()) */ const PROSE_PUNCTUATION_PATTERN = /[،,؛;؟?۔.:()]/; /** * Percentile used as the robust reference width for paragraph grouping. */ const PARAGRAPH_WIDTH_PERCENTILE = .75; /** * Percentile used for deriving the right-edge x baseline from candidate lines. */ const PARAGRAPH_BASELINE_PERCENTILE = .25; /** * Ratio of reference width used to detect right-edge indentation. * This is the coarse page-scale component of indentation detection and is * combined with a line-height floor for DPI resilience. */ const PARAGRAPH_INDENT_THRESHOLD_RATIO = .04; /** * Minimum pixel distance required before classifying a line as indented. * * This value is calibrated around 72 DPI coordinate spaces: * - 3 px @ 72 DPI ~= 0.042 in (~1.06 mm) * It acts only as a hard floor for tiny/noisy line-height inputs. */ const PARAGRAPH_MIN_INDENT_PX = 3; /** * Minimum indent floor expressed as a ratio of typical line height. * Helps keep indentation thresholds stable across different coordinate scales. */ const PARAGRAPH_MIN_INDENT_HEIGHT_RATIO = .15; /** * Minimum width ratio for lines that can participate in indentation checks. */ const PARAGRAPH_MIN_INDENT_CANDIDATE_WIDTH_RATIO = .7; //#endregion //#region src/utils/grouping.ts /** * Groups items by their assigned index value into separate arrays. * * This function takes an array of indexed items and organizes them into subarrays * based on their index property, which typically represents lines, paragraphs, or * other logical groupings. The index property is removed from the resulting items. * * @template T - Type extending an object with a numeric index property * @param items - Array of items with index properties to be grouped * @returns An array of item groups, where each group contains items with the same index * * @example * ```typescript * const items = [ * { text: "Hello", index: 0 }, * { text: "World", index: 0 }, * { text: "Goodbye", index: 1 } * ]; * const groups = groupByIndex(items); * // Result: [ * // [{ text: "Hello" }, { text: "World" }], * // [{ text: "Goodbye" }] * // ] * ``` */ const groupByIndex = (items) => { const groups = []; for (const { index, ...item } of items) { if (!groups[index]) groups[index] = []; groups[index].push(item); } return groups; }; /** * Sorts items within each group horizontally by their x-coordinate. * * This ensures proper reading order (left-to-right for LTR languages) for items * within the same line or group. The function creates a copy of the input array * to avoid modifying the original data structure. * * @template T - Type extending an object with a bbox containing x-coordinate * @param grouped - Array of item groups to be sorted horizontally * @returns A new array with the same structure but with items sorted by x-coordinate within each group * * @example * ```typescript * const groups = [ * [{ bbox: { x: 100 }, text: "World" }, { bbox: { x: 50 }, text: "Hello" }] * ]; * const sorted = sortGroupsHorizontally(groups); * // Result: [[{ bbox: { x: 50 }, text: "Hello" }, { bbox: { x: 100 }, text: "World" }]] * ``` */ const sortGroupsHorizontally = (grouped) => { const groups = grouped.slice(); for (let i = 0; i < groups.length; i++) groups[i] = groups[i].toSorted((a, b) => a.bbox.x - b.bbox.x); return groups; }; /** * Merges the group of observations into a single one. * @param group The group of observations to merge. * @param delimiter Text delimiter used when concatenating observations. * @returns A single observation with the text of the group concatenated as well as the bounding box adjusted to fit all of the contents. */ const mergeObservations = (group, delimiter = " ") => { let minX = group[0].bbox.x; let minY = group[0].bbox.y; let maxX = group[0].bbox.x + group[0].bbox.width; let maxY = group[0].bbox.y + group[0].bbox.height; let combinedText = group[0].text; for (let i = 1; i < group.length; i++) { const { bbox, text } = group[i]; minX = Math.min(minX, bbox.x); minY = Math.min(minY, bbox.y); maxX = Math.max(maxX, bbox.x + bbox.width); maxY = Math.max(maxY, bbox.y + bbox.height); combinedText += `${delimiter}${text}`; } return { ...group[0], bbox: { height: maxY - minY, width: maxX - minX, x: minX, y: minY }, text: combinedText }; }; /** * Merges multiple observations within each group into a single combined observation. * * For each group, this function performs the following operations: * 1. Calculates a combined bounding box that encompasses all observations in the group * 2. Concatenates the text content of all observations with spaces between them * 3. Preserves all additional properties from the first observation in the group * * This is typically used to combine individual word-level OCR results into complete * lines or to merge line segments into full paragraphs. * * @template T - Type extending Observation (must have bbox and text properties) * @param grouped - Array of observation groups to be merged * @returns An array of merged observations, where each represents a complete line or paragraph * * @example * ```typescript * const groups = [ * [ * { bbox: { x: 0, y: 0, width: 50, height: 20 }, text: "Hello" }, * { bbox: { x: 60, y: 0, width: 50, height: 20 }, text: "world" } * ] * ]; * const merged = mergeGroupedObservations(groups); * // Result: [{ * // bbox: { x: 0, y: 0, width: 110, height: 20 }, * // text: "Hello world" * // }] * ``` */ const mergeGroupedObservations = (grouped) => { const result = []; for (const group of grouped) { if (group.length === 1) { result.push(group[0]); continue; } result.push(mergeObservations(group)); } return result; }; //#endregion //#region src/utils/layout.ts /** * Determines if an observation is centered on the page with sufficient whitespace around it. * * An observation is considered centered if: * 1. Its center point is within tolerance of the page center * 2. It has sufficient margins (whitespace) on both left and right sides * * This prevents false positives where wide observations span most of the page * but happen to have their center point near the page center. * * @param bbox - The bounding box to check for centering * @param imageWidth - The total width of the page/image in pixels * @param options - Configuration options for centering criteria * @param options.centerToleranceRatio - The tolerance for center point alignment as a ratio of image width (default: 0.05 = 5%) * @param options.minMarginRatio - The minimum margin required on each side as a ratio of image width (default: 0.1 = 10%) * @returns True if the observation is centered with sufficient whitespace, false otherwise * * @example * ```typescript * // Using default options * isObservationCentered({ width: 286, x: 298 }, 960, { centerToleranceRatio: 0.05, minMarginRatio: 0.1 }) // true * * // Using custom options for stricter centering * isObservationCentered( * { width: 286, x: 298 }, * 960, * { centerToleranceRatio: 0.02, minMarginRatio: 0.15 } * ) * * // A wide observation spanning most of the page - should return false * isObservationCentered({ width: 2026, x: 232 }, 2481, { centerToleranceRatio: 0.05, minMarginRatio: 0.1 }) // false * ``` */ const isObservationCentered = (bbox, imageWidth, options) => { const pageCenter = imageWidth / 2; const tolPx = imageWidth * options.centerToleranceRatio; const centerX = bbox.x + bbox.width / 2; const isCenterPointCentered = Math.abs(centerX - pageCenter) <= tolPx; const leftMargin = bbox.x; const rightMargin = imageWidth - (bbox.x + bbox.width); const minMargin = imageWidth * options.minMarginRatio; return isCenterPointCentered && leftMargin >= minMargin && rightMargin >= minMargin; }; /** * Filters out horizontal lines that are contained within any of the provided rectangles. * * This is useful for removing header/footer lines that appear within document sections * while preserving lines that mark true document boundaries or section separators. * * @param rectangles - Array of rectangles to check containment against * @param horizontalLines - Array of horizontal lines to filter * @param tolerance - Pixel tolerance for boundary checking (default: 5) * @returns Array of horizontal lines that are NOT contained within any rectangle */ const filterHorizontalLinesOutsideRectangles = (rectangles, horizontalLines, tolerance = 5) => { return horizontalLines.filter((line) => { return !rectangles.some((rect) => { return isBoundingBoxContained(line, rect, tolerance); }); }); }; /** * Finds the y-coordinate of the last horizontal line that's not contained within any rectangle. * * Used to identify the footer boundary - text below this line is typically footnotes. * Filters out horizontal lines that are contained within rectangles before finding the last one. * * @param rectangles - Array of rectangles to exclude horizontal lines from * @param horizontalLines - Array of horizontal lines to consider * @param pixelTolerance - Pixel tolerance for containment checking (default: 5) * @returns Y-coordinate of the last qualifying horizontal line, or undefined if none found */ const getLastHorizontalLineY = (rectangles, horizontalLines, pixelTolerance = 5) => { if (rectangles.length > 0 && horizontalLines.length > 0) horizontalLines = filterHorizontalLinesOutsideRectangles(rectangles, horizontalLines, pixelTolerance); horizontalLines = horizontalLines.filter((line) => line.y > pixelTolerance); return horizontalLines.at(-1)?.y; }; /** * Checks if a bounding box is contained within another bounding box with tolerance. * * The tolerance extends the outer bounding box in all directions, making containment * checking more lenient for cases where elements might be slightly outside due to * OCR inaccuracies or minor positioning variations. * * @param inner - The bounding box to check if it's inside * @param outer - The bounding box to check if it contains the inner box * @param tolerance - The pixel tolerance for boundary checking (extends outer box boundaries) * @returns True if the inner bounding box is contained within the outer bounding box (with tolerance) */ const isBoundingBoxContained = (inner, outer, tolerance) => { const outerLeft = outer.x - tolerance; const outerRight = outer.x + outer.width + tolerance; const outerTop = outer.y - tolerance; const outerBottom = outer.y + outer.height + tolerance; const innerLeft = inner.x; const innerRight = inner.x + inner.width; const innerTop = inner.y; const innerBottom = inner.y + inner.height; return innerLeft >= outerLeft && innerRight <= outerRight && innerTop >= outerTop && innerBottom <= outerBottom; }; /** * Converts bounding box coordinates from array format to object format. * Transforms [x1, y1, x2, y2] coordinates to {x, y, width, height} format. * * @param box - Array containing [x1, y1, x2, y2] coordinates where (x1,y1) is top-left and (x2,y2) is bottom-right * @returns Bounding box object with x, y, width, and height properties */ const mapMatrixToBoundingBox = (box) => { const [x1, y1, x2, y2] = box; return { height: y2 - y1, width: x2 - x1, x: x1, y: y1 }; }; /** * Analyzes the typical line spacing in the document to determine * what constitutes a normal gap vs. an intra-line gap. * * This analysis helps distinguish between text that belongs on the same logical line * but was split by OCR, versus text that represents separate lines. The function * calculates percentiles of vertical gaps to establish thresholds. * * @param sortedItems - Array of observations sorted by y-coordinate (top to bottom) * @returns Object containing typical gap size and minimum intra-line gap threshold * @returns returns.typicalGap - The 75th percentile gap size, representing normal line spacing * @returns returns.minIntraLineGap - Threshold below which gaps are considered intra-line */ const analyzeLineSpacing = (sortedItems) => { const len = sortedItems.length; if (len < 3) return { minIntraLineGap: 0, typicalGap: 0 }; const gaps = new Array(len - 1); for (let i = 1; i < len; i++) gaps[i - 1] = sortedItems[i].bbox.y - sortedItems[i - 1].bbox.y; gaps.sort((a, b) => a - b); const medianIdx = Math.floor(gaps.length * .5); const p75Idx = Math.floor(gaps.length * .75); const medianGap = gaps[medianIdx]; const typicalGap = gaps[p75Idx]; return { minIntraLineGap: Math.min(medianGap * .6, typicalGap * .4), typicalGap }; }; /** * Computes an adaptive line height factor based on item heights and spacing patterns. * * The line height factor is used to determine how much vertical space to allow when * grouping text observations into lines. A smaller factor groups items more aggressively, * while a larger factor is more conservative about grouping. * * @param heights - Array of heights from bounding box properties * @param typicalGap - Typical vertical gap between lines in the document (from analyzeLineSpacing) * @returns Adaptive line height factor between 0.15 and 0.4 * - 0.15: Small gaps relative to text height (likely intra-line groupings) * - 0.25: Medium gaps (standard line spacing) * - 0.4: Large gaps (widely spaced separate lines) */ const computeAdaptiveLineHeightFactor = (heights, typicalGap) => { if (heights.length === 0) return .3; let totalHeight = 0; for (const height of heights) totalHeight += height; const gapToHeightRatio = typicalGap / (totalHeight / heights.length); if (gapToHeightRatio < .8) return .15; if (gapToHeightRatio < 1.2) return .25; return .4; }; //#endregion //#region src/utils/marking.ts /** * Minimum number of left-edge candidates required before list-start heuristics activate. */ const LIST_START_MIN_CANDIDATES = 3; /** * Minimum relative vertical gap between consecutive list starts. */ const LIST_START_GAP_HEIGHT_FACTOR = .9; /** * Smaller indentation threshold used for repeated list-start lines. */ const LIST_START_INDENT_THRESHOLD_RATIO = .03; /** * Low percentile for detecting a stable left-edge baseline for list starts. */ const LIST_START_BASELINE_PERCENTILE = .1; /** * Number of short indented continuation lines needed to confirm list topology. */ const LIST_START_MIN_SHORT_INDENTED_LINES = 2; /** * Determines if two consecutive items should be placed on separate lines based on spacing analysis. * * This function uses multiple criteria to determine line breaks: * - Primary threshold based on average height and line height factor * - Secondary check using document-wide spacing patterns * - DPI-adjusted tolerance for consistent behavior across different resolutions * * @template T - Type extending an object with a bounding box * @param prev - Previous item in the sequence * @param current - Current item being evaluated * @param effectiveFactor - Line height factor multiplier for threshold calculation * @param effectiveYTolerance - DPI-adjusted vertical tolerance in pixels * @param spacingAnalysis - Document spacing analysis containing gap measurements * @returns True if items should be placed on separate lines, false otherwise */ const shouldSeparateLines = (prev, current, effectiveFactor, effectiveYTolerance, spacingAnalysis) => { const dy = current.bbox.y - prev.bbox.y; const avgHeight = (prev.bbox.height + current.bbox.height) * .5; let shouldSeparate = dy > avgHeight * effectiveFactor + effectiveYTolerance; if (!shouldSeparate && spacingAnalysis.minIntraLineGap > 0 && dy > spacingAnalysis.minIntraLineGap) shouldSeparate = dy > Math.min(avgHeight * .2, spacingAnalysis.minIntraLineGap); return shouldSeparate; }; /** * Processes sorted items and assigns line indices based on vertical spacing. * * This function iterates through vertically sorted items and assigns line numbers * based on spacing analysis. Items that are close enough vertically are assigned * to the same line, while items with significant vertical gaps start new lines. * * @template T - Type extending an object with a bounding box * @param sortedItems - Array of items sorted by y-coordinate (top to bottom) * @param effectiveFactor - Line height factor to use for threshold calculations * @param effectiveYTolerance - DPI-adjusted vertical tolerance in pixels * @param spacingAnalysis - Document spacing analysis results * @returns Array of items with assigned line index properties */ const assignLineIndices = (sortedItems, effectiveFactor, effectiveYTolerance, spacingAnalysis) => { const len = sortedItems.length; const marked = new Array(len); let currentLine = 0; let prev = sortedItems[0]; marked[0] = { ...prev, index: currentLine }; for (let i = 1; i < len; i++) { const item = sortedItems[i]; if (shouldSeparateLines(prev, item, effectiveFactor, effectiveYTolerance, spacingAnalysis)) currentLine += 1; marked[i] = { ...item, index: currentLine }; prev = item; } return marked; }; /** * Groups items into lines based on vertical proximity and document spacing patterns. * * This function implements an adaptive line detection algorithm that analyzes the document's * spacing patterns to distinguish between separate lines and text elements that belong on * the same line. The algorithm: * * 1. Sorts items by y-coordinate (top to bottom) * 2. Analyzes document-wide spacing patterns (unless lineHeightFactor is provided) * 3. Computes adaptive thresholds based on item heights and spacing analysis * 4. Assigns line indices based on vertical proximity * 5. Returns items sorted by line index, then by y-coordinate * * Two items are considered to be on the same line if the vertical distance between them * is less than a dynamically computed threshold based on: * - Average height of the items * - Adaptive line height factor (computed from document patterns or provided) * - DPI-adjusted pixel tolerance * - Document-wide spacing analysis * * @template T - Type extending an object with a bounding box * @param items - Array of items to be grouped into lines * @param dpi - Document DPI (dots per inch) for scaling tolerance values appropriately * @param pixelTolerance - Additional vertical tolerance in pixels at 72 DPI * @param lineHeightFactor - Optional fixed line height factor. If not provided, computed adaptively from document patterns * @returns Array of items with index properties indicating line assignments, sorted by line then y-coordinate * * @example * ```typescript * const observations = [ * { bbox: { x: 0, y: 0, width: 100, height: 20 }, text: "First line" }, * { bbox: { x: 0, y: 25, width: 100, height: 20 }, text: "Second line" } * ]; * const lines = indexItemsAsLines(observations, 300, 5); * // Result: Items with index: 0 for first line, index: 1 for second line * ``` */ const indexItemsAsLines = (items, dpi, pixelTolerance, lineHeightFactor) => { const byY = items.toSorted((a, b) => a.bbox.y - b.bbox.y); const effectiveYTolerance = pixelTolerance * (dpi / PTS_TO_INCHES); const spacingAnalysis = lineHeightFactor ? { minIntraLineGap: 0, typicalGap: 0 } : analyzeLineSpacing(byY); return assignLineIndices(byY, lineHeightFactor || computeAdaptiveLineHeightFactor(items.map((i) => i.bbox.height), spacingAnalysis.typicalGap), effectiveYTolerance, spacingAnalysis).toSorted((a, b) => a.index !== b.index ? a.index - b.index : a.bbox.y - b.bbox.y); }; /** * Calculates the DPI (dots per inch) based on image dimensions and original PDF size. * * This utility function helps determine the resolution at which a PDF was rasterized * by comparing the resulting image dimensions with the original PDF page dimensions. * The DPI values are essential for proper scaling of pixel-based tolerances and * measurements throughout the document processing pipeline. * * @param imageSize - Dimensions of the rasterized image in pixels * @param pdfSize - Original dimensions of the PDF page in points (1/72 inch) * @returns Object containing x and y DPI values * * @example * ```typescript * const imageSize = { width: 2480, height: 3508 }; * const pdfSize = { width: 595, height: 842 }; // A4 page in points * const dpi = calculateDPI(imageSize, pdfSize); * // Result: { x: 300, y: 300 } for a 300 DPI scan * ``` */ const calculateDPI = (imageSize, pdfSize) => { return { x: imageSize.width / (pdfSize.width / PTS_TO_INCHES), y: imageSize.height / (pdfSize.height / PTS_TO_INCHES) }; }; /** * Returns a percentile value from a sorted numeric array. */ const pickPercentile = (sortedValues, percentile) => { return sortedValues[Math.min(sortedValues.length - 1, Math.max(0, Math.floor((sortedValues.length - 1) * percentile)))]; }; /** * Returns true when line start is indented relative to baseline. */ const isIndentedLine = (item, baselineX, indentThreshold) => item.bbox.x - baselineX > indentThreshold; /** * Returns true when a list-start candidate line is near the start baseline and sufficiently wide. */ const isListStartCandidate = (item, baselineX, indentThreshold, minWidth) => !isIndentedLine(item, baselineX, indentThreshold) && item.bbox.width >= minWidth; const computeReferenceWidth = (items) => { const widths = items.map((item) => item.bbox.width).toSorted((a, b) => a - b); return widths.length >= 4 ? pickPercentile(widths, PARAGRAPH_WIDTH_PERCENTILE) : widths[widths.length - 1]; }; const computeBaselineX = (items, minIndentCandidateWidth) => { const baselineCandidates = items.filter((item) => item.bbox.width >= minIndentCandidateWidth).map((item) => item.bbox.x).toSorted((a, b) => a - b); const allX = items.map((item) => item.bbox.x).toSorted((a, b) => a - b); return { allX, baselineX: pickPercentile(baselineCandidates.length > 0 ? baselineCandidates : allX, PARAGRAPH_BASELINE_PERCENTILE) }; }; const computeIndentFloor = (items) => { const typicalLineHeight = pickPercentile(items.map((item) => item.bbox.height).toSorted((a, b) => a - b), .5); return Math.max(PARAGRAPH_MIN_INDENT_PX, typicalLineHeight * PARAGRAPH_MIN_INDENT_HEIGHT_RATIO); }; /** * Detects repeated list-start geometry (e.g., numbered footnote items) without * depending on semantic markers such as `isFootnote` or regex prefixes. * * The signal activates only when we observe: * - multiple near-baseline list-start candidates, * - short indented continuation lines, * - at least one bridge pattern (start -> continuation -> start), * - and short lines present in the block. */ const shouldUseListStartSignal = (items, thresholdWidth, minIndentCandidateWidth, listStartBaselineX, listStartIndentThreshold) => { const listStartCandidateCount = items.filter((item) => isListStartCandidate(item, listStartBaselineX, listStartIndentThreshold, minIndentCandidateWidth)).length; const shortIndentedLineCount = items.filter((item) => item.bbox.width < minIndentCandidateWidth && isIndentedLine(item, listStartBaselineX, listStartIndentThreshold)).length; const hasListBridge = items.some((item, i) => { if (i === 0 || i === items.length - 1) return false; const prev = items[i - 1]; const next = items[i + 1]; return item.bbox.width < minIndentCandidateWidth && isIndentedLine(item, listStartBaselineX, listStartIndentThreshold) && isListStartCandidate(prev, listStartBaselineX, listStartIndentThreshold, minIndentCandidateWidth) && isListStartCandidate(next, listStartBaselineX, listStartIndentThreshold, minIndentCandidateWidth); }); return listStartCandidateCount >= LIST_START_MIN_CANDIDATES && shortIndentedLineCount >= LIST_START_MIN_SHORT_INDENTED_LINES && hasListBridge && items.some((item) => item.bbox.width < thresholdWidth); }; const buildParagraphMetrics = (items, widthTolerance) => { const referenceWidth = computeReferenceWidth(items); const thresholdWidth = referenceWidth * widthTolerance; const minIndentCandidateWidth = thresholdWidth * PARAGRAPH_MIN_INDENT_CANDIDATE_WIDTH_RATIO; const { allX, baselineX } = computeBaselineX(items, minIndentCandidateWidth); const listStartBaselineX = pickPercentile(allX, LIST_START_BASELINE_PERCENTILE); const indentFloor = computeIndentFloor(items); const indentThreshold = Math.max(referenceWidth * PARAGRAPH_INDENT_THRESHOLD_RATIO, indentFloor); const listStartIndentThreshold = Math.max(referenceWidth * LIST_START_INDENT_THRESHOLD_RATIO, indentFloor); return { baselineX, indentThreshold, listStartBaselineX, listStartIndentThreshold, minIndentCandidateWidth, shouldUseListStartSignal: shouldUseListStartSignal(items, thresholdWidth, minIndentCandidateWidth, listStartBaselineX, listStartIndentThreshold), thresholdWidth }; }; const hasVerticalBreakSignal = (items, index, thresholdWidth, verticalJumpFactor) => { if (index === 0) return false; const item = items[index]; const prev = items[index - 1]; if (index === 1) { if (prev.bbox.width < thresholdWidth) return false; return item.bbox.y - prev.bbox.y > prev.bbox.height * verticalJumpFactor; } const prevPrev = items[index - 2]; if (prev.bbox.width < thresholdWidth || prevPrev.bbox.width < thresholdWidth) return false; const gap = item.bbox.y - prev.bbox.y; const prevGap = prev.bbox.y - prevPrev.bbox.y; if (prevGap > 0) return gap > prevGap * verticalJumpFactor; return prevGap === 0 && gap > 0 && gap > item.bbox.height * .5 * verticalJumpFactor; }; const hasIndentBreakSignal = (items, index, metrics) => { if (index === 0) return false; const item = items[index]; const prev = items[index - 1]; const isCurrentIndented = isIndentedLine(item, metrics.baselineX, metrics.indentThreshold); const wasPrevShort = prev.bbox.width < metrics.thresholdWidth; if (!isCurrentIndented || wasPrevShort || item.bbox.width < metrics.minIndentCandidateWidth) return false; return !isIndentedLine(prev, metrics.baselineX, metrics.indentThreshold); }; const hasListStartBreakSignal = (items, index, metrics) => { if (index === 0 || !metrics.shouldUseListStartSignal) return false; const item = items[index]; const prev = items[index - 1]; const isCurrentListStart = isListStartCandidate(item, metrics.listStartBaselineX, metrics.listStartIndentThreshold, metrics.minIndentCandidateWidth); const isPrevListStart = isListStartCandidate(prev, metrics.listStartBaselineX, metrics.listStartIndentThreshold, metrics.minIndentCandidateWidth); const wasPrevShort = prev.bbox.width < metrics.thresholdWidth; const gap = item.bbox.y - prev.bbox.y; const minGapForListStart = Math.min(prev.bbox.height, item.bbox.height) * LIST_START_GAP_HEIGHT_FACTOR; return isCurrentListStart && isPrevListStart && !wasPrevShort && gap >= minGapForListStart; }; const resolveBreakReason = (items, index, verticalJumpFactor, metrics) => { if (index === 0) return null; if (hasVerticalBreakSignal(items, index, metrics.thresholdWidth, verticalJumpFactor)) return "vertical"; if (hasIndentBreakSignal(items, index, metrics)) return "indent"; if (hasListStartBreakSignal(items, index, metrics)) return "list-start"; return null; }; const shouldAdvanceAfterShortLine = (item, index, breakReason, thresholdWidth) => { if (item.bbox.width >= thresholdWidth) return false; return index === 0 || breakReason !== "indent"; }; /** * Groups items into paragraphs based on vertical spacing patterns and line width analysis. * * This function analyzes vertical spacing between consecutive items and their widths to * identify paragraph boundaries. The algorithm uses four coordinated signals: * * 1. **Vertical jump detection**: A new paragraph starts when there's a significant * increase in vertical gap compared to previous gaps, but only when both preceding * lines are "full-width" (not short lines that might indicate natural breaks) * * 2. **Indent-start detection**: A line that newly indents from the right-edge baseline * starts a new paragraph. * * 3. **List-start detection**: Repeated left-edge starts with short indented continuations * are treated as separate list items. * * 4. **Short line detection**: Lines significantly narrower than a robust reference width * are considered paragraph-ending lines, causing the next line to start a new paragraph. * * These heuristics work together to handle various paragraph patterns including: * - Standard paragraphs with consistent spacing * - Consistently indented paragraph starts * - Repeated list-start structures (including footnote-style note lists) * - Paragraphs ending with short lines * - Headers and subheadings with extra spacing * - Footer content separated by spacing * * @template T - Type extending an object with a bounding box * @param items - Array of items (typically lines) to be grouped into paragraphs * @param verticalJumpFactor - Multiplier determining how much larger a gap needs to be to indicate a paragraph break (e.g., 2.0 means 200% larger) * @param widthTolerance - Fraction of reference width below which a line is considered "short" (0-1, e.g., 0.8 means 80% of reference width) * @returns Array of items with index properties indicating paragraph assignments, sorted by paragraph then y-coordinate * * @example * ```typescript * const lines = [ * { bbox: { y: 0, width: 400, height: 20 }, text: "First paragraph line" }, * { bbox: { y: 25, width: 300, height: 20 }, text: "Short line ending" }, // Short line * { bbox: { y: 55, width: 400, height: 20 }, text: "Second paragraph" } // Gap + new para * ]; * const paragraphs = indexItemsAsParagraphs(lines, 2.0, 0.8); * // Result: First two lines index: 0, third line index: 1 * ``` */ const indexItemsAsParagraphs = (items, verticalJumpFactor, widthTolerance) => { if (items.length === 0) return []; const metrics = buildParagraphMetrics(items, widthTolerance); const out = []; let index = 0; for (let i = 0; i < items.length; i++) { const item = items[i]; const breakReason = resolveBreakReason(items, i, verticalJumpFactor, metrics); if (breakReason !== null) index++; out.push({ ...item, index }); if (shouldAdvanceAfterShortLine(item, i, breakReason, metrics.thresholdWidth)) index++; } return out.sort((a, b) => a.index !== b.index ? a.index - b.index : a.bbox.y - b.bbox.y); }; //#endregion //#region src/utils/normalization.ts /** * Adjusts x-coordinates of observations for right-to-left (RTL) text processing. * * This function transforms the coordinate system to accommodate right-to-left text * direction, which is essential for languages like Arabic, Hebrew, Farsi, and others. * It flips the x-coordinate so that the rightmost edge becomes the origin (x=0), * enabling proper text flow analysis for RTL scripts. * * The transformation formula: `newX = imageWidth - originalX - textWidth` * * @param observations - Array of text observations with bounding box data from OCR * @param imageWidth - Total width of the document/image in pixels * @returns A new array of observations with x-coordinates adjusted for RTL text processing * * @example * ```typescript * const observations = [ * { bbox: { x: 100, y: 0, width: 50, height: 20 }, text: "مرحبا" } * ]; * const rtlObservations = mapOcrResultToRTLObservations(observations, 800); * // Result: { bbox: { x: 650, y: 0, width: 50, height: 20 }, text: "مرحبا" } * // Original x: 100, becomes: 800 - 100 - 50 = 650 * ``` */ const mapOcrResultToRTLObservations = (observations, imageWidth) => { return observations.map((o) => ({ ...o, bbox: { ...o.bbox, x: imageWidth - o.bbox.x - o.bbox.width } })); }; /** * Filters out noisy or invalid observations based on text content quality. * * This function removes observations that are likely to be OCR noise or artifacts * by checking if the text content meets minimum quality criteria. Currently filters * out observations with text shorter than 2 characters, which often represent * punctuation marks, single characters, or OCR errors that don't contribute * meaningful content to document analysis. * * @param o - Single observation to evaluate for noise filtering * @returns True if the observation should be kept, false if it should be filtered out * * @example * ```typescript * const observations = [ * { bbox: {...}, text: "Hello world" }, // Kept: length > 1 * { bbox: {...}, text: "." }, // Filtered: length = 1 * { bbox: {...}, text: "" } // Filtered: length = 0 * ]; * const clean = observations.filter(filterNoisyObservations); * // Result: Only "Hello world" observation remains * ``` */ const filterNoisyObservations = (o) => o.text?.replace(/[،,؛;؟?۔.:\-()]/g, "").length > 1; /** * Normalizes x-coordinates of observations to create clean alignment. * * This function identifies observations that are approximately aligned to the leftmost * position and standardizes their x-coordinates to create visually consistent, properly * aligned text blocks. This is particularly useful for correcting minor OCR alignment * inconsistencies that can occur due to image quality, skew, or OCR engine variations. * * The alignment threshold is calculated proportionally to the DPI ratio to ensure * consistent behavior across different document resolutions. Observations within * the threshold distance from the leftmost position are snapped to that position. * * @param observations - Array of text observations to normalize for alignment * @param dpi - The dots per inch of the source document used for threshold calculation * @param standardDPI - The standard DPI to normalize against for consistent thresholds (default: 300) * @returns A new array of observations with normalized x-coordinates for improved alignment * * @example * ```typescript * const observations = [ * { bbox: { x: 50, y: 0, width: 100, height: 20 }, text: "Line 1" }, * { bbox: { x: 52, y: 25, width: 100, height: 20 }, text: "Line 2" }, // Slightly off * { bbox: { x: 100, y: 50, width: 100, height: 20 }, text: "Indented" } * ]; * const normalized = normalizeObservationsX(observations, 300); * // Result: First two lines aligned to x: 50, third line unchanged at x: 100 * ``` */ const normalizeObservationsX = (observations, dpi, standardDPI = 300) => { const thresholdPx = standardDPI / dpi * 5; const minX = Math.min(...observations.map((o) => o.bbox.x)); return observations.map((o) => { if (Math.abs(o.bbox.x - minX) <= thresholdPx) return { ...o, bbox: { ...o.bbox, x: minX } }; return o; }); }; /** * Simplifies an observation for debugging purposes by reducing precision and content. * * This utility function creates a simplified version of an observation that's easier * to read in debug output or logs. It performs two main simplifications: * * 1. **Coordinate precision**: Truncates floating-point coordinates to integers * 2. **Text content**: Filters to words longer than 1 character and keeps only the first word * * This is particularly useful when debugging large datasets where full observation * details would be overwhelming, but you need to understand the general structure * and positioning of text elements. * * @param observation - The observation to simplify for debugging output * @returns A simplified observation with truncated coordinates and reduced text content * * @example * ```typescript * const observation = { * bbox: { x: 123.456, y: 78.901, width: 234.567, height: 19.123 }, * text: "Hello world from OCR engine" * }; * const simplified = simplifyObservation(observation); * // Result: { * // bbox: { x: 123, y: 78, width: 234, height: 19 }, * // text: "Hello" * // } * ``` */ const simplifyObservations = (observations, truncateText = false) => { return observations.map((observation) => { return { bbox: { height: Math.trunc(observation.bbox.height), width: Math.trunc(observation.bbox.width), x: Math.trunc(observation.bbox.x), y: Math.trunc(observation.bbox.y) }, text: truncateText ? observation.text.split(" ").filter((word) => word.length > 1).slice(0, 1).join(" ") : observation.text }; }); }; //#endregion //#region src/utils/options.ts /** * Resolves optional overrides against a default options object. * * This is a shallow merge helper intended for option bags where * top-level keys are merged and nested objects are handled explicitly * by the caller when needed. */ const resolveWithDefaults = (defaults, overrides) => ({ ...defaults, ...overrides ? Object.fromEntries(Object.entries(overrides).filter(([, value]) => value !== void 0)) : {} }); //#endregion //#region src/utils/poetry.ts const DEFAULT_CENTER_TOLERANCE = DEFAULT_POETRY_OPTIONS.centerToleranceRatio ?? .05; const DEFAULT_MIN_MARGIN = DEFAULT_POETRY_OPTIONS.minMarginRatio ?? .1; const DEFAULT_MIN_WIDTH_RATIO_FOR_MERGED = DEFAULT_POETRY_OPTIONS.minWidthRatioForMerged ?? .6; const DEFAULT_MIN_WORD_COUNT = DEFAULT_POETRY_OPTIONS.minWordCount ?? 2; const DEFAULT_PAIR_WIDTH_SIMILARITY = DEFAULT_POETRY_OPTIONS.pairWidthSimilarityRatio ?? .4; const DEFAULT_PAIR_WORD_SIMILARITY = DEFAULT_POETRY_OPTIONS.pairWordCountSimilarityRatio ?? .5; const DEFAULT_DENSITY_RATIO = DEFAULT_POETRY_OPTIONS.wordDensityComparisonRatio ?? .95; const DEFAULT_MAX_VERTICAL_GAP_RATIO = DEFAULT_POETRY_OPTIONS.maxVerticalGapRatio ?? 2; const NBSP_PATTERN = /\u00A0/g; const TATWEEL_PATTERN = /\u0640/g; const NON_WHITESPACE_PATTERN = /\S+/g; const STRIP_PUNCTUATION_SYMBOLS_AND_SPACE_PATTERN = /[\p{P}\p{S}\s]+/gu; const ARABIC_OR_LATIN_DIGITS_PATTERN = /^[\d\u0660-\u0669]+$/; /** * Calculates the average word density (words per pixel) for prose text in the document. * * Filters observations to identify likely prose content by excluding centered text, * very narrow text, and text with too few or too many words. Used as a baseline * for poetry detection algorithms that rely on comparing word density patterns. * * Prose text typically has higher word density than poetry because prose lines * extend closer to page margins and contain more words per line. * * @param observations - Array of text observations to analyze * @param imageWidth - Total width of the document/image in pixels * @param options - Configuration options for prose identification * @param options.centerToleranceRatio - Tolerance for identifying centered text to exclude * @param options.minMarginRatio - Minimum margin ratio for identifying centered text to exclude * @param options.minWordCount - Minimum word count threshold for valid prose lines * @returns Average word density (words per pixel) for prose content, or 0 if no prose found */ const resolveCenteringOptions = (options) => resolveWithDefaults({ centerToleranceRatio: DEFAULT_CENTER_TOLERANCE, minMarginRatio: DEFAULT_MIN_MARGIN }, options); const getWordCount = (text) => { const normalized = text.replace(NBSP_PATTERN, " ").replace(TATWEEL_PATTERN, "").trim(); if (!normalized) return 0; return normalized.match(NON_WHITESPACE_PATTERN)?.length ?? 0; }; const isNumericOnlyToken = (text) => { const stripped = text.replace(NBSP_PATTERN, " ").replace(TATWEEL_PATTERN, "").replace(STRIP_PUNCTUATION_SYMBOLS_AND_SPACE_PATTERN, ""); return stripped.length > 0 && ARABIC_OR_LATIN_DIGITS_PATTERN.test(stripped); }; const hasCompatiblePairWidths = (obs1, obs2, pairWidthSimilarityRatio) => { const avgWidth = (obs1.bbox.width + obs2.bbox.width) / 2; return { avgWidth, isCompatible: Math.abs(obs1.bbox.width - obs2.bbox.width) / avgWidth < pairWidthSimilarityRatio }; }; const hasCompatibleWordCounts = (words1, words2, pairWordCountSimilarityRatio) => { const maxWords = Math.max(words1, words2); return Math.abs(words1 - words2) / maxWords < pairWordCountSimilarityRatio; }; const hasCompatibleVerticalGap = (obs1, obs2, maxVerticalGapRatio) => { const centerY1 = obs1.bbox.y + obs1.bbox.height / 2; const centerY2 = obs2.bbox.y + obs2.bbox.height / 2; return Math.abs(centerY1 - centerY2) <= maxVerticalGapRatio * ((obs1.bbox.height + obs2.bbox.height) / 2); }; const getOrderedPairObservations = (obs1, obs2) => { const leftObs = obs1.bbox.x < obs2.bbox.x ? obs1 : obs2; const rightObs = obs1.bbox.x < obs2.bbox.x ? obs2 : obs1; return { gap: rightObs.bbox.x - (leftObs.bbox.x + leftObs.bbox.width), leftObs, rightObs }; }; const hasAsymmetricSparseGap = (leftObs, rightObs, gap, avgWidth, imageWidth) => { const pageCenter = imageWidth / 2; const innerLeft = leftObs.bbox.x + leftObs.bbox.width; const innerRight = rightObs.bbox.x; const leftDelta = Math.abs(pageCenter - innerLeft); const rightDelta = Math.abs(innerRight - pageCenter); const asymmetry = Math.abs(leftDelta - rightDelta); return gap > avgWidth * 2 && asymmetry > imageWidth * .12; }; const resolvePairCenteringOptions = (hasSignificantGap, options) => { if (!hasSignificantGap) return resolveCenteringOptions(options); return { ...resolveCenteringOptions(options), centerToleranceRatio: (options.centerToleranceRatio ?? DEFAULT_CENTER_TOLERANCE) * 2.5, minMarginRatio: (options.minMarginRatio ?? DEFAULT_MIN_MARGIN) * .75 }; }; const toCombinedBbox = (obs1, obs2) => { const leftX = Math.min(obs1.bbox.x, obs2.bbox.x); const rightmostPoint = Math.max(obs1.bbox.x + obs1.bbox.width, obs2.bbox.x + obs2.bbox.width); return { height: Math.max(obs1.bbox.y + obs1.bbox.height, obs2.bbox.y + obs2.bbox.height) - Math.min(obs1.bbox.y, obs2.bbox.y), width: rightmostPoint - leftX, x: leftX, y: Math.min(obs1.bbox.y, obs2.bbox.y) }; }; const hasPoetryLikeDensity = (obs, wordCount, imageWidth, avgProseWordDensity, minWidthRatioForMerged, wordDensityComparisonRatio) => { if (obs.bbox.width <= imageWidth * minWidthRatioForMerged || !Number.isFinite(avgProseWordDensity) || avgProseWordDensity <= 0) return false; const obsDensity = wordCount / obs.bbox.width; if (obsDensity <= 0) return false; return obsDensity / avgProseWordDensity < (obs.bbox.width / imageWidth > .75 ? wordDensityComparisonRatio * .95 : .5); }; const calculateAverageProseDensity = (observations, imageWidth, options = DEFAULT_POETRY_OPTIONS) => { const centeringOptions = resolveCenteringOptions(options); const minWordCount = options.minWordCount ?? DEFAULT_MIN_WORD_COUNT; let totalWords = 0; let totalWidth = 0; for (const obs of observations) { const wordCount = getWordCount(obs.text); if (!isObservationCentered(obs.bbox, imageWidth, centeringOptions) && obs.bbox.width > imageWidth * .4 && wordCount >= minWordCount && wordCount <= MAX_PROSE_WORD_COUNT) { totalWords += wordCount; totalWidth += obs.bbox.width; } } if (totalWords <= 0 || totalWidth <= 0) return 0; const density = totalWords / totalWidth; return Number.isFinite(density) && density > 0 ? density : 0; }; /** * Validates if two observations form a poetry pair (hemistichs). * * In traditional poetry, especially Arabic poetry, lines are often split into two * hemistichs that appear as separate text observations. This function checks if * two observations meet the criteria for being poetry hemistichs based on: * - Similar width (indicating balanced structure) * - Similar word count (indicating rhythmic balance) * - Overall centering when combined (typical poetry layout) * - Minimum word count threshold (filtering noise) * * @param obs1 - First observation (potential first hemistich) * @param obs2 - Second observation (potential second hemistich) * @param imageWidth - Total width of the document/image in pixels * @param options - Poetry detection configuration options * @returns True if the observations form a valid poetry pair */ const isPoetryPair = (obs1, obs2, imageWidth, options = DEFAULT_POETRY_OPTIONS) => { const minWordCount = options.minWordCount ?? DEFAULT_MIN_WORD_COUNT; const maxVerticalGapRatio = options.maxVerticalGapRatio ?? DEFAULT_MAX_VERTICAL_GAP_RATIO; const pairWidthSimilarityRatio = options.pairWidthSimilarityRatio ?? DEFAULT_PAIR_WIDTH_SIMILARITY; const pairWordCountSimilarityRatio = options.pairWordCountSimilarityRatio ?? DEFAULT_PAIR_WORD_SIMILARITY; const words1 = getWordCount(obs1.text); const words2 = getWordCount(obs2.text); if (words1 < minWordCount || words2 < minWordCount) return false; const { avgWidth, isCompatible: hasCompatibleWidths } = hasCompatiblePairWidths(obs1, obs2, pairWidthSimilarityRatio); if (!hasCompatibleWidths) return false; if (!hasCompatibleWordCounts(words1, words2, pairWordCountSimilarityRatio)) return false; if (!hasCompatibleVerticalGap(obs1, obs2, maxVerticalGapRatio)) return false; const { gap, leftObs, rightObs } = getOrderedPairObservations(obs1, obs2); if (isNumericOnlyToken(leftObs.text)) return false; const hasSignificantGap = gap > imageWidth * .07 || gap > avgWidth * .15; if (hasSignificantGap && hasAsymmetricSparseGap(leftObs, rightObs, gap, avgWidth, imageWidth)) return false; const centeringOptions = resolvePairCenteringOptions(hasSignificantGap, options); return isObservationCentered(toCombinedBbox(obs1, obs2), imageWidth, centeringOptions); }; /** * Determines if a single observation represents a wide poetic line. * * Some poetry appears as single wide lines rather than split hemistichs. * These lines are identified by: * - Being centered on the page * - Having sufficient width (not just short fragments) * - Having lower word density compared to prose (more spaced out) * - Meeting minimum word count requirements * * The word density comparison helps distinguish poetry from prose: poetry * typically has more spacing between words and shorter lines relative to * the number of words, resulting in lower words-per-pixel density. * * @param obs - The observation to analyze * @param imageWidth - Total width of the document/image in pixels * @param avgProseWordDensity - Average word density of prose content for comparison * @param options - Poetry detection configuration options * @returns True if the observation represents a wide poetic line */ const isWidePoeticLine = (obs, imageWidth, avgProseWordDensity, options = DEFAULT_POETRY_OPTIONS) => { const wordCount = getWordCount(obs.text); const minWordCount = options.minWordCount ?? DEFAULT_MIN_WORD_COUNT; const wordDensityComparisonRatio = options.wordDensityComparisonRatio ?? DEFAULT_DENSITY_RATIO; if (wordCount < minWordCount) return false; if (PROSE_PUNCTUATION_PATTERN.test(obs.text)) return false; if (!isObservationCentered(obs.bbox, imageWidth, resolveCenteringOptions(options))) return false; return hasPoetryLikeDensity(obs, wordCount, imageWidth, avgProseWordDensity, options.minWidthRatioForMerged ?? DEFAULT_MIN_WIDTH_RATIO_FOR_MERGED, wordDensityComparisonRatio); }; /** * Determines if a group of observations represents poetic content. * * This function handles the two main patterns of poetry layout: * 1. Single wide lines: Complete poetic lines that appear as one observation * 2. Hemistich pairs: Poetry lines split into two balanced parts (hemistichs) * * For single observations, it checks if the line is wide, centered, and has * low word density compared to prose content in the document. * * For pairs of observations, it validates them as poetry hemistichs based on * width similarity, word count similarity, and overall centering when combined. * * Groups with more than 2 observations are not considered poetic content * as they don't match common poetry formatting patterns. * * @param group - Array of observations to analyze (typically 1-2 items for poetry) * @param imageWidth - Total width of the document/image in pixels * @param avgProseWordDensity - Average word density of prose content for comparison * @param options - Poetry detection configuration options * @returns True if the group represents poetic content */ const isPoeticGroup = (group, imageWidth, avgProseWordDensity, options = DEFAULT_POETRY_OPTIONS) => { const minWidthRatioForMerged = options.minWidthRatioForMerged ?? DEFAULT_MIN_WIDTH_RATIO_FOR_MERGED; if (group.length === 1 && minWidthRatioForMerged !== null) return isWidePoeticLine(group[0], imageWidth, avgProseWordDensity, { ...options, minWidthRatioForMerged }); if (group.length === 2) return isPoetryPair(group[0], group[1], imageWidth, options); return false; }; //#endregion //#region src/utils/paragraphs.ts const DEFAULT_PARAGRAPH_OPTIONS = { verticalJumpFactor: 2, widthTolerance: .85 }; const isPoetryPairGroup = (group) => group.length === 2 && group.every((item) => item.isPoetic); /** * Preprocesses observations by filtering noise, flipping coordinates for RTL text, * and normalizing x-coordinates for proper alignment. * * @param observations - Array of text observations to preprocess * @param imageWidth - Total width of the document/image in pixels * @param dpiX - Horizontal DPI for coordinate normalization * @param options - Optional logging configuration * @returns Preprocessed observations ready for line grouping */ const flipAndAlignObservations = (observations, imageWidth, dpiX, options = {}) => { observations = observations.filter(filterNoisy