kokokor
Version:
A lightweight TypeScript library designed to reconstruct paragraphs from OCRed inputs.
1,113 lines (1,106 loc) • 56.4 kB
JavaScript
//#region src/utils/constants.ts
/**
 * Number of typographic points in one inch (72 pt = 1 in).
 *
 * Despite the name, this is used as a points-per-inch divisor: dividing a
 * length in points by it yields inches, and dividing a DPI value by it yields
 * the scale factor relative to a 72 DPI coordinate space.
 */
const PTS_TO_INCHES = 72;
/**
 * Baseline tuning values for poetry detection.
 *
 * Intended as a balanced starting point across document types, trading off
 * false positives against false negatives. All entries are plain numbers
 * (ratios or counts); callers may override any of them.
 */
const DEFAULT_POETRY_OPTIONS = {
    centerToleranceRatio: 0.05,
    maxVerticalGapRatio: 2,
    minMarginRatio: 0.1,
    minWidthRatioForMerged: 0.6,
    minWordCount: 2,
    pairWidthSimilarityRatio: 0.4,
    pairWordCountSimilarityRatio: 0.5,
    wordDensityComparisonRatio: 0.95
};
/**
 * Fallback settings used when turning OCR observations into text lines.
 *
 * Notable defaults: right-to-left processing is enabled, no horizontal lines
 * or rectangles are supplied, poetry detection uses DEFAULT_POETRY_OPTIONS
 * (shared by reference, not copied), and poetry pairs are joined with a
 * single space.
 */
const DEFAULT_OBSERVATIONS_TO_TEXT_LINES_OPTIONS = {
    centerToleranceRatio: 0.05,
    horizontalLines: [],
    isRTL: true,
    minMarginRatio: 0.2,
    pixelTolerance: 5,
    poetryDetectionOptions: DEFAULT_POETRY_OPTIONS,
    poetryPairDelimiter: " ",
    rectangles: []
};
/**
 * Upper bound on the number of words expected in a typical prose line.
 *
 * A line with more words than this may indicate a formatting problem or
 * merged content that should be split during text processing.
 */
const MAX_PROSE_WORD_COUNT = 25;
/**
 * Regex that matches a single character of common prose punctuation.
 *
 * The character class contains:
 * - Arabic and English commas (، ,)
 * - Arabic and English semicolons (؛ ;)
 * - Arabic and English question marks (؟ ?)
 * - Arabic and English periods/full stops (۔ .)
 * - Colons (:)
 * - Opening and closing parentheses
 *
 * The pattern is unanchored and has no flags, so testing a string with it
 * succeeds as soon as any one of these characters occurs anywhere in it.
 */
const PROSE_PUNCTUATION_PATTERN = /[،,؛;؟?۔.:()]/;
/**
 * Percentile of line widths taken as the robust reference width when
 * grouping lines into paragraphs.
 */
const PARAGRAPH_WIDTH_PERCENTILE = 0.75;
/**
 * Percentile of candidate-line x positions used to derive the right-edge
 * baseline.
 */
const PARAGRAPH_BASELINE_PERCENTILE = 0.25;
/**
 * Fraction of the reference width a line must be offset from the baseline to
 * count as indented. This is the page-scale part of the indent threshold; it
 * is combined with a line-height-based floor so the result stays sensible at
 * different resolutions.
 */
const PARAGRAPH_INDENT_THRESHOLD_RATIO = 0.04;
/**
 * Absolute lower bound, in pixels, on the indentation threshold.
 *
 * Calibrated for 72 DPI coordinate spaces, where 3 px is roughly 0.042 in
 * (about 1.06 mm). It only matters as a hard floor when line heights are
 * tiny or noisy.
 */
const PARAGRAPH_MIN_INDENT_PX = 3;
/**
 * Lower bound on the indentation threshold, as a fraction of the typical
 * line height. Keeps the threshold stable across coordinate scales.
 */
const PARAGRAPH_MIN_INDENT_HEIGHT_RATIO = 0.15;
/**
 * Minimum width, as a fraction of the short-line threshold width, for a line
 * to take part in indentation checks.
 */
const PARAGRAPH_MIN_INDENT_CANDIDATE_WIDTH_RATIO = 0.7;
//#endregion
//#region src/utils/grouping.ts
/**
 * Buckets items by their numeric `index` property.
 *
 * Each item lands in the sub-array at position `index` of the result, in the
 * order the items were given. The `index` property itself is stripped from
 * the stored copies; the input items are not modified. If some index values
 * are never used, the corresponding slots of the result stay empty.
 *
 * @template T - Object type carrying a numeric `index` property
 * @param items - Items to distribute into groups
 * @returns Array of groups, positioned by index, holding the items without their `index`
 *
 * @example
 * ```typescript
 * groupByIndex([
 *   { text: "Hello", index: 0 },
 *   { text: "World", index: 0 },
 *   { text: "Goodbye", index: 1 }
 * ]);
 * // => [[{ text: "Hello" }, { text: "World" }], [{ text: "Goodbye" }]]
 * ```
 */
const groupByIndex = (items) => {
    const buckets = [];
    for (const entry of items) {
        const { index, ...rest } = entry;
        // Lazily create the bucket the first time an index is seen.
        const bucket = buckets[index] || (buckets[index] = []);
        bucket.push(rest);
    }
    return buckets;
};
/**
* Sorts items within each group horizontally by their x-coordinate.
*
* This ensures proper reading order (left-to-right for LTR languages) for items
* within the same line or group. The function creates a copy of the input array
* to avoid modifying the original data structure.
*
* @template T - Type extending an object with a bbox containing x-coordinate
* @param grouped - Array of item groups to be sorted horizontally
* @returns A new array with the same structure but with items sorted by x-coordinate within each group
*
* @example
* ```typescript
* const groups = [
* [{ bbox: { x: 100 }, text: "World" }, { bbox: { x: 50 }, text: "Hello" }]
* ];
* const sorted = sortGroupsHorizontally(groups);
* // Result: [[{ bbox: { x: 50 }, text: "Hello" }, { bbox: { x: 100 }, text: "World" }]]
* ```
*/
const sortGroupsHorizontally = (grouped) => {
const groups = grouped.slice();
for (let i = 0; i < groups.length; i++) groups[i] = groups[i].toSorted((a, b) => a.bbox.x - b.bbox.x);
return groups;
};
/**
 * Collapses a group of observations into one observation.
 *
 * The result copies every property of the first observation, then replaces
 * `bbox` with the smallest box enclosing all members and `text` with the
 * members' texts joined (in group order) by `delimiter`.
 *
 * @param group The observations to merge; must contain at least one element.
 * @param delimiter Text placed between consecutive observation texts (defaults to a single space).
 * @returns A single observation spanning the whole group.
 */
const mergeObservations = (group, delimiter = " ") => {
    const [first, ...rest] = group;
    const seed = {
        bottom: first.bbox.y + first.bbox.height,
        left: first.bbox.x,
        right: first.bbox.x + first.bbox.width,
        text: first.text,
        top: first.bbox.y
    };
    // Grow the enclosing box and extend the text one observation at a time.
    const merged = rest.reduce((acc, { bbox, text }) => ({
        bottom: Math.max(acc.bottom, bbox.y + bbox.height),
        left: Math.min(acc.left, bbox.x),
        right: Math.max(acc.right, bbox.x + bbox.width),
        text: acc.text + `${delimiter}${text}`,
        top: Math.min(acc.top, bbox.y)
    }), seed);
    return {
        ...first,
        bbox: {
            height: merged.bottom - merged.top,
            width: merged.right - merged.left,
            x: merged.left,
            y: merged.top
        },
        text: merged.text
    };
};
/**
 * Reduces every group of observations to a single observation.
 *
 * - A group with exactly one member is passed through as-is (same object).
 * - Any other group is handed to `mergeObservations` with its default
 *   delimiter, producing one observation whose bounding box encloses all
 *   members, whose text is the members' texts joined by single spaces, and
 *   whose remaining properties come from the first member.
 *
 * Typically used to fold word-level OCR results into lines, or line segments
 * into paragraphs. Every group must contain at least one observation.
 *
 * @template T - Observation type (must have `bbox` and `text`)
 * @param grouped - Groups of observations to merge
 * @returns One merged observation per group, in group order
 *
 * @example
 * ```typescript
 * mergeGroupedObservations([
 *   [
 *     { bbox: { x: 0, y: 0, width: 50, height: 20 }, text: "Hello" },
 *     { bbox: { x: 60, y: 0, width: 50, height: 20 }, text: "world" }
 *   ]
 * ]);
 * // => [{ bbox: { x: 0, y: 0, width: 110, height: 20 }, text: "Hello world" }]
 * ```
 */
const mergeGroupedObservations = (grouped) => Array.from(grouped, (group) => group.length === 1 ? group[0] : mergeObservations(group));
//#endregion
//#region src/utils/layout.ts
/**
 * Tells whether a bounding box sits in the horizontal middle of the page with
 * clear space on both sides.
 *
 * Two conditions must both hold:
 * 1. The free space to the left of the box and to the right of the box is
 *    each at least `imageWidth * options.minMarginRatio`.
 * 2. The box's horizontal midpoint lies within
 *    `imageWidth * options.centerToleranceRatio` of the page's midpoint.
 *
 * The margin requirement rejects wide boxes that span most of the page and
 * only happen to have their midpoint near the page center.
 *
 * @param bbox - Box to test (uses `x` and `width`)
 * @param imageWidth - Full width of the page/image in pixels
 * @param options - Centering criteria
 * @param options.centerToleranceRatio - Allowed midpoint deviation as a fraction of image width (e.g. 0.05 = 5%)
 * @param options.minMarginRatio - Required margin on each side as a fraction of image width (e.g. 0.1 = 10%)
 * @returns True when both conditions hold, false otherwise
 *
 * @example
 * ```typescript
 * isObservationCentered({ width: 286, x: 298 }, 960, { centerToleranceRatio: 0.05, minMarginRatio: 0.1 }); // true
 *
 * // Wide box covering most of the page: margins too small
 * isObservationCentered({ width: 2026, x: 232 }, 2481, { centerToleranceRatio: 0.05, minMarginRatio: 0.1 }); // false
 * ```
 */
const isObservationCentered = (bbox, imageWidth, options) => {
    const { centerToleranceRatio, minMarginRatio } = options;
    const requiredMargin = imageWidth * minMarginRatio;
    const spaceOnLeft = bbox.x;
    const spaceOnRight = imageWidth - (bbox.x + bbox.width);
    const hasBothMargins = spaceOnLeft >= requiredMargin && spaceOnRight >= requiredMargin;
    if (!hasBothMargins) return false;
    const offsetFromPageCenter = Math.abs(bbox.x + bbox.width / 2 - imageWidth / 2);
    return offsetFromPageCenter <= imageWidth * centerToleranceRatio;
};
/**
 * Keeps only the horizontal lines that are not contained in any rectangle.
 *
 * Containment is decided by `isBoundingBoxContained`, with each rectangle
 * expanded by `tolerance` pixels. Useful for discarding rule lines that
 * belong to boxed sections while retaining free-standing separators.
 *
 * @param rectangles - Rectangles to test containment against
 * @param horizontalLines - Candidate horizontal lines
 * @param tolerance - Pixel tolerance for the containment test (default: 5)
 * @returns The horizontal lines, in original order, that lie inside none of the rectangles
 */
const filterHorizontalLinesOutsideRectangles = (rectangles, horizontalLines, tolerance = 5) => {
    const liesOutsideEveryRectangle = (line) => rectangles.every((rect) => !isBoundingBoxContained(line, rect, tolerance));
    return horizontalLines.filter((line) => liesOutsideEveryRectangle(line));
};
/**
 * Returns the y-coordinate of the last usable horizontal line.
 *
 * Steps:
 * 1. When both inputs are non-empty, drop lines contained in any rectangle.
 * 2. Drop lines whose `y` is not greater than `pixelTolerance` (i.e. lines
 *    hugging the top edge).
 * 3. Take the final element of what remains.
 *
 * "Last" means last in array order; presumably the caller supplies lines
 * ordered top to bottom so this is the lowest line - confirm with caller.
 * The result is used as a footer boundary: text below it is typically
 * footnotes.
 *
 * @param rectangles - Rectangles whose interior lines should be ignored
 * @param horizontalLines - Candidate horizontal lines
 * @param pixelTolerance - Pixel tolerance for containment and the top-edge cutoff (default: 5)
 * @returns The `y` of the last remaining line, or undefined when none remain
 */
const getLastHorizontalLineY = (rectangles, horizontalLines, pixelTolerance = 5) => {
    const needsRectangleFilter = rectangles.length > 0 && horizontalLines.length > 0;
    const outsideRectangles = needsRectangleFilter
        ? filterHorizontalLinesOutsideRectangles(rectangles, horizontalLines, pixelTolerance)
        : horizontalLines;
    const awayFromTopEdge = outsideRectangles.filter((line) => line.y > pixelTolerance);
    const lastLine = awayFromTopEdge[awayFromTopEdge.length - 1];
    return lastLine?.y;
};
/**
 * Tests whether `inner` lies within `outer` once `outer` has been grown by
 * `tolerance` pixels on every side.
 *
 * The tolerance makes the check forgiving of small overshoots caused by OCR
 * inaccuracies or minor positioning differences. Edges that touch exactly
 * count as contained.
 *
 * @param inner - Box that should be inside
 * @param outer - Box that should enclose `inner`
 * @param tolerance - Pixels by which each edge of `outer` is pushed outward
 * @returns True when all four edges of `inner` fall within the expanded `outer`
 */
const isBoundingBoxContained = (inner, outer, tolerance) => {
    const fitsHorizontally = inner.x >= outer.x - tolerance
        && inner.x + inner.width <= outer.x + outer.width + tolerance;
    const fitsVertically = inner.y >= outer.y - tolerance
        && inner.y + inner.height <= outer.y + outer.height + tolerance;
    return fitsHorizontally && fitsVertically;
};
/**
 * Turns a corner-based box `[x1, y1, x2, y2]` into `{ x, y, width, height }`.
 *
 * `(x1, y1)` is taken as the top-left corner and `(x2, y2)` as the
 * bottom-right corner; width and height are the coordinate differences.
 *
 * @param box - Four numbers: left, top, right, bottom
 * @returns Bounding box object anchored at the top-left corner
 */
const mapMatrixToBoundingBox = (box) => {
    const [left, top, right, bottom] = box;
    const width = right - left;
    const height = bottom - top;
    return { height, width, x: left, y: top };
};
/**
 * Measures the vertical spacing between consecutive items to tell normal
 * line-to-line gaps apart from gaps inside a single line.
 *
 * The gap between neighbours is the difference of their `bbox.y` values.
 * After sorting the gaps ascending, the entry at `floor(count * 0.75)` is the
 * "typical" gap and the entry at `floor(count * 0.5)` is the median gap. The
 * intra-line threshold is the smaller of 60% of the median and 40% of the
 * typical gap.
 *
 * With fewer than three items there is not enough data and both values are 0.
 *
 * @param sortedItems - Observations ordered by y-coordinate (top to bottom)
 * @returns Object with `typicalGap` (upper-quartile gap, i.e. normal line spacing)
 * and `minIntraLineGap` (threshold below which a gap is considered intra-line)
 */
const analyzeLineSpacing = (sortedItems) => {
    const itemCount = sortedItems.length;
    if (itemCount < 3) return { minIntraLineGap: 0, typicalGap: 0 };
    const ascendingGaps = Array.from(
        { length: itemCount - 1 },
        (_, i) => sortedItems[i + 1].bbox.y - sortedItems[i].bbox.y
    ).sort((a, b) => a - b);
    const gapAt = (fraction) => ascendingGaps[Math.floor(ascendingGaps.length * fraction)];
    const medianGap = gapAt(0.5);
    const typicalGap = gapAt(0.75);
    return {
        minIntraLineGap: Math.min(medianGap * 0.6, typicalGap * 0.4),
        typicalGap
    };
};
/**
 * Chooses a line height factor from how large the typical vertical gap is
 * compared with the average item height.
 *
 * The factor controls how much vertical distance is tolerated when grouping
 * observations into the same line: smaller factors split lines sooner,
 * larger factors tolerate more distance.
 *
 * @param heights - Heights taken from the items' bounding boxes
 * @param typicalGap - Typical vertical gap between lines (see analyzeLineSpacing)
 * @returns
 * - 0.3 when `heights` is empty (no data to adapt to)
 * - 0.15 when the gap is below 0.8x the mean height
 * - 0.25 when the gap is below 1.2x the mean height
 * - 0.4 otherwise
 */
const computeAdaptiveLineHeightFactor = (heights, typicalGap) => {
    const count = heights.length;
    if (count === 0) return 0.3;
    let heightSum = 0;
    for (let i = 0; i < count; i++) heightSum += heights[i];
    const ratio = typicalGap / (heightSum / count);
    return ratio < 0.8 ? 0.15 : ratio < 1.2 ? 0.25 : 0.4;
};
//#endregion
//#region src/utils/marking.ts
/**
 * How many near-baseline list-start candidate lines must be present before
 * the list-start heuristics are switched on.
 */
const LIST_START_MIN_CANDIDATES = 3;
/**
 * Minimum vertical distance between two consecutive list starts, expressed
 * as a multiple of the smaller of the two line heights.
 */
const LIST_START_GAP_HEIGHT_FACTOR = 0.9;
/**
 * Indentation threshold (fraction of the reference width) applied to
 * repeated list-start lines; smaller than the general paragraph threshold.
 */
const LIST_START_INDENT_THRESHOLD_RATIO = 0.03;
/**
 * Low percentile of line x positions used as the stable baseline for list
 * starts.
 */
const LIST_START_BASELINE_PERCENTILE = 0.1;
/**
 * How many short, indented continuation lines are needed to confirm that a
 * block has list structure.
 */
const LIST_START_MIN_SHORT_INDENTED_LINES = 2;
/**
 * Decides whether `current` starts a new line relative to `prev`.
 *
 * The vertical offset is `current.bbox.y - prev.bbox.y`. Two checks apply:
 *
 * - Primary: the offset exceeds
 *   `averageHeight * effectiveFactor + effectiveYTolerance`.
 * - Secondary (only when the primary check fails and the spacing analysis
 *   produced a positive `minIntraLineGap`): the offset exceeds
 *   `minIntraLineGap`, and also exceeds the smaller of 20% of the average
 *   height and `minIntraLineGap`.
 *
 * NOTE(review): for finite heights the second half of the secondary check is
 * implied by the first, so an offset above `minIntraLineGap` separates the
 * lines regardless of `effectiveYTolerance`.
 *
 * @template T - Object type with a bounding box
 * @param prev - Previous item in top-to-bottom order
 * @param current - Item being evaluated
 * @param effectiveFactor - Line height factor used in the primary threshold
 * @param effectiveYTolerance - Vertical tolerance in pixels, already scaled for DPI
 * @param spacingAnalysis - Document spacing analysis (reads `minIntraLineGap`)
 * @returns True when the two items belong on different lines
 */
const shouldSeparateLines = (prev, current, effectiveFactor, effectiveYTolerance, spacingAnalysis) => {
    const verticalOffset = current.bbox.y - prev.bbox.y;
    const meanHeight = (prev.bbox.height + current.bbox.height) * 0.5;
    if (verticalOffset > meanHeight * effectiveFactor + effectiveYTolerance) return true;
    const { minIntraLineGap } = spacingAnalysis;
    if (minIntraLineGap > 0 && verticalOffset > minIntraLineGap) {
        return verticalOffset > Math.min(meanHeight * 0.2, minIntraLineGap);
    }
    return false;
};
/**
 * Walks vertically sorted items and tags each with the index of the line it
 * belongs to.
 *
 * The first item is on line 0. Each following item is compared with the item
 * immediately before it via `shouldSeparateLines`; when that reports a break
 * the line counter is incremented, otherwise the item stays on the current
 * line. Items are shallow-copied, so the input objects are left untouched.
 *
 * An empty input yields an empty array. (Previously an empty input produced a
 * single phantom `{ index: 0 }` entry, which downstream grouping would treat
 * as a real item without a bounding box.)
 *
 * @template T - Object type with a bounding box
 * @param sortedItems - Items ordered by y-coordinate (top to bottom)
 * @param effectiveFactor - Line height factor used for the break threshold
 * @param effectiveYTolerance - Vertical tolerance in pixels, already scaled for DPI
 * @param spacingAnalysis - Document spacing analysis results
 * @returns Copies of the items, in input order, each with an added `index` property
 */
const assignLineIndices = (sortedItems, effectiveFactor, effectiveYTolerance, spacingAnalysis) => {
    if (sortedItems.length === 0) return [];
    let currentLine = 0;
    return sortedItems.map((item, i) => {
        // Only items after the first can open a new line.
        if (i > 0 && shouldSeparateLines(sortedItems[i - 1], item, effectiveFactor, effectiveYTolerance, spacingAnalysis)) {
            currentLine += 1;
        }
        return {
            ...item,
            index: currentLine
        };
    });
};
/**
 * Assigns a line index to every item based on vertical proximity.
 *
 * Procedure:
 * 1. Copy the items and sort them by `bbox.y` (top to bottom).
 * 2. Scale `pixelTolerance` from a 72 DPI reference to the document's DPI.
 * 3. If a truthy `lineHeightFactor` is given, use it as-is and skip spacing
 *    analysis (both spacing values are treated as 0). Otherwise analyze the
 *    document's vertical gaps and derive an adaptive factor from the item
 *    heights and the typical gap.
 * 4. Walk the sorted items and assign line indices.
 * 5. Return the tagged items ordered by line index, then by y-coordinate.
 *
 * Note that a `lineHeightFactor` of 0 counts as "not provided".
 *
 * @template T - Object type with a bounding box
 * @param items - Items to group into lines (not mutated)
 * @param dpi - Document DPI, used to scale the pixel tolerance
 * @param pixelTolerance - Extra vertical tolerance in pixels, expressed at 72 DPI
 * @param lineHeightFactor - Optional fixed line height factor; computed adaptively when omitted
 * @returns Copies of the items with an `index` property, sorted by line then y-coordinate
 *
 * @example
 * ```typescript
 * const observations = [
 *   { bbox: { x: 0, y: 0, width: 100, height: 20 }, text: "First line" },
 *   { bbox: { x: 0, y: 25, width: 100, height: 20 }, text: "Second line" }
 * ];
 * const lines = indexItemsAsLines(observations, 300, 5);
 * // First item gets index 0, second item gets index 1
 * ```
 */
const indexItemsAsLines = (items, dpi, pixelTolerance, lineHeightFactor) => {
    const topToBottom = items.toSorted((a, b) => a.bbox.y - b.bbox.y);
    const scaledTolerance = pixelTolerance * (dpi / PTS_TO_INCHES);
    let spacing;
    let factor;
    if (lineHeightFactor) {
        // Caller fixed the factor: no document-wide spacing analysis needed.
        spacing = { minIntraLineGap: 0, typicalGap: 0 };
        factor = lineHeightFactor;
    } else {
        spacing = analyzeLineSpacing(topToBottom);
        factor = computeAdaptiveLineHeightFactor(items.map((entry) => entry.bbox.height), spacing.typicalGap);
    }
    const byLineThenY = (a, b) => (a.index - b.index) || (a.bbox.y - b.bbox.y);
    return assignLineIndices(topToBottom, factor, scaledTolerance, spacing).toSorted(byLineThenY);
};
/**
 * Derives the rasterization DPI from an image's pixel size and the source
 * PDF page's size in points.
 *
 * Each PDF dimension is converted from points to inches (72 points per
 * inch), then the pixel dimension is divided by it. The horizontal and
 * vertical values are computed independently.
 *
 * @param imageSize - Rasterized image dimensions in pixels (`width`, `height`)
 * @param pdfSize - PDF page dimensions in points (`width`, `height`)
 * @returns Object with `x` (horizontal) and `y` (vertical) DPI
 *
 * @example
 * ```typescript
 * const dpi = calculateDPI({ width: 2480, height: 3508 }, { width: 595, height: 842 });
 * // Approximately { x: 300, y: 300 } for an A4 page scanned at 300 DPI
 * ```
 */
const calculateDPI = (imageSize, pdfSize) => {
    const pageWidthInches = pdfSize.width / PTS_TO_INCHES;
    const pageHeightInches = pdfSize.height / PTS_TO_INCHES;
    return {
        x: imageSize.width / pageWidthInches,
        y: imageSize.height / pageHeightInches
    };
};
/**
 * Picks the element at a given percentile of an ascending-sorted array.
 *
 * The position is `floor((length - 1) * percentile)`, clamped to the valid
 * index range, so percentiles below 0 yield the first element and
 * percentiles above 1 yield the last. An empty array yields undefined.
 */
const pickPercentile = (sortedValues, percentile) => {
    const lastIndex = sortedValues.length - 1;
    const rawIndex = Math.floor(lastIndex * percentile);
    const clampedIndex = Math.min(lastIndex, Math.max(0, rawIndex));
    return sortedValues[clampedIndex];
};
/**
 * Reports whether a line's start (`bbox.x`) lies more than `indentThreshold`
 * beyond `baselineX`. An offset exactly equal to the threshold does not count.
 */
const isIndentedLine = (item, baselineX, indentThreshold) => {
    const offsetFromBaseline = item.bbox.x - baselineX;
    return offsetFromBaseline > indentThreshold;
};
/**
 * Reports whether a line could open a list item: it must not be indented
 * relative to `baselineX` (per `isIndentedLine`) and its width must be at
 * least `minWidth`.
 */
const isListStartCandidate = (item, baselineX, indentThreshold, minWidth) => {
    if (isIndentedLine(item, baselineX, indentThreshold)) return false;
    return item.bbox.width >= minWidth;
};
/**
 * Computes the reference line width for a block of lines.
 *
 * With four or more lines, this is the PARAGRAPH_WIDTH_PERCENTILE percentile
 * of the ascending-sorted widths; with fewer lines the widest line is used
 * (undefined for an empty block).
 */
const computeReferenceWidth = (items) => {
    const ascendingWidths = items.map((line) => line.bbox.width).toSorted((a, b) => a - b);
    if (ascendingWidths.length < 4) return ascendingWidths[ascendingWidths.length - 1];
    return pickPercentile(ascendingWidths, PARAGRAPH_WIDTH_PERCENTILE);
};
/**
 * Computes the x baseline that indentation is measured against.
 *
 * Only lines at least `minIndentCandidateWidth` wide vote for the baseline;
 * if no line is that wide, all lines vote. The baseline is the
 * PARAGRAPH_BASELINE_PERCENTILE percentile of the voters' ascending-sorted x
 * positions.
 *
 * @returns `allX` (every line's x, sorted ascending) and `baselineX`
 */
const computeBaselineX = (items, minIndentCandidateWidth) => {
    const ascending = (a, b) => a - b;
    const allX = items.map((line) => line.bbox.x).toSorted(ascending);
    const wideLineX = items
        .filter((line) => line.bbox.width >= minIndentCandidateWidth)
        .map((line) => line.bbox.x)
        .toSorted(ascending);
    const voters = wideLineX.length > 0 ? wideLineX : allX;
    return {
        allX,
        baselineX: pickPercentile(voters, PARAGRAPH_BASELINE_PERCENTILE)
    };
};
/**
 * Computes the smallest indentation threshold allowed for a block: the larger
 * of PARAGRAPH_MIN_INDENT_PX and PARAGRAPH_MIN_INDENT_HEIGHT_RATIO times the
 * median line height.
 */
const computeIndentFloor = (items) => {
    const ascendingHeights = items.map((line) => line.bbox.height).toSorted((a, b) => a - b);
    const medianHeight = pickPercentile(ascendingHeights, 0.5);
    const heightBasedFloor = medianHeight * PARAGRAPH_MIN_INDENT_HEIGHT_RATIO;
    return Math.max(PARAGRAPH_MIN_INDENT_PX, heightBasedFloor);
};
/**
 * Decides from geometry alone whether a block looks like a repeated list
 * (for example numbered footnote items). No semantic markers such as
 * `isFootnote` or text prefixes are consulted.
 *
 * All of the following must hold:
 * - at least LIST_START_MIN_CANDIDATES lines qualify as list-start candidates;
 * - at least LIST_START_MIN_SHORT_INDENTED_LINES lines are short (narrower
 *   than `minIndentCandidateWidth`) and indented;
 * - at least one "bridge" exists: a short indented line whose immediate
 *   neighbours on both sides are list-start candidates;
 * - at least one line is narrower than `thresholdWidth`.
 */
const shouldUseListStartSignal = (items, thresholdWidth, minIndentCandidateWidth, listStartBaselineX, listStartIndentThreshold) => {
    const isStart = (line) => isListStartCandidate(line, listStartBaselineX, listStartIndentThreshold, minIndentCandidateWidth);
    const isShortContinuation = (line) => line.bbox.width < minIndentCandidateWidth && isIndentedLine(line, listStartBaselineX, listStartIndentThreshold);
    const startCount = items.filter((line) => isStart(line)).length;
    if (startCount < LIST_START_MIN_CANDIDATES) return false;
    const continuationCount = items.filter((line) => isShortContinuation(line)).length;
    if (continuationCount < LIST_START_MIN_SHORT_INDENTED_LINES) return false;
    const lastPosition = items.length - 1;
    // A bridge is start -> short continuation -> start on consecutive lines.
    const hasBridge = items.some((line, i) => i > 0 && i < lastPosition && isShortContinuation(line) && isStart(items[i - 1]) && isStart(items[i + 1]));
    if (!hasBridge) return false;
    return items.some((line) => line.bbox.width < thresholdWidth);
};
/**
 * Precomputes the block-wide measurements used by the paragraph break signals.
 *
 * - `thresholdWidth`: reference width times `widthTolerance`; narrower lines are "short".
 * - `minIndentCandidateWidth`: `thresholdWidth` times PARAGRAPH_MIN_INDENT_CANDIDATE_WIDTH_RATIO.
 * - `baselineX`: x baseline for ordinary indentation checks.
 * - `listStartBaselineX`: LIST_START_BASELINE_PERCENTILE percentile of all x positions.
 * - `indentThreshold` / `listStartIndentThreshold`: the respective ratio of the
 *   reference width, but never below the line-height-based indent floor.
 * - `shouldUseListStartSignal`: whether the block shows list-like geometry.
 */
const buildParagraphMetrics = (items, widthTolerance) => {
    const referenceWidth = computeReferenceWidth(items);
    const thresholdWidth = referenceWidth * widthTolerance;
    const minIndentCandidateWidth = thresholdWidth * PARAGRAPH_MIN_INDENT_CANDIDATE_WIDTH_RATIO;
    const baseline = computeBaselineX(items, minIndentCandidateWidth);
    const listStartBaselineX = pickPercentile(baseline.allX, LIST_START_BASELINE_PERCENTILE);
    const indentFloor = computeIndentFloor(items);
    const thresholdFor = (ratio) => Math.max(referenceWidth * ratio, indentFloor);
    const indentThreshold = thresholdFor(PARAGRAPH_INDENT_THRESHOLD_RATIO);
    const listStartIndentThreshold = thresholdFor(LIST_START_INDENT_THRESHOLD_RATIO);
    const listSignal = shouldUseListStartSignal(items, thresholdWidth, minIndentCandidateWidth, listStartBaselineX, listStartIndentThreshold);
    return {
        baselineX: baseline.baselineX,
        indentThreshold,
        listStartBaselineX,
        listStartIndentThreshold,
        minIndentCandidateWidth,
        shouldUseListStartSignal: listSignal,
        thresholdWidth
    };
};
/**
 * Detects a paragraph break caused by an unusually large vertical jump before
 * the line at `index`.
 *
 * - The first line never breaks.
 * - For the second line, the previous line must be full-width and the gap to
 *   it must exceed the previous line's height times `verticalJumpFactor`.
 * - For later lines, the two preceding lines must both be full-width. If the
 *   gap between those two is positive, the current gap must exceed it times
 *   `verticalJumpFactor`. If that earlier gap is exactly zero, the current
 *   gap must be positive and exceed half the current line's height times
 *   `verticalJumpFactor`. A negative earlier gap never breaks.
 *
 * Gaps are differences of `bbox.y`; "full-width" means not narrower than
 * `thresholdWidth`.
 */
const hasVerticalBreakSignal = (items, index, thresholdWidth, verticalJumpFactor) => {
    if (index === 0) return false;
    const current = items[index];
    const previous = items[index - 1];
    const isShort = (line) => line.bbox.width < thresholdWidth;
    if (index === 1) {
        return !isShort(previous) && current.bbox.y - previous.bbox.y > previous.bbox.height * verticalJumpFactor;
    }
    const beforePrevious = items[index - 2];
    if (isShort(previous) || isShort(beforePrevious)) return false;
    const currentGap = current.bbox.y - previous.bbox.y;
    const earlierGap = previous.bbox.y - beforePrevious.bbox.y;
    if (earlierGap > 0) return currentGap > earlierGap * verticalJumpFactor;
    if (earlierGap !== 0) return false;
    // Previous two lines share a y: fall back to the current line's height.
    return currentGap > 0 && currentGap > current.bbox.height * 0.5 * verticalJumpFactor;
};
/**
 * Detects a paragraph break caused by a newly indented line at `index`.
 *
 * The line breaks only when it is indented relative to the baseline, it is at
 * least `metrics.minIndentCandidateWidth` wide, the previous line is not
 * short (narrower than `metrics.thresholdWidth`), and the previous line is
 * not itself indented. The first line never breaks.
 */
const hasIndentBreakSignal = (items, index, metrics) => {
    if (index === 0) return false;
    const current = items[index];
    const previous = items[index - 1];
    const indented = (line) => isIndentedLine(line, metrics.baselineX, metrics.indentThreshold);
    if (!indented(current)) return false;
    if (previous.bbox.width < metrics.thresholdWidth) return false;
    if (current.bbox.width < metrics.minIndentCandidateWidth) return false;
    return !indented(previous);
};
/**
 * Detects a paragraph break between two consecutive list-start lines.
 *
 * Only active when `metrics.shouldUseListStartSignal` is set. The line at
 * `index` breaks when both it and the previous line are list-start
 * candidates, the previous line is not short (narrower than
 * `metrics.thresholdWidth`), and the vertical distance between them is at
 * least LIST_START_GAP_HEIGHT_FACTOR times the smaller of their heights.
 * The first line never breaks.
 */
const hasListStartBreakSignal = (items, index, metrics) => {
    if (index === 0 || !metrics.shouldUseListStartSignal) return false;
    const current = items[index];
    const previous = items[index - 1];
    const isStart = (line) => isListStartCandidate(line, metrics.listStartBaselineX, metrics.listStartIndentThreshold, metrics.minIndentCandidateWidth);
    if (!isStart(current) || !isStart(previous)) return false;
    if (previous.bbox.width < metrics.thresholdWidth) return false;
    const verticalDistance = current.bbox.y - previous.bbox.y;
    const requiredDistance = Math.min(previous.bbox.height, current.bbox.height) * LIST_START_GAP_HEIGHT_FACTOR;
    return verticalDistance >= requiredDistance;
};
/**
 * Names the first break signal that fires for the line at `index`.
 *
 * Signals are tried in fixed priority order - vertical jump, indent, list
 * start - and evaluation stops at the first hit.
 *
 * @returns "vertical", "indent", "list-start", or null when no signal fires
 * (always null for the first line)
 */
const resolveBreakReason = (items, index, verticalJumpFactor, metrics) => {
    if (index === 0) return null;
    const detectors = [
        ["vertical", () => hasVerticalBreakSignal(items, index, metrics.thresholdWidth, verticalJumpFactor)],
        ["indent", () => hasIndentBreakSignal(items, index, metrics)],
        ["list-start", () => hasListStartBreakSignal(items, index, metrics)]
    ];
    const firstHit = detectors.find(([, detect]) => detect());
    return firstHit ? firstHit[0] : null;
};
/**
 * Decides whether the paragraph counter should advance after emitting `item`,
 * i.e. whether `item` is a short line that closes its paragraph.
 *
 * A line at least `thresholdWidth` wide never closes a paragraph. A narrower
 * line does, except when it is a non-first line that itself started a
 * paragraph through the "indent" signal.
 */
const shouldAdvanceAfterShortLine = (item, index, breakReason, thresholdWidth) => {
    const isFullWidth = item.bbox.width >= thresholdWidth;
    const openedByIndent = index !== 0 && breakReason === "indent";
    return !isFullWidth && !openedByIndent;
};
/**
 * Assigns a paragraph index to every line using spacing and width cues.
 *
 * Block-wide metrics are computed once, then the lines are visited in order.
 * Four signals cooperate:
 *
 * 1. **Vertical jump**: the gap before a line is markedly larger than the
 *    preceding gap, and the preceding lines are full-width (short lines are
 *    excluded because they already mark natural breaks).
 * 2. **Indent start**: a line newly indents from the right-edge baseline.
 * 3. **List start**: repeated near-baseline starts with short indented
 *    continuations are treated as separate list items (covers footnote-style
 *    note lists).
 * 4. **Short line**: a line clearly narrower than the reference width ends
 *    its paragraph, so the following line starts a new one.
 *
 * Signals 1-3 open a new paragraph *before* the current line; signal 4
 * advances the counter *after* it. Together they handle ordinary paragraphs,
 * indented paragraph starts, list structures, paragraphs ending in short
 * lines, and headers/footers set apart by extra spacing.
 *
 * @template T - Object type with a bounding box
 * @param items - Lines to group into paragraphs (not mutated)
 * @param verticalJumpFactor - How many times larger a gap must be to count as a paragraph break (e.g. 2.0)
 * @param widthTolerance - Fraction of the reference width below which a line is "short" (0-1, e.g. 0.8)
 * @returns Copies of the items with an `index` property, sorted by paragraph then y-coordinate;
 * an empty array for empty input
 *
 * @example
 * ```typescript
 * const lines = [
 *   { bbox: { y: 0, width: 400, height: 20 }, text: "First paragraph line" },
 *   { bbox: { y: 25, width: 300, height: 20 }, text: "Short line ending" },
 *   { bbox: { y: 55, width: 400, height: 20 }, text: "Second paragraph" }
 * ];
 * const paragraphs = indexItemsAsParagraphs(lines, 2.0, 0.8);
 * // First two lines get index 0, third line gets index 1
 * ```
 */
const indexItemsAsParagraphs = (items, verticalJumpFactor, widthTolerance) => {
    if (items.length === 0) return [];
    const metrics = buildParagraphMetrics(items, widthTolerance);
    const tagged = [];
    let paragraph = 0;
    for (const [position, line] of items.entries()) {
        const breakReason = resolveBreakReason(items, position, verticalJumpFactor, metrics);
        // A break signal opens a new paragraph before this line is recorded.
        if (breakReason !== null) paragraph += 1;
        tagged.push({ ...line, index: paragraph });
        // A short line closes its paragraph after being recorded.
        if (shouldAdvanceAfterShortLine(line, position, breakReason, metrics.thresholdWidth)) paragraph += 1;
    }
    const byParagraphThenY = (a, b) => (a.index - b.index) || (a.bbox.y - b.bbox.y);
    return tagged.sort(byParagraphThenY);
};
//#endregion
//#region src/utils/normalization.ts
/**
 * Mirrors observation x-coordinates horizontally for right-to-left processing.
 *
 * Each observation's `bbox.x` is replaced with its distance from the right
 * edge of the page: `imageWidth - x - width`. After the flip, the rightmost
 * edge of the page acts as x = 0, which lets RTL scripts (Arabic, Hebrew,
 * Farsi, ...) be analyzed with the same start-edge logic. All other
 * properties, including the rest of `bbox`, are copied unchanged; the input
 * is not mutated.
 *
 * @param observations - OCR text observations with bounding boxes
 * @param imageWidth - Full width of the document/image in pixels
 * @returns New array of observation copies with mirrored x-coordinates
 *
 * @example
 * ```typescript
 * mapOcrResultToRTLObservations(
 *   [{ bbox: { x: 100, y: 0, width: 50, height: 20 }, text: "مرحبا" }],
 *   800
 * );
 * // => [{ bbox: { x: 650, y: 0, width: 50, height: 20 }, text: "مرحبا" }]
 * // 800 - 100 - 50 = 650
 * ```
 */
const mapOcrResultToRTLObservations = (observations, imageWidth) => {
    const mirroredX = ({ x, width }) => imageWidth - x - width;
    return observations.map((observation) => {
        const { bbox } = observation;
        return {
            ...observation,
            bbox: { ...bbox, x: mirroredX(bbox) }
        };
    });
};
/**
 * Predicate that decides whether an observation carries enough text to keep.
 *
 * Common Arabic and Latin punctuation (commas, semicolons, question marks,
 * full stops, colons, hyphens and parentheses) is removed from the text first;
 * the observation is kept only when more than one character remains.
 * Observations whose `text` is null or undefined are rejected.
 *
 * @param o - Single observation to evaluate
 * @returns True if the observation should be kept, false if it should be filtered out
 *
 * @example
 * ```typescript
 * const observations = [
 *   { bbox: {...}, text: "Hello world" }, // Kept
 *   { bbox: {...}, text: "a." },          // Filtered: one character left after stripping
 *   { bbox: {...}, text: "(:-)" },        // Filtered: nothing left after stripping
 *   { bbox: {...}, text: "" }             // Filtered: empty
 * ];
 * const clean = observations.filter(filterNoisyObservations);
 * // Result: Only the "Hello world" observation remains
 * ```
 */
const filterNoisyObservations = (o) => {
	const withoutPunctuation = o.text?.replace(/[،,؛;؟?۔.:\-()]/g, "");
	// `undefined > 1` is false, so missing text is rejected here as well.
	return withoutPunctuation?.length > 1;
};
/**
 * Normalizes x-coordinates of observations to create clean alignment.
 *
 * Finds the smallest `bbox.x` among the observations and snaps every observation
 * whose x lies within a small threshold of that minimum onto it. This corrects
 * minor OCR alignment jitter caused by image quality, skew, or engine variations.
 *
 * The threshold is `5 * standardDPI / dpi` pixels.
 * NOTE(review): this makes the pixel threshold shrink as `dpi` grows; confirm the
 * inverse scaling is intended.
 *
 * Observations outside the threshold are returned as the same object references;
 * snapped observations are shallow copies with a new `bbox`.
 *
 * @param observations - Array of text observations to normalize for alignment
 * @param dpi - The dots per inch of the source document used for threshold calculation
 * @param standardDPI - The standard DPI to normalize against (default: 300)
 * @returns A new array of observations with normalized x-coordinates
 *
 * @example
 * ```typescript
 * const observations = [
 *   { bbox: { x: 50, y: 0, width: 100, height: 20 }, text: "Line 1" },
 *   { bbox: { x: 52, y: 25, width: 100, height: 20 }, text: "Line 2" }, // Slightly off
 *   { bbox: { x: 100, y: 50, width: 100, height: 20 }, text: "Indented" }
 * ];
 * const normalized = normalizeObservationsX(observations, 300);
 * // Result: First two lines aligned to x: 50, third line unchanged at x: 100
 * ```
 */
const normalizeObservationsX = (observations, dpi, standardDPI = 300) => {
	const thresholdPx = standardDPI / dpi * 5;
	// Fold instead of `Math.min(...xs)`: spreading a very large array as call
	// arguments can exceed the engine's argument limit and throw a RangeError.
	// Starting from Infinity matches what `Math.min()` returns for no arguments.
	const minX = observations.reduce((smallest, o) => Math.min(smallest, o.bbox.x), Infinity);
	return observations.map((o) => {
		if (Math.abs(o.bbox.x - minX) <= thresholdPx) return {
			...o,
			bbox: {
				...o.bbox,
				x: minX
			}
		};
		return o;
	});
};
/**
 * Produces compact copies of observations for debugging output.
 *
 * Each returned entry contains only `bbox` and `text`:
 *
 * 1. **Coordinate precision**: `x`, `y`, `width` and `height` are truncated to integers.
 * 2. **Text content**: when `truncateText` is true, the text is split on single
 *    spaces and reduced to the first word longer than one character (or an empty
 *    string when there is none); otherwise the text is kept as-is.
 *
 * @param observations - The observations to simplify
 * @param truncateText - Whether to reduce each text to its first multi-character word (default: false)
 * @returns A new array of simplified observations
 *
 * @example
 * ```typescript
 * const observations = [{
 *   bbox: { x: 123.456, y: 78.901, width: 234.567, height: 19.123 },
 *   text: "Hello world from OCR engine"
 * }];
 * const simplified = simplifyObservations(observations, true);
 * // Result: [{
 * //   bbox: { height: 19, width: 234, x: 123, y: 78 },
 * //   text: "Hello"
 * // }]
 * ```
 */
const simplifyObservations = (observations, truncateText = false) => {
	const firstMeaningfulWord = (text) => text.split(" ").find((word) => word.length > 1) ?? "";
	return observations.map(({ bbox, text }) => ({
		bbox: {
			height: Math.trunc(bbox.height),
			width: Math.trunc(bbox.width),
			x: Math.trunc(bbox.x),
			y: Math.trunc(bbox.y)
		},
		text: truncateText ? firstMeaningfulWord(text) : text
	}));
};
//#endregion
//#region src/utils/options.ts
/**
 * Shallow-merges an optional overrides object on top of a defaults object.
 *
 * Override entries whose value is `undefined` are ignored so they cannot mask
 * a default; every other value (including `null`) replaces the default.
 * Nested objects are not merged recursively; callers handle those explicitly
 * when needed. Always returns a fresh object.
 */
const resolveWithDefaults = (defaults, overrides) => {
	if (!overrides) return { ...defaults };
	const definedOverrides = Object.fromEntries(Object.entries(overrides).filter((entry) => entry[1] !== undefined));
	return { ...defaults, ...definedOverrides };
};
//#endregion
//#region src/utils/poetry.ts
// Per-option fallbacks for poetry detection, read from DEFAULT_POETRY_OPTIONS.
// The literal after `??` is used only if the shared default is null or undefined.
const DEFAULT_CENTER_TOLERANCE = DEFAULT_POETRY_OPTIONS.centerToleranceRatio ?? .05;
const DEFAULT_MIN_MARGIN = DEFAULT_POETRY_OPTIONS.minMarginRatio ?? .1;
const DEFAULT_MIN_WIDTH_RATIO_FOR_MERGED = DEFAULT_POETRY_OPTIONS.minWidthRatioForMerged ?? .6;
const DEFAULT_MIN_WORD_COUNT = DEFAULT_POETRY_OPTIONS.minWordCount ?? 2;
const DEFAULT_PAIR_WIDTH_SIMILARITY = DEFAULT_POETRY_OPTIONS.pairWidthSimilarityRatio ?? .4;
const DEFAULT_PAIR_WORD_SIMILARITY = DEFAULT_POETRY_OPTIONS.pairWordCountSimilarityRatio ?? .5;
const DEFAULT_DENSITY_RATIO = DEFAULT_POETRY_OPTIONS.wordDensityComparisonRatio ?? .95;
const DEFAULT_MAX_VERTICAL_GAP_RATIO = DEFAULT_POETRY_OPTIONS.maxVerticalGapRatio ?? 2;
// Matches every non-breaking space (U+00A0).
const NBSP_PATTERN = /\u00A0/g;
// Matches every Arabic tatweel character (U+0640).
const TATWEEL_PATTERN = /\u0640/g;
// Matches each run of non-whitespace characters, i.e. one match per token.
const NON_WHITESPACE_PATTERN = /\S+/g;
// Matches runs of Unicode punctuation (\p{P}), symbols (\p{S}) and whitespace.
const STRIP_PUNCTUATION_SYMBOLS_AND_SPACE_PATTERN = /[\p{P}\p{S}\s]+/gu;
// Matches a non-empty string made only of `\d` digits and Arabic-Indic digits (U+0660-U+0669).
const ARABIC_OR_LATIN_DIGITS_PATTERN = /^[\d\u0660-\u0669]+$/;
/**
* Describes `calculateAverageProseDensity` (defined further below): calculates the average word density (words per pixel) for prose text in the document.
*
* Filters observations to identify likely prose content by excluding centered text,
* very narrow text, and text with too few or too many words. Used as a baseline
* for poetry detection algorithms that rely on comparing word density patterns.
*
* Prose text typically has higher word density than poetry because prose lines
* extend closer to page margins and contain more words per line.
*
* @param observations - Array of text observations to analyze
* @param imageWidth - Total width of the document/image in pixels
* @param options - Configuration options for prose identification
* @param options.centerToleranceRatio - Tolerance for identifying centered text to exclude
* @param options.minMarginRatio - Minimum margin ratio for identifying centered text to exclude
* @param options.minWordCount - Minimum word count threshold for valid prose lines
* @returns Average word density (words per pixel) for prose content, or 0 if no prose found
*/
/**
 * Resolves the options used for centering checks.
 *
 * Fills in `centerToleranceRatio` and `minMarginRatio` from the poetry defaults
 * when the caller left them undefined. Any other keys present in `options`
 * are carried through unchanged by the shallow merge.
 */
const resolveCenteringOptions = (options) => {
	const centeringDefaults = {
		centerToleranceRatio: DEFAULT_CENTER_TOLERANCE,
		minMarginRatio: DEFAULT_MIN_MARGIN
	};
	return resolveWithDefaults(centeringDefaults, options);
};
/**
 * Counts whitespace-separated tokens in a text.
 *
 * Non-breaking spaces are treated as ordinary spaces and tatweel characters
 * are removed before counting. Returns 0 for empty or whitespace-only text.
 */
const getWordCount = (text) => {
	const cleaned = text.replace(NBSP_PATTERN, " ").replace(TATWEEL_PATTERN, "").trim();
	if (cleaned.length === 0) return 0;
	const tokens = cleaned.match(NON_WHITESPACE_PATTERN);
	return tokens === null ? 0 : tokens.length;
};
/**
 * Tells whether a text consists solely of digits once non-breaking spaces,
 * tatweel, punctuation, symbols and whitespace have been removed.
 * Returns false when nothing is left after stripping.
 */
const isNumericOnlyToken = (text) => {
	const withoutTatweel = text.replace(NBSP_PATTERN, " ").replace(TATWEEL_PATTERN, "");
	const stripped = withoutTatweel.replace(STRIP_PUNCTUATION_SYMBOLS_AND_SPACE_PATTERN, "");
	if (stripped.length === 0) return false;
	return ARABIC_OR_LATIN_DIGITS_PATTERN.test(stripped);
};
const hasCompatiblePairWidths = (obs1, obs2, pairWidthSimilarityRatio) => {
const avgWidth = (obs1.bbox.width + obs2.bbox.width) / 2;
return {
avgWidth,
isCompatible: Math.abs(obs1.bbox.width - obs2.bbox.width) / avgWidth < pairWidthSimilarityRatio
};
};
/**
 * Checks that two word counts are similar: their absolute difference divided
 * by the larger count must be strictly below `pairWordCountSimilarityRatio`.
 * When both counts are 0 the ratio is NaN and the result is false.
 */
const hasCompatibleWordCounts = (words1, words2, pairWordCountSimilarityRatio) => {
	const difference = Math.abs(words1 - words2);
	const largerCount = Math.max(words1, words2);
	return difference / largerCount < pairWordCountSimilarityRatio;
};
const hasCompatibleVerticalGap = (obs1, obs2, maxVerticalGapRatio) => {
const centerY1 = obs1.bbox.y + obs1.bbox.height / 2;
const centerY2 = obs2.bbox.y + obs2.bbox.height / 2;
return Math.abs(centerY1 - centerY2) <= maxVerticalGapRatio * ((obs1.bbox.height + obs2.bbox.height) / 2);
};
/**
 * Orders two observations by x-coordinate and measures the horizontal gap
 * between the left one's far edge (`x + width`) and the right one's `x`.
 * When both share the same x, `obs2` is treated as the left observation.
 */
const getOrderedPairObservations = (obs1, obs2) => {
	const [leftObs, rightObs] = obs1.bbox.x < obs2.bbox.x ? [obs1, obs2] : [obs2, obs1];
	const leftFarEdge = leftObs.bbox.x + leftObs.bbox.width;
	return {
		gap: rightObs.bbox.x - leftFarEdge,
		leftObs,
		rightObs
	};
};
const hasAsymmetricSparseGap = (leftObs, rightObs, gap, avgWidth, imageWidth) => {
const pageCenter = imageWidth / 2;
const innerLeft = leftObs.bbox.x + leftObs.bbox.width;
const innerRight = rightObs.bbox.x;
const leftDelta = Math.abs(pageCenter - innerLeft);
const rightDelta = Math.abs(innerRight - pageCenter);
const asymmetry = Math.abs(leftDelta - rightDelta);
return gap > avgWidth * 2 && asymmetry > imageWidth * .12;
};
/**
 * Resolves the centering options used when validating a pair of observations.
 *
 * Without a significant gap the regular centering options are returned. With
 * one, the check is relaxed: the center tolerance is multiplied by 2.5 and the
 * minimum margin by 0.75, each based on the caller's value or its default.
 */
const resolvePairCenteringOptions = (hasSignificantGap, options) => {
	const baseOptions = resolveCenteringOptions(options);
	if (!hasSignificantGap) return baseOptions;
	const tolerance = options.centerToleranceRatio ?? DEFAULT_CENTER_TOLERANCE;
	const margin = options.minMarginRatio ?? DEFAULT_MIN_MARGIN;
	return {
		...baseOptions,
		centerToleranceRatio: tolerance * 2.5,
		minMarginRatio: margin * .75
	};
};
/**
 * Computes the smallest bounding box that encloses the bounding boxes of
 * both observations.
 */
const toCombinedBbox = (obs1, obs2) => {
	const first = obs1.bbox;
	const second = obs2.bbox;
	const left = Math.min(first.x, second.x);
	const top = Math.min(first.y, second.y);
	const right = Math.max(first.x + first.width, second.x + second.width);
	const bottom = Math.max(first.y + first.height, second.y + second.height);
	return {
		height: bottom - top,
		width: right - left,
		x: left,
		y: top
	};
};
const hasPoetryLikeDensity = (obs, wordCount, imageWidth, avgProseWordDensity, minWidthRatioForMerged, wordDensityComparisonRatio) => {
if (obs.bbox.width <= imageWidth * minWidthRatioForMerged || !Number.isFinite(avgProseWordDensity) || avgProseWordDensity <= 0) return false;
const obsDensity = wordCount / obs.bbox.width;
if (obsDensity <= 0) return false;
return obsDensity / avgProseWordDensity < (obs.bbox.width / imageWidth > .75 ? wordDensityComparisonRatio * .95 : .5);
};
/**
 * Calculates the average word density (words per pixel) of prose-like lines.
 *
 * A line counts as prose when it is not centered, is wider than 40% of the
 * image, and its word count lies between `options.minWordCount` (or the
 * default) and `MAX_PROSE_WORD_COUNT` inclusive. The density is the total word
 * count of those lines divided by their total width.
 *
 * @param observations - Array of text observations to analyze
 * @param imageWidth - Total width of the document/image in pixels
 * @param options - Poetry detection options (centering thresholds and `minWordCount`)
 * @returns The average prose density, or 0 when no line qualifies or the result is not a positive finite number
 */
const calculateAverageProseDensity = (observations, imageWidth, options = DEFAULT_POETRY_OPTIONS) => {
	const centeringOptions = resolveCenteringOptions(options);
	const minWordCount = options.minWordCount ?? DEFAULT_MIN_WORD_COUNT;
	const totals = { width: 0, words: 0 };
	for (const obs of observations) {
		const wordCount = getWordCount(obs.text);
		// Guards use negated comparisons so NaN values are rejected exactly as before.
		if (isObservationCentered(obs.bbox, imageWidth, centeringOptions)) continue;
		if (!(obs.bbox.width > imageWidth * .4)) continue;
		if (!(wordCount >= minWordCount)) continue;
		if (!(wordCount <= MAX_PROSE_WORD_COUNT)) continue;
		totals.words += wordCount;
		totals.width += obs.bbox.width;
	}
	if (totals.words <= 0 || totals.width <= 0) return 0;
	const density = totals.words / totals.width;
	if (!Number.isFinite(density)) return 0;
	return density > 0 ? density : 0;
};
/**
 * Validates whether two observations form a poetry pair (two hemistichs of one line).
 *
 * The checks run in this order and the first failure rejects the pair:
 * - both texts have at least `minWordCount` words
 * - the widths are similar (`pairWidthSimilarityRatio`)
 * - the word counts are similar (`pairWordCountSimilarityRatio`)
 * - the vertical centers are close (`maxVerticalGapRatio`)
 * - the left observation is not a purely numeric token
 * - a significant gap between the two is not wide and lopsided around the page center
 * - the combined bounding box is centered on the page (with relaxed thresholds
 *   when the gap is significant)
 *
 * A gap is significant when it exceeds 7% of the image width or 15% of the
 * pair's average width.
 *
 * @param obs1 - First observation (potential first hemistich)
 * @param obs2 - Second observation (potential second hemistich)
 * @param imageWidth - Total width of the document/image in pixels
 * @param options - Poetry detection configuration options
 * @returns True if the observations form a valid poetry pair
 */
const isPoetryPair = (obs1, obs2, imageWidth, options = DEFAULT_POETRY_OPTIONS) => {
	const minWordCount = options.minWordCount ?? DEFAULT_MIN_WORD_COUNT;
	const maxVerticalGapRatio = options.maxVerticalGapRatio ?? DEFAULT_MAX_VERTICAL_GAP_RATIO;
	const pairWidthSimilarityRatio = options.pairWidthSimilarityRatio ?? DEFAULT_PAIR_WIDTH_SIMILARITY;
	const pairWordCountSimilarityRatio = options.pairWordCountSimilarityRatio ?? DEFAULT_PAIR_WORD_SIMILARITY;
	const [words1, words2] = [getWordCount(obs1.text), getWordCount(obs2.text)];
	if (words1 < minWordCount || words2 < minWordCount) return false;
	const widths = hasCompatiblePairWidths(obs1, obs2, pairWidthSimilarityRatio);
	// `&&` short-circuits, so later checks only run when earlier ones pass.
	const isBalanced = widths.isCompatible
		&& hasCompatibleWordCounts(words1, words2, pairWordCountSimilarityRatio)
		&& hasCompatibleVerticalGap(obs1, obs2, maxVerticalGapRatio);
	if (!isBalanced) return false;
	const ordered = getOrderedPairObservations(obs1, obs2);
	if (isNumericOnlyToken(ordered.leftObs.text)) return false;
	const hasSignificantGap = ordered.gap > imageWidth * .07 || ordered.gap > widths.avgWidth * .15;
	if (hasSignificantGap && hasAsymmetricSparseGap(ordered.leftObs, ordered.rightObs, ordered.gap, widths.avgWidth, imageWidth)) return false;
	const centeringOptions = resolvePairCenteringOptions(hasSignificantGap, options);
	const combinedBbox = toCombinedBbox(obs1, obs2);
	return isObservationCentered(combinedBbox, imageWidth, centeringOptions);
};
/**
 * Determines whether a single observation is a wide poetic line.
 *
 * Some poetry appears as one wide line rather than two hemistichs. Such a line
 * is accepted only when all of the following hold, checked in this order:
 * - it has at least `minWordCount` words
 * - its text contains no prose punctuation (`PROSE_PUNCTUATION_PATTERN`)
 * - it is centered on the page
 * - it is wide enough and its word density is low compared with prose
 *   (see `hasPoetryLikeDensity`)
 *
 * @param obs - The observation to analyze
 * @param imageWidth - Total width of the document/image in pixels
 * @param avgProseWordDensity - Average word density of prose content for comparison
 * @param options - Poetry detection configuration options
 * @returns True if the observation represents a wide poetic line
 */
const isWidePoeticLine = (obs, imageWidth, avgProseWordDensity, options = DEFAULT_POETRY_OPTIONS) => {
	const minWordCount = options.minWordCount ?? DEFAULT_MIN_WORD_COUNT;
	const wordCount = getWordCount(obs.text);
	if (wordCount < minWordCount) return false;
	const hasProsePunctuation = PROSE_PUNCTUATION_PATTERN.test(obs.text);
	if (hasProsePunctuation) return false;
	const isCentered = isObservationCentered(obs.bbox, imageWidth, resolveCenteringOptions(options));
	if (!isCentered) return false;
	const minWidthRatioForMerged = options.minWidthRatioForMerged ?? DEFAULT_MIN_WIDTH_RATIO_FOR_MERGED;
	const wordDensityComparisonRatio = options.wordDensityComparisonRatio ?? DEFAULT_DENSITY_RATIO;
	return hasPoetryLikeDensity(obs, wordCount, imageWidth, avgProseWordDensity, minWidthRatioForMerged, wordDensityComparisonRatio);
};
/**
 * Determines if a group of observations represents poetic content.
 *
 * Two layouts are recognised:
 * 1. Single wide lines: one observation, validated by `isWidePoeticLine`
 * 2. Hemistich pairs: two observations, validated by `isPoetryPair`
 *
 * Groups of any other size are never considered poetic.
 *
 * Single-line detection can be switched off by passing
 * `minWidthRatioForMerged: null`; when the option is left undefined the
 * default ratio is used. (Previously `??` replaced an explicit `null` with the
 * default, which made the `!== null` guard unreachable.)
 *
 * @param group - Array of observations to analyze (typically 1-2 items for poetry)
 * @param imageWidth - Total width of the document/image in pixels
 * @param avgProseWordDensity - Average word density of prose content for comparison
 * @param options - Poetry detection configuration options
 * @returns True if the group represents poetic content
 */
const isPoeticGroup = (group, imageWidth, avgProseWordDensity, options = DEFAULT_POETRY_OPTIONS) => {
	if (group.length === 2) return isPoetryPair(group[0], group[1], imageWidth, options);
	if (group.length !== 1) return false;
	const configuredRatio = options.minWidthRatioForMerged;
	// An explicit null disables wide-line detection; only undefined falls back to the default.
	if (configuredRatio === null) return false;
	const minWidthRatioForMerged = configuredRatio ?? DEFAULT_MIN_WIDTH_RATIO_FOR_MERGED;
	return isWidePoeticLine(group[0], imageWidth, avgProseWordDensity, {
		...options,
		minWidthRatioForMerged
	});
};
//#endregion
//#region src/utils/paragraphs.ts
// Default paragraph-grouping options.
// NOTE(review): presumably these supply the `verticalJumpFactor` and
// `widthTolerance` arguments of `indexItemsAsParagraphs`; confirm against the caller.
const DEFAULT_PARAGRAPH_OPTIONS = {
verticalJumpFactor: 2,
widthTolerance: .85
};
/**
 * Tells whether a group is a poetry pair: exactly two items, each with a
 * truthy `isPoetic` flag.
 */
const isPoetryPairGroup = (group) => {
	if (group.length !== 2) return false;
	return group.every((item) => item.isPoetic);
};
/**
* Preprocesses observations by filtering noise, flipping coordinates for RTL text,
* and normalizing x-coordinates for proper alignment.
*
* @param observations - Array of text observations to preprocess
* @param imageWidth - Total width of the document/image in pixels
* @param dpiX - Horizontal DPI for coordinate normalization
* @param options - Optional logging configuration
* @returns Preprocessed observations ready for line grouping
*/
const flipAndAlignObservations = (observations, imageWidth, dpiX, options = {}) => {
observations = observations.filter(filterNoisy