UNPKG

kokokor

Version:

A lightweight TypeScript library designed to reconstruct paragraphs from OCRed inputs.

3 lines 6.45 kB
var m=(t,e,n)=>{let o=e/2,i=e*n.centerToleranceRatio,r=t.x+t.width/2,x=Math.abs(r-o)<=i,c=t.x,s=e-(t.x+t.width),a=e*n.minMarginRatio,b=c>=a&&s>=a;return x&&b},O=(t,e,n=5)=>e.filter(o=>!t.some(i=>h(o,i,n))),y=(t,e,n=5)=>(t.length>0&&e.length>0&&(e=O(t,e,n)),e=e.filter(o=>o.y>n),e.at(-1)?.y),h=(t,e,n)=>{let o=e.x-n,i=e.x+e.width+n,r=e.y-n,x=e.y+e.height+n,c=t.x,s=t.x+t.width,a=t.y,b=t.y+t.height;return c>=o&&s<=i&&a>=r&&b<=x},W=t=>{let[e,n,o,i]=t;return{height:i-n,width:o-e,x:e,y:n}},B=t=>{let e=t.length;if(e<3)return{minIntraLineGap:0,typicalGap:0};let n=new Array(e-1);for(let s=1;s<e;s++)n[s-1]=t[s].bbox.y-t[s-1].bbox.y;n.sort((s,a)=>s-a);let o=Math.floor(n.length*.5),i=Math.floor(n.length*.75),r=n[o],x=n[i];return{minIntraLineGap:Math.min(r*.6,x*.4),typicalGap:x}},P=(t,e)=>{if(t.length===0)return .3;let n=0;for(let r of t)n+=r;let o=n/t.length,i=e/o;return i<.8?.15:i<1.2?.25:.4};var p={centerToleranceRatio:.05,maxVerticalGapRatio:2,minMarginRatio:.1,minWidthRatioForMerged:.6,minWordCount:2,pairWidthSimilarityRatio:.4,pairWordCountSimilarityRatio:.5,wordDensityComparisonRatio:.95},R={centerToleranceRatio:.05,horizontalLines:[],minMarginRatio:.2,pixelTolerance:5,poetryDetectionOptions:p,rectangles:[]},L=25,w=/[،,؛;؟?۔.:()]/;var U=(t,e,n,o,i)=>{let r=e.bbox.y-t.bbox.y,x=(t.bbox.height+e.bbox.height)*.5,s=x*n+o,a=r>s;if(!a&&i.minIntraLineGap>0&&r>i.minIntraLineGap){let b=Math.min(x*.2,i.minIntraLineGap);a=r>b}return a},X=(t,e,n,o)=>{let i=t.length,r=new Array(i),x=0,c=t[0];r[0]={...c,index:x};for(let s=1;s<i;s++){let a=t[s];U(c,a,e,n,o)&&(x+=1),r[s]={...a,index:x},c=a}return r},M=(t,e,n,o)=>{let i=t.toSorted((a,b)=>a.bbox.y-b.bbox.y),r=n*(e/72),x=o?{minIntraLineGap:0,typicalGap:0}:B(i),c=o||P(t.map(a=>a.bbox.height),x.typicalGap);return X(i,c,r,x).toSorted((a,b)=>a.index!==b.index?a.index-b.index:a.bbox.y-b.bbox.y)},z=(t,e)=>{let n=t.width/(e.width/72),o=t.height/(e.height/72);return{x:n,y:o}},v=(t,e,n)=>{if(t.length===0)return[];let i=Math.max(...t.map(c=>c.bbox.width))*n,r=[],x=0;for(let c=0;c<t.length;c++){let s=t[c];if(c>1){let a=t[c-1],b=t[c-2];if(a.bbox.width>=i&&b.bbox.width>=i){let l=s.bbox.y-a.bbox.y,u=a.bbox.y-b.bbox.y;(u>0&&l>u*e||u===0&&l>0&&l>s.bbox.height*.5*e)&&x++}}else if(c===1){let a=t[c-1];a.bbox.width>=i&&s.bbox.y-a.bbox.y>a.bbox.height*e&&x++}r.push({...s,index:x}),s.bbox.width<i&&x++}return r.sort((c,s)=>c.index!==s.index?c.index-s.index:c.bbox.y-s.bbox.y)};var f=t=>{let e=[];for(let{index:n,...o}of t)e[n]||(e[n]=[]),e[n].push(o);return e},D=t=>{let e=t.slice();for(let n=0;n<e.length;n++){let o=e[n];e[n]=o.toSorted((i,r)=>i.bbox.x-r.bbox.x)}return e},g=t=>{let e=[];for(let n of t){if(n.length===1){e.push(n[0]);continue}let o=n[0].bbox.x,i=n[0].bbox.y,r=n[0].bbox.x+n[0].bbox.width,x=n[0].bbox.y+n[0].bbox.height,c=n[0].text;for(let s=1;s<n.length;s++){let{bbox:a,text:b}=n[s];o=Math.min(o,a.x),i=Math.min(i,a.y),r=Math.max(r,a.x+a.width),x=Math.max(x,a.y+a.height),c+=" "+b}e.push({...n[0],bbox:{height:x-i,width:r-o,x:o,y:i},text:c})}return e};var S=(t,e)=>t.map(n=>({...n,bbox:{...n.bbox,x:e-n.bbox.x-n.bbox.width}})),C=t=>t.text?.replace(/[،,؛;؟?۔.:\-()]/g,"").length>1,I=(t,e,n=300)=>{let o=n/e*5,i=Math.min(...t.map(r=>r.bbox.x));return t.map(r=>Math.abs(r.bbox.x-i)<=o?{...r,bbox:{...r.bbox,x:i}}:r)},G=(t,e=!1)=>t.map(n=>({bbox:{height:Math.trunc(n.bbox.height),width:Math.trunc(n.bbox.width),x:Math.trunc(n.bbox.x),y:Math.trunc(n.bbox.y)},text:e?n.text.split(" ").filter(o=>o.length>1).slice(0,1).join(" "):n.text}));var _=(t,e,n=p)=>{let o=0,i=0;for(let r of t){let x=r.text.split(" ").length;!m(r.bbox,e,n)&&r.bbox.width>e*.4&&x>=n.minWordCount&&x<=L&&(o+=x,i+=r.bbox.width)}return o>0&&i>0?o/i:0},Y=(t,e,n,o=p)=>{let i=t.text.split(" ").length,r=e.text.split(" ").length;if(i<o.minWordCount||r<o.minWordCount)return!1;let x=(t.bbox.width+e.bbox.width)/2;if(Math.abs(t.bbox.width-e.bbox.width)/x>=o.pairWidthSimilarityRatio)return!1;let s=Math.max(i,r);if(Math.abs(i-r)/s>=o.pairWordCountSimilarityRatio)return!1;let b=t.bbox.x<e.bbox.x?t:e,u=(t.bbox.x<e.bbox.x?e:t).bbox.x-(b.bbox.x+b.bbox.width),N=u>n*.07||u>x*.15?{...o,centerToleranceRatio:o.centerToleranceRatio*2.5,minMarginRatio:o.minMarginRatio*.75}:o,T=Math.min(t.bbox.x,e.bbox.x),H=Math.max(t.bbox.x+t.bbox.width,e.bbox.x+e.bbox.width),F={height:Math.max(t.bbox.y+t.bbox.height,e.bbox.y+e.bbox.height)-Math.min(t.bbox.y,e.bbox.y),width:H-T,x:T,y:Math.min(t.bbox.y,e.bbox.y)};return m(F,n,N)},V=(t,e,n,o=p)=>{let i=t.text.split(" ").length;if(i<o.minWordCount||w.test(t.text)||!m(t.bbox,e,o))return!1;if(t.bbox.width>e*o.minWidthRatioForMerged){if(n<=0)return!1;let r=i/t.bbox.width;if(r>0){let x=r/n,s=t.bbox.width/e>.75?o.wordDensityComparisonRatio*.95:.5;if(x<s)return!0}}return!1},A=(t,e,n,o=p)=>t.length===1&&o.minWidthRatioForMerged!==null?V(t[0],e,n,o):t.length===2?Y(t[0],t[1],e,o):!1;var E=(t,e,n,o={})=>(t=t.filter(C),t.length===0?[]:(o.log&&o.log("mapOcrResultToRTLObservations",t,e),t=S(t,e),o.log&&o.log("normalizeObservationsX",t,n),I(t,n))),j=(t,e,n)=>{let o={...R,...n,poetryDetectionOptions:{...p,...n?.poetryDetectionOptions||{}}};if(t=E(t,e.width,e.x,o),t.length===0)return[];o.log&&o.log("indexObservationsAsLines",t,e.y,o.pixelTolerance,o.lineHeightFactor);let i=y(o.rectangles||[],o.horizontalLines||[],o.pixelTolerance),r=_(t,e.width,o.poetryDetectionOptions),x=M(t,e.y,o.pixelTolerance,o.lineHeightFactor).map(s=>{let a={...s};return o.rectangles?.some(l=>h(s.bbox,l,o.pixelTolerance))&&(a.isHeading=!0),m(s.bbox,e.width,o)&&(a.isCentered=!0),i!==void 0&&s.bbox.y>i&&(a.isFootnote=!0),a}),c=f(x);o.log&&o.log("sortGroupsHorizontally",c),c=D(c),o.log&&o.log("isPoeticGroup",c.map(s=>G(s)),r,o.poetryDetectionOptions);for(let s of c)if(A(s,e.width,r,o.poetryDetectionOptions))for(let a of s)a.isPoetic=!0;return g(c)},k=(t,e,n)=>{let o=[],i=[],r=()=>{let x=v(i,e,n),c=f(x);o.push(...g(c))};for(let x of t)x.isPoetic?(i.length>0&&(r(),i.length=0),o.push(x)):i.push(x);return i.length>0&&r(),o},q=(t,e=2,n=.85)=>{let o=k(t.filter(r=>!r.isFootnote),e,n),i=k(t.filter(r=>r.isFootnote),e,n);return o.concat(i)};var ut=(t,e)=>{let n=!1;return t.flatMap(i=>e&&i.isFootnote&&!n?(n=!0,[e,i.text]):i.isHeading?[i.text,""]:[i.text]).join(` `)};export{z as calculateDPI,O as filterHorizontalLinesOutsideRectangles,E as flipAndAlignObservations,ut as formatTextBlocks,W as mapMatrixToBoundingBox,j as mapObservationsToTextLines,q as mapTextLinesToParagraphs}; //# sourceMappingURL=index.js.map