baburchi
Version:
A lightweight TypeScript library designed to fix typos in OCR post-processing.
3 lines • 9.84 kB
JavaScript
/** Replacement text inserted by `standardizeIntahaSymbol` (alif + ha + tatweel). */
const INTAHA_ACTUAL = "\u0627\u0647\u0640";

/**
 * Shared regular expressions.
 *
 * Several of these carry the `g` flag. `String.prototype.match` / `replace`
 * reset `lastIndex` themselves, but `RegExp.prototype.test` does not, so any
 * `.test()` call on a global pattern must reset `lastIndex` explicitly.
 */
const PATTERNS = {
  arabicCharacters: /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]/,
  arabicDigits: /[0-9\u0660-\u0669]+/,
  arabicFootnoteReferenceRegex: /^\([\u0660-\u0669]+\)/g,
  arabicLettersAndDigits: /[0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669]+/g,
  arabicPunctuationAndWhitespace: /[\s\u060C\u061B\u061F\u06D4]+/,
  arabicReferenceRegex: /\([\u0660-\u0669]+\)/g,
  diacritics: /[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]/g,
  footnoteEmbedded: /\([0-9\u0660-\u0669]+\)/,
  footnoteStandalone: /^\(?[0-9\u0660-\u0669]+\)?[،.]?$/,
  invalidReferenceRegex: /\(\)|\([.1OV9]+\)/g,
  ocrConfusedFootnoteReferenceRegex: /^\([.1OV9]+\)/g,
  ocrConfusedReferenceRegex: /\([.1OV9]+\)/g,
  tatweel: /\u0640/g,
  whitespace: /\s+/,
};

/**
 * Removes tatweel and diacritics from the text, then trims it.
 * @param {string} text
 * @returns {string}
 */
const normalizeArabicText = (text) =>
  text.replace(PATTERNS.tatweel, "").replace(PATTERNS.diacritics, "").trim();

/**
 * Returns the first run of Western or Arabic-Indic digits in the text.
 * @param {string} text
 * @returns {string} The digit run, or "" when there is none.
 */
const extractDigits = (text) => {
  const found = text.match(PATTERNS.arabicDigits);
  return found ? found[0] : "";
};

/**
 * Splits text into whitespace-separated tokens, first padding every
 * occurrence of each preserved symbol with spaces so it becomes its own token.
 *
 * Symbols are matched literally: regex metacharacters such as "(" or "." and
 * "$" sequences in a symbol have no special meaning.
 *
 * @param {string} text
 * @param {string[]} [preserveSymbols=[]]
 * @returns {string[]}
 */
const tokenizeText = (text, preserveSymbols = []) => {
  let padded = text;
  for (const symbol of preserveSymbols) {
    // Function replacement keeps "$" in the symbol from being interpreted.
    padded = padded.replaceAll(symbol, () => ` ${symbol} `);
  }
  return padded.trim().split(PATTERNS.whitespace).filter(Boolean);
};

/**
 * Detects a standalone footnote marker and a token with the same marker
 * embedded, appearing next to each other.
 *
 * When `previous` is standalone and `current` embeds the same digits, the last
 * entry of `result` is overwritten with `current`. When the roles are swapped
 * nothing is written. Either way the caller is told to skip `current`.
 *
 * @param {string[]} result Accumulated tokens; may be mutated.
 * @param {string} previous
 * @param {string} current
 * @returns {boolean} true when the pair was recognised as a fusion.
 */
const handleFootnoteFusion = (result, previous, current) => {
  const previousStandalone = PATTERNS.footnoteStandalone.test(previous);
  const currentEmbedded = PATTERNS.footnoteEmbedded.test(current);
  const currentStandalone = PATTERNS.footnoteStandalone.test(current);
  const previousEmbedded = PATTERNS.footnoteEmbedded.test(previous);
  const sameDigits = extractDigits(previous) === extractDigits(current);

  if (previousStandalone && currentEmbedded && sameDigits) {
    result[result.length - 1] = current;
    return true;
  }
  return Boolean(previousEmbedded && currentStandalone && sameDigits);
};

/**
 * Chooses between two tokens based on which contains an embedded footnote
 * reference such as "(5)".
 * @param {string} tokenA
 * @param {string} tokenB
 * @returns {string[]|null} The chosen token in an array, or null when neither
 *   token contains an embedded reference. When both do, the shorter wins
 *   (tokenA on ties).
 */
const handleFootnoteSelection = (tokenA, tokenB) => {
  const aEmbedded = PATTERNS.footnoteEmbedded.test(tokenA);
  const bEmbedded = PATTERNS.footnoteEmbedded.test(tokenB);

  if (aEmbedded && !bEmbedded) return [tokenA];
  if (bEmbedded && !aEmbedded) return [tokenB];
  if (aEmbedded && bEmbedded) return [tokenA.length <= tokenB.length ? tokenA : tokenB];
  return null;
};

/**
 * Orders two tokens when at least one is a standalone footnote marker.
 * @param {string} tokenA
 * @param {string} tokenB
 * @returns {string[]|null} Marker first then the other token when exactly one
 *   is standalone; the shorter token when both are; null when neither is.
 */
const handleStandaloneFootnotes = (tokenA, tokenB) => {
  const aStandalone = PATTERNS.footnoteStandalone.test(tokenA);
  const bStandalone = PATTERNS.footnoteStandalone.test(tokenB);

  if (aStandalone && !bStandalone) return [tokenA, tokenB];
  if (bStandalone && !aStandalone) return [tokenB, tokenA];
  if (aStandalone && bStandalone) return [tokenA.length <= tokenB.length ? tokenA : tokenB];
  return null;
};

/**
 * Rewrites a lone "ه" that follows a digit (and is not followed by an Arabic
 * letter or digit) as " هـ".
 * @param {string} text
 * @returns {string}
 */
const standardizeHijriSymbol = (text) =>
  text.replace(
    /([0-9\u0660-\u0669])\s*ه(?=\s|$|[^\u0621-\u063A\u0641-\u064A\u0660-\u0669])/gu,
    "$1 \u0647\u0640",
  );

/**
 * Rewrites a free-standing "اه" (not adjacent to other characters in
 * U+0600–U+06FF) as INTAHA_ACTUAL.
 * @param {string} text
 * @returns {string}
 */
const standardizeIntahaSymbol = (text) =>
  text.replace(/(^|\s|[^\u0600-\u06FF])اه(?=\s|$|[^\u0600-\u06FF])/g, `$1${INTAHA_ACTUAL}`);

/** Scores used when filling the token alignment matrix. */
const ALIGNMENT_SCORES = {
  GAP_PENALTY: -1,
  MISMATCH_PENALTY: -2,
  PERFECT_MATCH: 2,
  SOFT_MATCH: 1,
};

/**
 * Levenshtein edit distance, computed row by row over the shorter input.
 * @param {string} textA
 * @param {string} textB
 * @returns {number}
 */
const calculateLevenshteinDistance = (textA, textB) => {
  const lengthA = textA.length;
  const lengthB = textB.length;
  if (lengthA === 0) return lengthB;
  if (lengthB === 0) return lengthA;

  const [shorter, longer] = lengthA <= lengthB ? [textA, textB] : [textB, textA];
  const shortLength = shorter.length;
  const longLength = longer.length;

  let previousRow = Array.from({ length: shortLength + 1 }, (_, index) => index);
  for (let row = 1; row <= longLength; row++) {
    const currentRow = [row];
    for (let col = 1; col <= shortLength; col++) {
      const substitutionCost = longer[row - 1] === shorter[col - 1] ? 0 : 1;
      currentRow.push(
        Math.min(
          previousRow[col] + 1,
          currentRow[col - 1] + 1,
          previousRow[col - 1] + substitutionCost,
        ),
      );
    }
    previousRow = currentRow;
  }
  return previousRow[shortLength];
};

/**
 * Similarity ratio: (maxLength - editDistance) / maxLength.
 * Two empty strings give 1.
 * @param {string} textA
 * @param {string} textB
 * @returns {number}
 */
const calculateSimilarity = (textA, textB) => {
  const maxLength = Math.max(textA.length, textB.length) || 1;
  const distance = calculateLevenshteinDistance(textA, textB);
  return (maxLength - distance) / maxLength;
};

/**
 * Whether two texts reach the similarity threshold after normalization.
 * @param {string} textA
 * @param {string} textB
 * @param {number} [threshold=0.6]
 * @returns {boolean}
 */
const areSimilarAfterNormalization = (textA, textB, threshold = 0.6) =>
  calculateSimilarity(normalizeArabicText(textA), normalizeArabicText(textB)) >= threshold;

/**
 * Alignment score for a token pair.
 * @param {string} tokenA
 * @param {string} tokenB
 * @param {string[]} typoSymbols
 * @param {number} similarityThreshold
 * @returns {number} PERFECT_MATCH when the normalized tokens are equal,
 *   SOFT_MATCH when either raw token is a typo symbol or the normalized tokens
 *   reach the threshold, otherwise MISMATCH_PENALTY.
 */
const calculateAlignmentScore = (tokenA, tokenB, typoSymbols, similarityThreshold) => {
  const normalizedA = normalizeArabicText(tokenA);
  const normalizedB = normalizeArabicText(tokenB);
  if (normalizedA === normalizedB) return ALIGNMENT_SCORES.PERFECT_MATCH;

  const involvesTypoSymbol = typoSymbols.includes(tokenA) || typoSymbols.includes(tokenB);
  const similarEnough = calculateSimilarity(normalizedA, normalizedB) >= similarityThreshold;
  return involvesTypoSymbol || similarEnough
    ? ALIGNMENT_SCORES.SOFT_MATCH
    : ALIGNMENT_SCORES.MISMATCH_PENALTY;
};

/**
 * Walks the alignment matrix from the bottom-right cell back to the origin.
 * @param {{direction: string|null, score: number}[][]} matrix
 * @param {string[]} tokensA
 * @param {string[]} tokensB
 * @returns {Array<[string|null, string|null]>} Aligned pairs in reading order;
 *   null marks a gap.
 * @throws {Error} When a visited cell has an unknown direction.
 */
const backtrackAlignment = (matrix, tokensA, tokensB) => {
  const pairs = [];
  let row = tokensA.length;
  let col = tokensB.length;

  while (row > 0 || col > 0) {
    switch (matrix[row][col].direction) {
      case "diagonal":
        pairs.push([tokensA[--row], tokensB[--col]]);
        break;
      case "left":
        pairs.push([null, tokensB[--col]]);
        break;
      case "up":
        pairs.push([tokensA[--row], null]);
        break;
      default:
        throw new Error("Invalid alignment direction");
    }
  }
  return pairs.reverse();
};

/**
 * Aligns two token sequences with a dynamic-programming score matrix.
 * Ties prefer "diagonal", then "up", then "left".
 * @param {string[]} tokensA
 * @param {string[]} tokensB
 * @param {string[]} typoSymbols
 * @param {number} similarityThreshold
 * @returns {Array<[string|null, string|null]>}
 */
const alignTokenSequences = (tokensA, tokensB, typoSymbols, similarityThreshold) => {
  const rows = tokensA.length;
  const cols = tokensB.length;
  const matrix = Array.from({ length: rows + 1 }, () =>
    Array.from({ length: cols + 1 }, () => ({ direction: null, score: 0 })),
  );

  for (let row = 1; row <= rows; row++) {
    matrix[row][0] = { direction: "up", score: row * ALIGNMENT_SCORES.GAP_PENALTY };
  }
  for (let col = 1; col <= cols; col++) {
    matrix[0][col] = { direction: "left", score: col * ALIGNMENT_SCORES.GAP_PENALTY };
  }

  for (let row = 1; row <= rows; row++) {
    for (let col = 1; col <= cols; col++) {
      const pairScore = calculateAlignmentScore(
        tokensA[row - 1],
        tokensB[col - 1],
        typoSymbols,
        similarityThreshold,
      );
      const diagonalScore = matrix[row - 1][col - 1].score + pairScore;
      const upScore = matrix[row - 1][col].score + ALIGNMENT_SCORES.GAP_PENALTY;
      const leftScore = matrix[row][col - 1].score + ALIGNMENT_SCORES.GAP_PENALTY;
      const best = Math.max(diagonalScore, upScore, leftScore);

      let direction = "left";
      if (best === diagonalScore) {
        direction = "diagonal";
      } else if (best === upScore) {
        direction = "up";
      }
      matrix[row][col] = { direction, score: best };
    }
  }
  return backtrackAlignment(matrix, tokensA, tokensB);
};

/**
 * Joins two segments in whichever order is more similar to the target
 * (first-then-second wins ties).
 */
const mergeSegmentsInBestOrder = (target, first, second) => {
  const forward = `${first} ${second}`;
  const backward = `${second} ${first}`;
  const normalizedTarget = normalizeArabicText(target);
  const forwardSimilarity = calculateSimilarity(normalizedTarget, normalizeArabicText(forward));
  const backwardSimilarity = calculateSimilarity(normalizedTarget, normalizeArabicText(backward));
  return forwardSimilarity >= backwardSimilarity ? forward : backward;
};

/**
 * Matches one target line against the segment at `index`, merging it with the
 * following segment when the single segment is not similar enough.
 */
const matchTargetLine = (target, segments, index) => {
  const current = segments[index];
  if (areSimilarAfterNormalization(target, current)) {
    return { result: current, segmentsConsumed: 1 };
  }

  const next = segments[index + 1];
  if (!current || !next) {
    return current
      ? { result: current, segmentsConsumed: 1 }
      : { result: "", segmentsConsumed: 0 };
  }
  return { result: mergeSegmentsInBestOrder(target, current, next), segmentsConsumed: 2 };
};

/**
 * Lines segments up against target lines. A falsy target line passes the
 * current segment through unchanged; segments left over once the target lines
 * are exhausted are appended as-is.
 * @param {string[]} targetLines
 * @param {string[]} segments
 * @returns {string[]}
 */
const alignTextSegments = (targetLines, segments) => {
  const aligned = [];
  let cursor = 0;

  for (const target of targetLines) {
    if (cursor >= segments.length) break;
    if (target) {
      const { result, segmentsConsumed } = matchTargetLine(target, segments, cursor);
      if (result) aligned.push(result);
      cursor += segmentsConsumed;
    } else {
      aligned.push(segments[cursor]);
      cursor++;
    }
  }

  if (cursor < segments.length) aligned.push(...segments.slice(cursor));
  return aligned;
};

/**
 * Counts straight double quotes; an odd count is reported as one "unmatched"
 * error at the position of the last quote.
 */
const checkQuotes = (text) => {
  const errors = [];
  let quoteCount = 0;
  let lastQuoteIndex = -1;

  for (let index = 0; index < text.length; index++) {
    if (text[index] === '"') {
      quoteCount++;
      lastQuoteIndex = index;
    }
  }

  const hasEvenCount = quoteCount % 2 === 0;
  if (!hasEvenCount && lastQuoteIndex !== -1) {
    errors.push({ char: '"', index: lastQuoteIndex, reason: "unmatched", type: "quote" });
  }
  return { errors, isBalanced: hasEvenCount };
};

/** Opening bracket -> its closing counterpart. */
const BRACKETS = { "\xAB": "\xBB", "(": ")", "[": "]", "{": "}" };
const OPEN_BRACKETS = new Set(["\xAB", "(", "[", "{"]);
const CLOSE_BRACKETS = new Set(["\xBB", ")", "]", "}"]);

/**
 * Stack-based bracket check. Reports "mismatched" (both brackets of a wrong
 * pair), "unmatched" (closer with nothing open) and "unclosed" errors.
 */
const checkBrackets = (text) => {
  const errors = [];
  const openStack = [];

  for (let index = 0; index < text.length; index++) {
    const char = text[index];
    if (OPEN_BRACKETS.has(char)) {
      openStack.push({ char, index });
      continue;
    }
    if (!CLOSE_BRACKETS.has(char)) continue;

    const opener = openStack.pop();
    if (!opener) {
      errors.push({ char, index, reason: "unmatched", type: "bracket" });
    } else if (BRACKETS[opener.char] !== char) {
      errors.push({ char: opener.char, index: opener.index, reason: "mismatched", type: "bracket" });
      errors.push({ char, index, reason: "mismatched", type: "bracket" });
    }
  }

  for (const { char, index } of openStack) {
    errors.push({ char, index, reason: "unclosed", type: "bracket" });
  }
  return { errors, isBalanced: errors.length === 0 };
};

/**
 * Runs the quote and bracket checks and merges their errors, sorted by index.
 * @param {string} text
 * @returns {{errors: object[], isBalanced: boolean}}
 */
const checkBalance = (text) => {
  const quotes = checkQuotes(text);
  const brackets = checkBrackets(text);
  return {
    errors: [...quotes.errors, ...brackets.errors].sort((a, b) => a.index - b.index),
    isBalanced: quotes.isBalanced && brackets.isBalanced,
  };
};

/**
 * Checks each line longer than 10 characters and reports its balance errors
 * with indexes relative to the whole text.
 * @param {string} text
 * @returns {{absoluteIndex: number, char: string, reason: string, type: string}[]}
 */
const getUnbalancedErrors = (text) => {
  const collected = [];
  const lines = text.split("\n");
  let offset = 0;

  lines.forEach((line, lineNumber) => {
    if (line.length > 10) {
      const balance = checkBalance(line);
      if (!balance.isBalanced) {
        for (const error of balance.errors) {
          collected.push({
            absoluteIndex: offset + error.index,
            char: error.char,
            reason: error.reason,
            type: error.type,
          });
        }
      }
    }
    // Account for the newline that split() removed (none after the last line).
    offset += line.length + (lineNumber < lines.length - 1 ? 1 : 0);
  });
  return collected;
};

/** @param {string} text @returns {boolean} */
const areQuotesBalanced = (text) => checkQuotes(text).isBalanced;
/** @param {string} text @returns {boolean} */
const areBracketsBalanced = (text) => checkBrackets(text).isBalanced;
/** @param {string} text @returns {boolean} */
const isBalanced = (text) => checkBalance(text).isBalanced;

const EMPTY_REFERENCE = "()";

/**
 * Whether the text contains an empty "()" or an OCR-confused reference such
 * as "(O)" or "(1V)".
 *
 * `invalidReferenceRegex` is global, so `lastIndex` is reset around the test;
 * otherwise a previous successful call would make the next one start searching
 * mid-string and report a false negative.
 *
 * @param {string} text
 * @returns {boolean}
 */
const hasInvalidFootnotes = (text) => {
  const pattern = PATTERNS.invalidReferenceRegex;
  pattern.lastIndex = 0;
  const found = pattern.test(text);
  pattern.lastIndex = 0;
  return found;
};

const arabicNumberFormat = new Intl.NumberFormat("ar-SA");
const formatArabicNumber = (value) => arabicNumberFormat.format(value);

/** Maps a character OCR tends to confuse with an Arabic-Indic digit to that digit. */
const ocrCharToArabicDigit = (char) =>
  ({
    1: "\u0661",
    9: "\u0669",
    ".": "\u0660",
    O: "\u0665",
    o: "\u0665",
    V: "\u0667",
    v: "\u0667",
  })[char] || char;

/** Converts every OCR-confused character inside a reference to its Arabic-Indic digit. */
const ocrReferenceToArabic = (reference) =>
  reference.replace(/[.1OV9]/g, (char) => ocrCharToArabicDigit(char));

/**
 * Parses a parenthesised Arabic-Indic reference such as "(١٢)" to a number.
 * Returns 0 when nothing parseable results.
 */
const arabicReferenceToNumber = (reference) => {
  const digitMap = {
    "\u0660": "0",
    "\u0661": "1",
    "\u0662": "2",
    "\u0663": "3",
    "\u0664": "4",
    "\u0665": "5",
    "\u0666": "6",
    "\u0667": "7",
    "\u0668": "8",
    "\u0669": "9",
  };
  const inner = reference.replace(/[()]/g, "");

  let western = "";
  for (const char of inner) {
    // An unmapped character appends "undefined", which makes parseInt stop there.
    western += digitMap[char];
  }

  const parsed = Number.parseInt(western, 10);
  return Number.isNaN(parsed) ? 0 : parsed;
};

/**
 * Collects reference markers from body lines (anywhere in the text) and from
 * footnote lines (only at the start of the text), including OCR-confused ones
 * converted to Arabic-Indic digits.
 */
const extractReferences = (lines) => {
  const bodyLines = lines.filter((line) => !line.isFootnote);
  const footnoteLines = lines.filter((line) => line.isFootnote);

  const arabicInBody = bodyLines.flatMap(
    (line) => line.text.match(PATTERNS.arabicReferenceRegex) || [],
  );
  const ocrConfusedInBody = bodyLines.flatMap(
    (line) => line.text.match(PATTERNS.ocrConfusedReferenceRegex) || [],
  );
  const arabicInFootnotes = footnoteLines.flatMap(
    (line) => line.text.match(PATTERNS.arabicFootnoteReferenceRegex) || [],
  );
  const ocrConfusedInFootnotes = footnoteLines.flatMap(
    (line) => line.text.match(PATTERNS.ocrConfusedFootnoteReferenceRegex) || [],
  );

  return {
    bodyReferences: [...arabicInBody, ...ocrConfusedInBody.map(ocrReferenceToArabic)],
    footnoteReferences: [...arabicInFootnotes, ...ocrConfusedInFootnotes.map(ocrReferenceToArabic)],
    ocrConfusedInBody,
    ocrConfusedInFootnotes,
  };
};

/**
 * Correction is needed when any line has an invalid reference or when the sets
 * of body and footnote references differ.
 */
const needsReferenceCorrection = (lines, references) => {
  if (lines.some((line) => hasInvalidFootnotes(line.text))) return true;

  const bodySet = new Set(references.bodyReferences);
  const footnoteSet = new Set(references.footnoteReferences);
  if (bodySet.size !== footnoteSet.size) return true;

  for (const reference of bodySet) {
    if (!footnoteSet.has(reference)) return true;
  }
  return false;
};

/**
 * Repairs footnote references across a list of lines.
 *
 * OCR-confused references are converted to Arabic-Indic digits. Each empty
 * "()" is then filled with a reference that exists only on the opposite side
 * (body vs. footnote), or, when none is left, with the next unused number.
 *
 * @param {{text: string, isFootnote?: boolean}[]} lines
 * @returns {{text: string, isFootnote?: boolean}[]} The input array itself when
 *   no correction is needed, otherwise a new array (untouched lines are reused).
 */
const correctReferences = (lines) => {
  const initialReferences = extractReferences(lines);
  if (!needsReferenceCorrection(lines, initialReferences)) return lines;

  const sanitizedLines = lines.map((line) => ({
    ...line,
    text: line.text.replace(/\([.1OV9]+\)/g, (reference) => ocrReferenceToArabic(reference)),
  }));

  const references = extractReferences(sanitizedLines);
  const bodySet = new Set(references.bodyReferences);
  const footnoteSet = new Set(references.footnoteReferences);

  // Queues of references present on one side only; consumed in order.
  const missingFromFootnotes = [...bodySet].filter((reference) => !footnoteSet.has(reference));
  const missingFromBody = [...footnoteSet].filter((reference) => !bodySet.has(reference));

  const allReferences = [...bodySet, ...footnoteSet];
  const highestUsed =
    allReferences.length > 0 ? Math.max(0, ...allReferences.map(arabicReferenceToNumber)) : 0;
  let nextNumber = highestUsed + 1;

  return sanitizedLines.map((line) => {
    if (!line.text.includes(EMPTY_REFERENCE)) return line;

    const text = line.text.replace(/\(\)/g, () => {
      const queue = line.isFootnote ? missingFromFootnotes : missingFromBody;
      const borrowed = queue.shift();
      if (borrowed) return borrowed;

      const generated = `(${formatArabicNumber(nextNumber)})`;
      nextNumber++;
      return generated;
    });
    return { ...line, text };
  });
};

/**
 * Counts characters by category and records per-character frequency.
 * Categories are tested in order: Arabic, digit, Latin letter, whitespace,
 * punctuation, and everything else as symbol.
 * @param {string} text
 * @returns {{arabicCount: number, charFreq: Map<string, number>, digitCount: number,
 *   latinCount: number, punctuationCount: number, spaceCount: number, symbolCount: number}}
 */
function analyzeCharacterStats(text) {
  const stats = {
    arabicCount: 0,
    charFreq: new Map(),
    digitCount: 0,
    latinCount: 0,
    punctuationCount: 0,
    spaceCount: 0,
    symbolCount: 0,
  };

  for (const char of Array.from(text)) {
    stats.charFreq.set(char, (stats.charFreq.get(char) || 0) + 1);
    if (PATTERNS.arabicCharacters.test(char)) {
      stats.arabicCount++;
    } else if (/\d/.test(char)) {
      stats.digitCount++;
    } else if (/[a-zA-Z]/.test(char)) {
      stats.latinCount++;
    } else if (/\s/.test(char)) {
      stats.spaceCount++;
    } else if (/[.,;:()[\]{}"""''`]/.test(char)) {
      stats.punctuationCount++;
    } else {
      stats.symbolCount++;
    }
  }
  return stats;
}

/**
 * True when "!", ".", "-", "=" or "_" characters that each occur at least five
 * times together make up more than 40% of the text.
 * @param {{charFreq: Map<string, number>}} stats
 * @param {number} textLength
 * @returns {boolean}
 */
function hasExcessiveRepetition(stats, textLength) {
  const repetitiveChars = ["!", ".", "-", "=", "_"];
  let repeated = 0;
  for (const [char, count] of stats.charFreq) {
    if (count >= 5 && repetitiveChars.includes(char)) repeated += count;
  }
  return repeated / textLength > 0.4;
}

/**
 * True when the whole text matches one of a fixed list of noise patterns
 * (rule lines, dots, exclamation marks, uppercase-only, digits/dashes, …).
 * @param {string} text
 * @returns {boolean}
 */
function isBasicNoisePattern(text) {
  const noisePatterns = [
    /^[-=_━≺≻\s]*$/,
    /^[.\s]*$/,
    /^[!\s]*$/,
    /^[A-Z\s]*$/,
    /^[-\d\s]*$/,
    /^\d+\s*$/,
    /^[A-Z]\s*$/,
    /^[—\s]*$/,
    /^[्र\s-]*$/,
  ];
  return noisePatterns.some((pattern) => pattern.test(text));
}

/**
 * Spacing heuristics for text without Arabic content.
 * @param {{arabicCount: number, spaceCount: number}} stats
 * @param {number} contentChars Arabic + Latin + digit count.
 * @param {number} textLength
 * @returns {boolean}
 */
function isSpacingNoise(stats, contentChars, textLength) {
  const { arabicCount, spaceCount } = stats;
  return (
    (spaceCount > 0 && contentChars === spaceCount + 1 && contentChars <= 5) ||
    (textLength <= 10 && spaceCount >= 2 && arabicCount === 0) ||
    spaceCount / textLength > 0.6
  );
}

/**
 * Noise heuristics applied to text that contains no Arabic characters.
 * The checks are evaluated in order and the first one that applies decides.
 * @param {object} stats Result of analyzeCharacterStats.
 * @param {number} textLength
 * @param {string} text
 * @returns {boolean}
 */
function isNonArabicNoise(stats, textLength, text) {
  const contentChars = stats.arabicCount + stats.latinCount + stats.digitCount;
  if (contentChars === 0 || isSpacingNoise(stats, contentChars, textLength)) return true;
  if (/[٠-٩]/.test(text) && stats.digitCount >= 3) return false;

  const symbolRatio =
    (stats.symbolCount + Math.max(0, stats.punctuationCount - 5)) / Math.max(contentChars, 1);
  const isShortNonNumber =
    textLength <= 5 &&
    stats.arabicCount === 0 &&
    !(/^\d+$/.test(text) && stats.digitCount >= 3);
  if (symbolRatio > 2 || isShortNonNumber) return true;

  if (/^\d{3,4}$/.test(text)) return false;
  return textLength <= 10;
}

/**
 * Whether text containing Arabic characters has enough substance to keep.
 * @param {object} stats Result of analyzeCharacterStats.
 * @param {number} textLength
 * @returns {boolean}
 */
function isValidArabicContent(stats, textLength) {
  return (
    stats.arabicCount >= 3 ||
    (stats.arabicCount >= 1 && stats.digitCount > 0 && textLength <= 20) ||
    (stats.arabicCount >= 2 && stats.punctuationCount <= 2 && textLength <= 10) ||
    (stats.arabicCount >= 1 && textLength <= 5 && stats.punctuationCount <= 1)
  );
}

/**
 * Classifies a text fragment as noise.
 * Empty/blank input, text shorter than two characters, basic noise patterns,
 * excessive repetition and Latin-only text are all noise; otherwise the Arabic
 * or non-Arabic heuristics decide.
 * @param {string} text
 * @returns {boolean}
 */
const isArabicTextNoise = (text) => {
  if (!text || text.trim().length === 0) return true;

  const trimmed = text.trim();
  const length = trimmed.length;
  if (length < 2 || isBasicNoisePattern(trimmed)) return true;

  const stats = analyzeCharacterStats(trimmed);
  if (hasExcessiveRepetition(stats, length)) return true;

  const hasArabic = PATTERNS.arabicCharacters.test(trimmed);
  if (!hasArabic && /[a-zA-Z]/.test(trimmed)) return true;
  return hasArabic ? !isValidArabicContent(stats, length) : isNonArabicNoise(stats, length, trimmed);
};

/** Resolves one aligned token pair into the token(s) to keep. */
const resolveAlignedPair = (tokenA, tokenB, { similarityThreshold, typoSymbols }) => {
  if (tokenA === null) return [tokenB];
  if (tokenB === null) return [tokenA];
  if (normalizeArabicText(tokenA) === normalizeArabicText(tokenB)) return [tokenA];

  const embeddedChoice = handleFootnoteSelection(tokenA, tokenB);
  if (embeddedChoice) return embeddedChoice;

  const standaloneChoice = handleStandaloneFootnotes(tokenA, tokenB);
  if (standaloneChoice) return standaloneChoice;

  if (typoSymbols.includes(tokenA) || typoSymbols.includes(tokenB)) {
    const symbol = typoSymbols.find((candidate) => candidate === tokenA || candidate === tokenB);
    return symbol ? [symbol] : [tokenA];
  }

  const similarity = calculateSimilarity(normalizeArabicText(tokenA), normalizeArabicText(tokenB));
  return [similarity > similarityThreshold ? tokenA : tokenB];
};

/**
 * Drops adjacent near-duplicates (keeping the shorter token) and fused
 * footnote pairs.
 */
const removeAdjacentDuplicates = (tokens, highSimilarityThreshold) => {
  if (tokens.length === 0) return tokens;

  const kept = [];
  for (const token of tokens) {
    if (kept.length === 0) {
      kept.push(token);
      continue;
    }

    const previous = kept.at(-1);
    if (areSimilarAfterNormalization(previous, token, highSimilarityThreshold)) {
      if (token.length < previous.length) kept[kept.length - 1] = token;
      continue;
    }
    if (!handleFootnoteFusion(kept, previous, token)) kept.push(token);
  }
  return kept;
};

/**
 * Tokenizes both texts, aligns them, resolves every aligned pair and joins the
 * de-duplicated result with single spaces.
 * @param {string} originalText
 * @param {string} altText
 * @param {{highSimilarityThreshold: number, similarityThreshold: number, typoSymbols: string[]}} options
 * @returns {string}
 */
const processTextAlignment = (originalText, altText, options) => {
  const originalTokens = tokenizeText(originalText, options.typoSymbols);
  const altTokens = tokenizeText(altText, options.typoSymbols);

  const merged = alignTokenSequences(
    originalTokens,
    altTokens,
    options.typoSymbols,
    options.similarityThreshold,
  ).flatMap(([tokenA, tokenB]) => resolveAlignedPair(tokenA, tokenB, options));

  return removeAdjacentDuplicates(merged, options.highSimilarityThreshold).join(" ");
};

/**
 * Public entry point: `processTextAlignment` with default thresholds.
 * @param {string} original
 * @param {string} correction
 * @param {{highSimilarityThreshold?: number, similarityThreshold?: number, typoSymbols: string[]}} options
 * @returns {string}
 */
const fixTypo = (
  original,
  correction,
  { highSimilarityThreshold = 0.8, similarityThreshold = 0.6, typoSymbols },
) => processTextAlignment(original, correction, { highSimilarityThreshold, similarityThreshold, typoSymbols });

export {
  BRACKETS,
  CLOSE_BRACKETS,
  INTAHA_ACTUAL,
  OPEN_BRACKETS,
  PATTERNS,
  alignTextSegments,
  alignTokenSequences,
  analyzeCharacterStats,
  areBracketsBalanced,
  areQuotesBalanced,
  areSimilarAfterNormalization,
  backtrackAlignment,
  calculateAlignmentScore,
  calculateLevenshteinDistance,
  calculateSimilarity,
  checkBalance,
  correctReferences,
  extractDigits,
  fixTypo,
  getUnbalancedErrors,
  handleFootnoteFusion,
  handleFootnoteSelection,
  handleStandaloneFootnotes,
  hasExcessiveRepetition,
  hasInvalidFootnotes,
  isArabicTextNoise,
  isBalanced,
  isBasicNoisePattern,
  isNonArabicNoise,
  isSpacingNoise,
  isValidArabicContent,
  normalizeArabicText,
  processTextAlignment,
  standardizeHijriSymbol,
  standardizeIntahaSymbol,
  tokenizeText,
};
//# sourceMappingURL=index.js.map