paragrafs
Version:
A lightweight TypeScript library designed to reconstruct paragraphs from AI transcriptions.
4 lines • 10.3 kB
JavaScript
/**
 * paragrafs - reconstructs paragraphs/segments from timed transcription tokens.
 *
 * Data shapes used throughout this module:
 *   token    : { start: number, end: number, text: string, isUnknown?: boolean }
 *   segment  : { start: number, end: number, text?: string, tokens: token[] }
 *   marked   : an array mixing tokens with the string markers SEGMENT_BREAK / ALWAYS_BREAK
 *   hints    : { map: { [firstWord]: string[][] }, normalization: object }
 */

/** Marker placed where a segment/line MAY be split. */
const SEGMENT_BREAK = 'SEGMENT_BREAK';

/** Marker placed where a segment/line MUST be split (emitted before a hint match). */
const ALWAYS_BREAK = 'ALWAYS_BREAK';

/** Sentence-ending punctuation (Latin and Arabic) at the end of a string. */
const SENTENCE_END_PATTERN = /[.؟!?؛…]$/;

/** Defaults applied to hint creation and hint mining when the caller gives no overrides. */
const DEFAULT_NORMALIZATION = {
  normalizeAlef: true,
  normalizeHamza: false,
  normalizeYa: true,
  removeTatweel: true,
};

/** Defaults for n-gram hint mining. */
const DEFAULT_HINT_OPTIONS = { dedupe: 'closed', maxN: 6, minCount: 2, minN: 2 };

/** Maximum number of distinct surface forms tracked per phrase. */
const MAX_SURFACE_FORMS = 5;

/** Maximum number of occurrence positions stored per phrase. */
const MAX_OCCURRENCES = 5e3;

/**
 * Tells whether `text` ends with sentence-ending punctuation.
 * @param {string} text
 * @returns {boolean}
 */
function isEndingWithPunctuation(text) {
  return SENTENCE_END_PATTERN.test(text);
}

/**
 * Formats a number of seconds as `m:ss`, or `h:mm:ss` once it reaches an hour.
 * @param {number} seconds
 * @returns {string}
 */
function formatSecondsToTimestamp(seconds) {
  const hours = Math.floor(seconds / 3600);
  const minutes = Math.floor((seconds % 3600) / 60);
  const secs = Math.floor(seconds % 60);
  const paddedSeconds = secs.toString().padStart(2, '0');
  if (hours > 0) {
    return `${hours}:${minutes.toString().padStart(2, '0')}:${paddedSeconds}`;
  }
  return `${minutes}:${paddedSeconds}`;
}

/**
 * Normalizes a single word for comparison: strips zero-width characters,
 * combining marks, Arabic diacritics and leading/trailing
 * punctuation/symbol/format characters.
 * @param {string} word
 * @returns {string}
 */
function normalizeWord(word) {
  return word
    .normalize('NFD')
    .replace(/[\u200B-\u200D\uFEFF]/g, '')
    .replace(/\p{Mn}/gu, '')
    .replace(/[\u064B-\u065F]/g, '')
    .replace(/^[\p{P}\p{S}\p{Cf}]+|[\p{P}\p{S}\p{Cf}]+$/gu, '')
    .normalize('NFC');
}

/**
 * Normalizes token text with {@link normalizeWord} plus optional Arabic-specific rules.
 * @param {string} text
 * @param {{normalizeAlef?: boolean, normalizeHamza?: boolean, normalizeYa?: boolean, removeTatweel?: boolean}} [options]
 * @returns {string}
 */
function normalizeTokenText(text, options) {
  let input = text;
  if (options?.normalizeHamza) {
    // Must run before normalizeWord, which removes the combining hamza marks.
    input = input
      .normalize('NFD')
      .replace(/\u064A\p{Mn}*\u0654/gu, 'ء')
      .replace(/\u0648\p{Mn}*\u0654/gu, 'ء')
      .replace(/[\u0654\u0655]/g, '')
      .normalize('NFC');
  }

  let normalized = normalizeWord(input);
  if (options?.removeTatweel) {
    normalized = normalized.replace(/\u0640/g, '');
  }
  if (options?.normalizeAlef) {
    normalized = normalized.replace(/[أإآ]/g, 'ا');
  }
  if (options?.normalizeYa) {
    normalized = normalized.replace(/ى/g, 'ي');
  }
  return normalized;
}

/**
 * Builds a hint lookup from phrases. The first argument may be a normalization
 * options object (merged over the defaults) or the first phrase.
 *
 * Each phrase is split on whitespace, every word is normalized, and the phrase
 * is indexed under its first normalized word.
 *
 * @param {string|object} first normalization overrides, or the first phrase
 * @param {...string} rest remaining phrases
 * @returns {{map: Object<string, string[][]>, normalization: object}}
 */
function createHints(first, ...rest) {
  const startsWithPhrase = typeof first === 'string';
  // Always hand out a fresh object so callers cannot mutate the module defaults.
  const normalization = startsWithPhrase
    ? { ...DEFAULT_NORMALIZATION }
    : { ...DEFAULT_NORMALIZATION, ...first };
  const phrases = startsWithPhrase ? [first, ...rest] : rest;

  const map = {};
  for (const phrase of phrases) {
    const words = phrase
      .split(/\s+/)
      .map((word) => normalizeTokenText(word, normalization))
      .filter(Boolean);
    if (words.length === 0) {
      continue;
    }
    const head = words[0];
    // Own-property check: words such as "constructor" or "toString" would
    // otherwise resolve to inherited Object.prototype members.
    if (!Object.hasOwn(map, head)) {
      map[head] = [];
    }
    map[head].push(words);
  }
  return { map, normalization };
}

/**
 * Splits ground-truth text into words, gluing stand-alone punctuation/symbol
 * chunks onto the preceding word.
 * @param {string} text
 * @returns {string[]}
 */
function tokenizeGroundTruth(text) {
  const pieces = text
    .trim()
    .split(/\s+/)
    .map((piece) => piece.trim())
    .filter(Boolean);

  const words = [];
  for (const piece of pieces) {
    if (words.length > 0 && /^[\p{P}\p{S}]+$/u.test(piece)) {
      words[words.length - 1] += piece;
    } else {
      words.push(piece);
    }
  }
  return words;
}

/**
 * Builds the longest-common-subsequence length table for two word lists.
 * @param {string[]} a
 * @param {string[]} b
 * @returns {number[][]} table of size (a.length + 1) x (b.length + 1)
 */
function buildLcsTable(a, b) {
  const table = Array.from({ length: a.length + 1 }, () => Array(b.length + 1).fill(0));
  for (let i = 0; i < a.length; i++) {
    for (let j = 0; j < b.length; j++) {
      table[i + 1][j + 1] =
        a[i] === b[j] ? table[i][j] + 1 : Math.max(table[i][j + 1], table[i + 1][j]);
    }
  }
  return table;
}

/**
 * Walks an LCS table backwards and returns the matched index pairs.
 * @param {number[][]} table
 * @param {string[]} a
 * @param {string[]} b
 * @returns {Map<number, number>} index in `a` -> index in `b`
 */
function backtrackLcs(table, a, b) {
  const matches = new Map();
  let i = a.length;
  let j = b.length;
  while (i > 0 && j > 0) {
    if (a[i - 1] === b[j - 1]) {
      matches.set(i - 1, j - 1);
      i--;
      j--;
    } else if (table[i - 1][j] >= table[i][j - 1]) {
      i--;
    } else {
      j--;
    }
  }
  return matches;
}

/**
 * Tells whether `phrase` occurs in `words` starting exactly at `index`.
 * @param {string[]} words
 * @param {string[]} phrase
 * @param {number} index
 * @returns {boolean}
 */
function phraseMatchesAt(words, phrase, index) {
  if (index + phrase.length > words.length) {
    return false;
  }
  for (let offset = 0; offset < phrase.length; offset++) {
    if (words[index + offset] !== phrase[offset]) {
      return false;
    }
  }
  return true;
}

/**
 * Tells whether any hint phrase starts at `index` of the normalized token list.
 * @param {string[]} normalizedTokens
 * @param {{map: Object<string, string[][]>}} hints
 * @param {number} index
 * @returns {boolean}
 */
function matchesAnyHintAt(normalizedTokens, hints, index) {
  const head = normalizedTokens[index];
  // Own-property check: a transcript word like "constructor" must not be
  // looked up on Object.prototype (that previously threw "not iterable").
  if (!Object.hasOwn(hints.map, head)) {
    return false;
  }
  const candidates = hints.map[head];
  if (!candidates) {
    return false;
  }
  for (const phrase of candidates) {
    if (phraseMatchesAt(normalizedTokens, phrase, index)) {
      return true;
    }
  }
  return false;
}

/**
 * Creates a token for a ground-truth word that has no transcription token by
 * spreading the time between the surrounding tokens evenly over the extra words.
 * @param {string} text the ground-truth word
 * @param {{gtGap: string[], gtGapIndex: number, nextToken: object, prevToken: object|null, tokenGap: object[]}} context
 * @returns {{end: number, start: number, text: string}}
 */
function interpolateMissingToken(text, { gtGap, gtGapIndex, nextToken, prevToken, tokenGap }) {
  const gapStart = prevToken?.end ?? 0;
  const available = Math.max(0, nextToken.start - gapStart);
  const extraWords = gtGap.length - tokenGap.length;
  const slot = extraWords > 0 ? available / extraWords : 0;
  const start = gapStart + (gtGapIndex - tokenGap.length) * slot;
  return { end: start + slot, start, text };
}

/**
 * Computes anchor pairs [tokenIndex, groundTruthIndex] using LCS over
 * normalized words. The first pair (0, 0) is always forced, as is the
 * last-to-last pair when both lists have more than one item.
 * @param {object[]} tokens
 * @param {string[]} groundTruthWords
 * @returns {Array<[number, number]>} sorted by token index
 */
function buildAnchors(tokens, groundTruthWords) {
  const normalizedTokens = tokens.map((token) => normalizeWord(token.text));
  const normalizedTruth = groundTruthWords.map((word) => normalizeWord(word));
  const anchors = backtrackLcs(
    buildLcsTable(normalizedTokens, normalizedTruth),
    normalizedTokens,
    normalizedTruth,
  );

  anchors.set(0, 0);
  if (tokens.length > 1 && groundTruthWords.length > 1) {
    anchors.set(tokens.length - 1, groundTruthWords.length - 1);
  }

  // Drop any pair whose ground-truth index does not advance past the
  // pair immediately before it in token order.
  return Array.from(anchors.entries())
    .sort((left, right) => left[0] - right[0])
    .filter((pair, position, all) => !position || pair[1] > all[position - 1][1]);
}

/**
 * Rewrites token text from ground truth, anchor by anchor. Between anchors:
 * paired tokens take the ground-truth word, surplus tokens are flagged
 * `isUnknown`, and surplus ground-truth words become interpolated tokens.
 * @param {object[]} tokens
 * @param {string[]} groundTruthWords
 * @param {Array<[number, number]>} anchors
 * @returns {{lastGtIndex: number, lastTokenIndex: number, result: object[]}}
 */
function alignWithAnchors(tokens, groundTruthWords, anchors) {
  const result = [];
  let lastTokenIndex = -1;
  let lastGtIndex = -1;

  for (const [tokenIndex, gtIndex] of anchors) {
    const tokenGap = tokens.slice(lastTokenIndex + 1, tokenIndex);
    const gtGap = groundTruthWords.slice(lastGtIndex + 1, gtIndex);
    let t = 0;
    let g = 0;

    while (t < tokenGap.length || g < gtGap.length) {
      if (t >= tokenGap.length) {
        result.push(
          interpolateMissingToken(gtGap[g], {
            gtGap,
            gtGapIndex: g,
            nextToken: tokens[tokenIndex],
            prevToken: lastTokenIndex === -1 ? null : tokens[lastTokenIndex],
            tokenGap,
          }),
        );
        g++;
      } else if (g >= gtGap.length) {
        result.push({ ...tokenGap[t], isUnknown: true });
        t++;
      } else {
        result.push({ ...tokenGap[t], text: gtGap[g] });
        t++;
        g++;
      }
    }

    result.push({ ...tokens[tokenIndex], text: groundTruthWords[gtIndex] });
    lastTokenIndex = tokenIndex;
    lastGtIndex = gtIndex;
  }

  return { lastGtIndex, lastTokenIndex, result };
}

/**
 * Appends tokens after the last anchor as `isUnknown`, but only when no
 * ground-truth words remain after the last anchored word.
 * @param {object[]} result mutated in place
 * @param {object[]} tokens
 * @param {string[]} groundTruthWords
 * @param {number} lastTokenIndex
 * @param {number} lastGtIndex
 * @returns {void}
 */
function appendTrailingTokens(result, tokens, groundTruthWords, lastTokenIndex, lastGtIndex) {
  const remainingTokens = tokens.slice(lastTokenIndex + 1);
  const remainingTruth = groundTruthWords.slice(lastGtIndex + 1);
  if (remainingTruth.length > 0) {
    return;
  }
  for (const token of remainingTokens) {
    result.push({ ...token, isUnknown: true });
  }
}

/**
 * Aligns transcription tokens with ground-truth text.
 * @param {object[]} tokens
 * @param {string} groundTruth
 * @returns {object[]} new tokens; unmatched ones carry `isUnknown: true`
 */
function syncTokensWithGroundTruth(tokens, groundTruth) {
  if (tokens.length === 0) {
    return [];
  }
  const words = tokenizeGroundTruth(groundTruth);
  if (words.length === 0) {
    return tokens.map((token) => ({ ...token, isUnknown: true }));
  }
  const { lastGtIndex, lastTokenIndex, result } = alignWithAnchors(
    tokens,
    words,
    buildAnchors(tokens, words),
  );
  appendTrailingTokens(result, tokens, words, lastTokenIndex, lastGtIndex);
  return result;
}

/**
 * Expands one multi-word token into a segment whose words share the token's
 * duration equally. The text is split on whitespace as-is (it is not trimmed).
 * @param {{end: number, start: number, text: string}} token
 * @returns {{end: number, start: number, text: string, tokens: object[]}}
 */
function estimateSegmentFromToken({ end, start, text }) {
  const words = text.split(/\s+/);
  const perWord = (end - start) / words.length;
  return {
    end,
    start,
    text,
    tokens: words.map((word, index) => ({
      end: start + (index + 1) * perWord,
      start: start + index * perWord,
      text: word,
    })),
  };
}

/**
 * Interleaves tokens with break markers:
 *  - a filler token is replaced by SEGMENT_BREAK;
 *  - ALWAYS_BREAK is emitted before a token that starts a hint phrase;
 *  - SEGMENT_BREAK is emitted before a token that follows a pause longer than
 *    `gapThreshold`, and after a token ending in sentence punctuation.
 * @param {object[]} tokens
 * @param {{fillers?: string[], gapThreshold: number, hints?: object}} options
 * @returns {Array<object|string>}
 */
function markTokensWithDividers(tokens, { fillers = [], gapThreshold, hints }) {
  const marked = [];
  const normalized = hints
    ? tokens.map((token) => normalizeTokenText(token.text, hints.normalization))
    : null;
  let previousEnd = null;

  for (let index = 0; index < tokens.length; index++) {
    const token = tokens[index];
    if (fillers.includes(token.text)) {
      // Fillers are dropped and do not update previousEnd.
      marked.push(SEGMENT_BREAK);
      continue;
    }
    if (hints && normalized && matchesAnyHintAt(normalized, hints, index)) {
      marked.push(ALWAYS_BREAK);
    }
    if (previousEnd !== null && token.start - previousEnd > gapThreshold) {
      marked.push(SEGMENT_BREAK);
    }
    marked.push(token);
    if (isEndingWithPunctuation(token.text)) {
      marked.push(SEGMENT_BREAK);
    }
    previousEnd = token.end;
  }
  return marked;
}

/**
 * Groups a marked token stream into segments. A segment is closed at an
 * ALWAYS_BREAK, or before a marker once it spans more than
 * `maxSecondsPerSegment`. Markers stay inside the `tokens` arrays; a segment
 * opened by ALWAYS_BREAK has that marker as its first element.
 * @param {Array<object|string>} markedTokens
 * @param {number} maxSecondsPerSegment
 * @returns {Array<{end: number, start: number, tokens: Array<object|string>}>}
 */
function groupMarkedTokensIntoSegments(markedTokens, maxSecondsPerSegment) {
  const segments = [];
  let current = [];
  let start = null;
  let end = null;

  const flush = () => {
    // A buffer holding only markers has no timing and is not emitted.
    if (current.length === 0 || start === null || end === null) {
      return;
    }
    segments.push({ end, start, tokens: current });
  };
  const reset = () => {
    current = [];
    start = null;
    end = null;
  };
  const isTooLong = () => start !== null && end !== null && end - start > maxSecondsPerSegment;

  for (let index = 0; index < markedTokens.length; index++) {
    const item = markedTokens[index];
    const next = markedTokens[index + 1];

    if (item === ALWAYS_BREAK) {
      flush();
      reset();
      current = [ALWAYS_BREAK];
      continue;
    }

    if (item !== SEGMENT_BREAK) {
      if (start === null) {
        start = item.start;
      }
      end = item.end;
    }
    current.push(item);

    const nextIsMarker = next === SEGMENT_BREAK || next === ALWAYS_BREAK;
    if (nextIsMarker && isTooLong()) {
      flush();
      reset();
    }
  }

  flush();
  return segments;
}

/**
 * Folds segments with fewer than `minWordsPerSegment` real tokens into the
 * previous segment, unless the segment contains ALWAYS_BREAK. Input segments
 * and their `tokens` arrays are left untouched.
 * @param {Array<{end: number, start: number, tokens: Array<object|string>}>} segments
 * @param {number} minWordsPerSegment
 * @returns {Array<{end: number, start: number, tokens: Array<object|string>}>}
 */
function mergeShortSegmentsWithPrevious(segments, minWordsPerSegment) {
  const merged = [];
  for (const segment of segments) {
    const words = segment.tokens.filter(
      (item) => item !== SEGMENT_BREAK && item !== ALWAYS_BREAK,
    );
    const isForcedBreak = segment.tokens.includes(ALWAYS_BREAK);

    if (!isForcedBreak && words.length < minWordsPerSegment && merged.length > 0) {
      const previous = merged[merged.length - 1];
      previous.tokens.push(...segment.tokens);
      previous.end = segment.end;
    } else {
      // Copy the tokens array too, so later merges never mutate caller data.
      merged.push({ ...segment, tokens: [...segment.tokens] });
    }
  }
  return merged;
}

/**
 * Renders one marked segment as lines. A line ends at ALWAYS_BREAK, or at
 * SEGMENT_BREAK once it lasts at least `maxSecondsPerLine` and its last token
 * ends with sentence punctuation.
 * @param {{tokens: Array<object|string>}} segment
 * @param {number} maxSecondsPerLine
 * @param {(line: {end: number, start: number, text: string}) => string} [formatTokens]
 * @returns {string[]}
 */
function formatSegmentLines(segment, maxSecondsPerLine, formatTokens) {
  const lines = [];
  let buffer = [];
  let lineStart = null;

  const flush = () => {
    if (buffer.length === 0) {
      return;
    }
    const text = buffer.map((token) => token.text).join(' ');
    lines.push(
      formatTokens
        ? formatTokens({ end: buffer.at(-1).end, start: buffer[0].start, text })
        : `${formatSecondsToTimestamp(buffer[0].start)}: ${text}`,
    );
    buffer = [];
    lineStart = null;
  };

  const canBreak = () => {
    if (buffer.length === 0) {
      return false;
    }
    const last = buffer[buffer.length - 1];
    const duration = lineStart === null ? 0 : last.end - lineStart;
    if (duration < maxSecondsPerLine) {
      return false;
    }
    return isEndingWithPunctuation(last.text);
  };

  for (const item of segment.tokens) {
    if (item === ALWAYS_BREAK) {
      flush();
      continue;
    }
    if (item === SEGMENT_BREAK) {
      if (canBreak()) {
        flush();
      }
      continue;
    }
    if (lineStart === null) {
      lineStart = item.start;
    }
    buffer.push(item);
  }

  flush();
  return lines;
}

/**
 * Renders marked segments as a newline-separated transcript. Each line is
 * `timestamp: text` unless `formatTokens` is supplied.
 * @param {Array<{tokens: Array<object|string>}>} segments
 * @param {number} maxSecondsPerLine
 * @param {(line: {end: number, start: number, text: string}) => string} [formatTokens]
 * @returns {string}
 */
function formatSegmentsToTimestampedTranscript(segments, maxSecondsPerLine, formatTokens) {
  return segments
    .flatMap((segment) => formatSegmentLines(segment, maxSecondsPerLine, formatTokens))
    .join('\n');
}

/**
 * Converts marked segments into plain segments: markers are removed from
 * `tokens` and `text` holds newline-separated lines. A line ends at
 * ALWAYS_BREAK, or at SEGMENT_BREAK when `maxSecondsPerLine` is falsy or the
 * line has lasted longer than it.
 * @param {Array<{end: number, start: number, tokens: Array<object|string>}>} segments
 * @param {number} [maxSecondsPerLine]
 * @returns {Array<{end: number, start: number, text: string, tokens: object[]}>}
 */
function mapSegmentsIntoFormattedSegments(segments, maxSecondsPerLine) {
  return segments.map((segment) => {
    const lines = [];
    const words = [];
    let buffer = [];
    let lineStart = null;

    const flush = () => {
      if (buffer.length > 0) {
        lines.push(buffer.map((token) => token.text).join(' '));
        buffer = [];
        lineStart = null;
      }
    };

    const canBreak = () => {
      if (!maxSecondsPerLine) {
        return true;
      }
      if (buffer.length === 0) {
        return false;
      }
      const lastEnd = buffer[buffer.length - 1].end;
      return (lineStart === null ? 0 : lastEnd - lineStart) > maxSecondsPerLine;
    };

    for (const item of segment.tokens) {
      if (item === ALWAYS_BREAK) {
        flush();
        continue;
      }
      if (item === SEGMENT_BREAK) {
        if (canBreak()) {
          flush();
        }
        continue;
      }
      if (lineStart === null) {
        lineStart = item.start;
      }
      buffer.push(item);
      words.push(item);
    }

    flush();
    return { end: segment.end, start: segment.start, text: lines.join('\n'), tokens: words };
  });
}

/**
 * Removes SEGMENT_BREAK markers that are followed by another marker, that
 * would isolate a single trailing token (the item two positions ahead is a
 * marker or missing), or that directly follow an already kept SEGMENT_BREAK.
 * @param {Array<object|string>} markedTokens
 * @returns {Array<object|string>}
 */
function cleanupIsolatedTokens(markedTokens) {
  const cleaned = [];
  for (let index = 0; index < markedTokens.length; index++) {
    const item = markedTokens[index];
    if (item !== SEGMENT_BREAK) {
      cleaned.push(item);
      continue;
    }

    const next = markedTokens[index + 1];
    const afterNext = markedTokens[index + 2];
    const followedByMarker = next === ALWAYS_BREAK || next === SEGMENT_BREAK;
    const isolatesNext =
      afterNext === SEGMENT_BREAK || afterNext === ALWAYS_BREAK || !afterNext;
    const repeatsBreak = cleaned.at(-1) === SEGMENT_BREAK;

    if (!(followedByMarker || isolatesNext || repeatsBreak)) {
      cleaned.push(item);
    }
  }
  return cleaned;
}

/**
 * Full pipeline: flattens segment tokens, marks break points, cleans up
 * redundant breaks, groups into segments and merges short ones.
 * @param {Array<{tokens: object[]}>} segments
 * @param {{fillers?: string[], gapThreshold: number, hints?: object, maxSecondsPerSegment: number, minWordsPerSegment: number}} options
 * @returns {Array<{end: number, start: number, tokens: Array<object|string>}>}
 */
function markAndCombineSegments(segments, options) {
  const marked = markTokensWithDividers(
    segments.flatMap((segment) => segment.tokens),
    {
      fillers: options.fillers,
      gapThreshold: options.gapThreshold,
      ...(options.hints && { hints: options.hints }),
    },
  );
  const cleaned = cleanupIsolatedTokens(marked);
  return mergeShortSegmentsWithPrevious(
    groupMarkedTokensIntoSegments(cleaned, options.maxSecondsPerSegment),
    options.minWordsPerSegment,
  );
}

/**
 * Returns a copy of `segment` whose text is `groundTruth` and whose tokens are
 * aligned to it; tokens that could not be matched carry `isUnknown: true`.
 * @param {{end: number, start: number, tokens: object[]}} segment
 * @param {string} groundTruth
 * @returns {{end: number, start: number, text: string, tokens: object[]}}
 */
function updateSegmentWithGroundTruth(segment, groundTruth) {
  return {
    end: segment.end,
    start: segment.start,
    text: groundTruth,
    tokens: syncTokensWithGroundTruth(segment.tokens, groundTruth),
  };
}

/**
 * Like {@link updateSegmentWithGroundTruth}, but drops `isUnknown` tokens.
 * @param {{end: number, start: number, tokens: object[]}} segment
 * @param {string} groundTruth
 * @returns {{end: number, start: number, text: string, tokens: object[]}}
 */
function applyGroundTruthToSegment(segment, groundTruth) {
  const updated = updateSegmentWithGroundTruth(segment, groundTruth);
  return { ...updated, tokens: updated.tokens.filter((token) => !token.isUnknown) };
}

/**
 * Concatenates segments into one. Expects at least one segment (it reads the
 * first and last entries and throws on an empty array).
 * @param {Array<{end: number, start: number, text: string, tokens: object[]}>} segments
 * @param {string} [delimiter=' '] placed between segment texts
 * @returns {{end: number, start: number, text: string, tokens: object[]}}
 */
function mergeSegments(segments, delimiter = ' ') {
  const text = segments.map((segment) => segment.text).join(delimiter);
  const tokens = segments.flatMap((segment) => segment.tokens);
  return { end: segments.at(-1).end, start: segments[0].start, text, tokens };
}

/**
 * Splits a segment into two at `splitTime`: tokens starting before it go to
 * the first half, the rest to the second. Expects both halves to be non-empty
 * (it reads the last/first token of each half and throws otherwise).
 * @param {{end: number, start: number, tokens: object[]}} segment
 * @param {number} splitTime
 * @returns {[object, object]}
 */
function splitSegment(segment, splitTime) {
  const before = segment.tokens.filter((token) => token.start < splitTime);
  const after = segment.tokens.filter((token) => token.start >= splitTime);
  const beforeText = before.map((token) => token.text).join(' ');
  const afterText = after.map((token) => token.text).join(' ');
  return [
    { end: before.at(-1).end, start: segment.start, text: beforeText, tokens: before },
    { end: segment.end, start: after[0].start, text: afterText, tokens: after },
  ];
}

/**
 * Finds the first token at which `query` (treated as a hint phrase) matches.
 * @param {object[]} tokens
 * @param {string} query
 * @returns {object|null}
 */
function getFirstMatchingToken(tokens, query) {
  const hints = createHints(query);
  const normalized = tokens.map((token) => normalizeTokenText(token.text, hints.normalization));
  for (let index = 0; index < tokens.length; index++) {
    if (matchesAnyHintAt(normalized, hints, index)) {
      return tokens[index];
    }
  }
  return null;
}

/**
 * Finds the first token whose position inside `segment.text` fully contains
 * the character range [selectionStart, selectionEnd].
 * @param {{text: string, tokens: object[]}} segment
 * @param {number} selectionStart
 * @param {number} selectionEnd
 * @returns {object|null}
 */
function getFirstTokenForSelection(segment, selectionStart, selectionEnd) {
  const { text, tokens } = segment;
  let cursor = 0;
  for (const token of tokens) {
    const from = text.indexOf(token.text, cursor);
    if (from === -1) {
      continue;
    }
    const to = from + token.text.length;
    // Skip the separator following this token for the next search.
    cursor = to + 1;
    if (selectionStart >= from && selectionEnd <= to) {
      return token;
    }
  }
  return null;
}

/**
 * Tells whether every word is a stopword (never true for an empty stopword list).
 * @param {string[]} words
 * @param {string[]} stopwords
 * @returns {boolean}
 */
function isAllStopwords(words, stopwords) {
  return stopwords.length === 0 ? false : words.every((word) => stopwords.includes(word));
}

/**
 * Returns the `limit` most frequent keys of a count map; ties break alphabetically.
 * @param {Map<string, number>} counts
 * @param {number} [limit=3]
 * @returns {string[]}
 */
function topSurfaceForms(counts, limit = 3) {
  return Array.from(counts.entries())
    .sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0]))
    .slice(0, limit)
    .map(([form]) => form);
}

/**
 * Counts one occurrence of a surface form; new forms are ignored once
 * MAX_SURFACE_FORMS distinct forms are tracked.
 * @param {{surfaceCounts: Map<string, number>}} entry
 * @param {string} form
 * @returns {void}
 */
function recordSurfaceForm(entry, form) {
  const seen = entry.surfaceCounts.get(form);
  if (seen !== undefined) {
    entry.surfaceCounts.set(form, seen + 1);
    return;
  }
  if (entry.surfaceCounts.size < MAX_SURFACE_FORMS) {
    entry.surfaceCounts.set(form, 1);
  }
}

/**
 * Stores an occurrence position, flagging the entry as truncated once
 * MAX_OCCURRENCES positions are held.
 * @param {{occurrenceIndices: number[], occurrencesTruncated: boolean}} entry
 * @param {number} position
 * @returns {void}
 */
function recordOccurrence(entry, position) {
  if (entry.occurrenceIndices.length < MAX_OCCURRENCES) {
    entry.occurrenceIndices.push(position);
  } else {
    entry.occurrencesTruncated = true;
  }
}

/**
 * Returns the word count of a serialized n-gram key, memoized in `cache`.
 * @param {string} key JSON-encoded word array
 * @param {Map<string, number>} cache
 * @returns {number}
 */
function phraseLength(key, cache) {
  const cached = cache.get(key);
  if (cached !== undefined) {
    return cached;
  }
  const length = JSON.parse(key).length;
  cache.set(key, length);
  return length;
}

/**
 * Returns the entries of `stats` ordered by longest phrase first, then by
 * highest count.
 * @param {Map<string, object>} stats
 * @param {Map<string, number>} cache
 * @returns {Array<[string, object]>}
 */
function sortByLengthThenCount(stats, cache) {
  return Array.from(stats.entries()).sort((left, right) => {
    const leftLength = phraseLength(left[0], cache);
    return phraseLength(right[0], cache) - leftLength || right[1].count - left[1].count;
  });
}

/**
 * Returns the key of the sub-phrase of `parentWords` at [offset, offset + size)
 * when it has the same count as the parent and occurs at exactly the parent's
 * positions shifted by `offset`; otherwise null.
 * @param {Map<string, object>} stats
 * @param {object} parent
 * @param {string[]} parentWords
 * @param {number} offset
 * @param {number} size
 * @returns {string|null}
 */
function findEquivalentSubPhrase(stats, parent, parentWords, offset, size) {
  const subKey = JSON.stringify(parentWords.slice(offset, offset + size));
  const sub = stats.get(subKey);
  if (!sub || sub.count !== parent.count) {
    return null;
  }

  const shifted = new Set();
  for (const position of new Set(parent.occurrenceIndices)) {
    shifted.add(position + offset);
  }
  const subPositions = new Set(sub.occurrenceIndices);
  const samePositions =
    shifted.size === subPositions.size &&
    Array.from(shifted).every((position) => subPositions.has(position));
  return samePositions ? subKey : null;
}

/**
 * Collects keys of phrases that only ever occur inside a longer phrase
 * ("closed" dedupe). Entries with truncated occurrence lists are not used as
 * parents. Only sub-phrases of two or more words are considered.
 * @param {Map<string, object>} stats
 * @returns {Set<string>}
 */
function findSubsumedPhrases(stats) {
  const subsumed = new Set();
  for (const [key, entry] of sortByLengthThenCount(stats, new Map())) {
    if (entry.occurrencesTruncated) {
      continue;
    }
    const words = JSON.parse(key);
    for (let size = 2; size < words.length; size++) {
      for (let offset = 0; offset + size <= words.length; offset++) {
        const subKey = findEquivalentSubPhrase(stats, entry, words, offset, size);
        if (subKey) {
          subsumed.add(subKey);
        }
      }
    }
  }
  return subsumed;
}

/**
 * Fills in defaults for hint-mining options.
 * @param {object} [options]
 * @returns {{dedupe: string, maxN: number, minCount: number, minN: number, normalization: object, stopwords: string[], topK: number}}
 */
function resolveHintOptions(options) {
  return {
    dedupe: options?.dedupe ?? DEFAULT_HINT_OPTIONS.dedupe,
    maxN: options?.maxN ?? DEFAULT_HINT_OPTIONS.maxN,
    minCount: options?.minCount ?? DEFAULT_HINT_OPTIONS.minCount,
    minN: options?.minN ?? DEFAULT_HINT_OPTIONS.minN,
    normalization: { ...DEFAULT_NORMALIZATION, ...(options?.normalization ?? {}) },
    stopwords: options?.stopwords ?? [],
    topK: options?.topK ?? Infinity,
  };
}

/**
 * Counts every n-gram (minN..maxN words) of the normalized word list, skipping
 * n-grams that contain an empty word or consist only of stopwords.
 * @param {string[]} words
 * @param {{minN: number, maxN: number, stopwords: string[]}} options
 * @returns {Map<string, number>} JSON-encoded n-gram -> count
 */
function countNgrams(words, options) {
  const counts = new Map();
  for (let start = 0; start < words.length; start++) {
    for (let size = options.minN; size <= options.maxN && start + size <= words.length; size++) {
      const gram = words.slice(start, start + size);
      if (gram.some((word) => !word) || isAllStopwords(gram, options.stopwords)) {
        continue;
      }
      const key = JSON.stringify(gram);
      counts.set(key, (counts.get(key) ?? 0) + 1);
    }
  }
  return counts;
}

/**
 * Selects the n-gram keys occurring at least `minCount` times.
 * @param {Map<string, number>} counts
 * @param {number} minCount
 * @returns {Set<string>}
 */
function selectFrequentKeys(counts, minCount) {
  const frequent = new Set();
  for (const [key, count] of counts) {
    if (count >= minCount) {
      frequent.add(key);
    }
  }
  return frequent;
}

/**
 * Second pass over the tokens: for each frequent n-gram records its first
 * position, occurrence positions and original (un-normalized) surface forms.
 * @param {object[]} tokens
 * @param {string[]} normalizedWords
 * @param {{minN: number, maxN: number}} options
 * @param {Set<string>} frequentKeys
 * @param {Map<string, number>} counts
 * @returns {Map<string, object>}
 */
function collectPhraseStats(tokens, normalizedWords, options, frequentKeys, counts) {
  const stats = new Map();
  for (const key of frequentKeys) {
    stats.set(key, {
      count: counts.get(key),
      firstOccurrenceIndex: Infinity,
      occurrenceIndices: [],
      occurrencesTruncated: false,
      surfaceCounts: new Map(),
    });
  }

  for (let start = 0; start < normalizedWords.length; start++) {
    for (
      let size = options.minN;
      size <= options.maxN && start + size <= normalizedWords.length;
      size++
    ) {
      const gram = normalizedWords.slice(start, start + size);
      if (gram.some((word) => !word)) {
        continue;
      }
      const key = JSON.stringify(gram);
      if (!frequentKeys.has(key)) {
        continue;
      }
      const entry = stats.get(key);
      entry.firstOccurrenceIndex = Math.min(entry.firstOccurrenceIndex, start);
      recordOccurrence(entry, start);
      recordSurfaceForm(
        entry,
        tokens
          .slice(start, start + size)
          .map((token) => token.text)
          .join(' '),
      );
    }
  }
  return stats;
}

/**
 * Orders generated hints by count, then length, then normalized phrase.
 * @param {{count: number, length: number, normalizedPhrase: string}} left
 * @param {{count: number, length: number, normalizedPhrase: string}} right
 * @returns {number}
 */
function compareGeneratedHints(left, right) {
  return (
    right.count - left.count ||
    right.length - left.length ||
    left.normalizedPhrase.localeCompare(right.normalizedPhrase)
  );
}

/**
 * Converts phrase statistics into the public hint list, skipping excluded
 * keys and keeping at most `topK` entries.
 * @param {Map<string, object>} stats
 * @param {Set<string>} excludedKeys
 * @param {number} topK
 * @returns {object[]}
 */
function buildGeneratedHints(stats, excludedKeys, topK) {
  const hints = [];
  for (const [key, entry] of stats) {
    if (excludedKeys.has(key)) {
      continue;
    }
    const words = JSON.parse(key);
    const normalizedPhrase = words.join(' ');
    const forms = topSurfaceForms(entry.surfaceCounts, 3);
    hints.push({
      count: entry.count,
      firstOccurrenceIndex: Number.isFinite(entry.firstOccurrenceIndex)
        ? entry.firstOccurrenceIndex
        : undefined,
      length: words.length,
      normalizedPhrase,
      phrase: forms[0] ?? normalizedPhrase,
      topSurfaceForms: forms.length > 0 ? forms : undefined,
    });
  }
  hints.sort(compareGeneratedHints);
  return hints.slice(0, Math.max(0, topK));
}

/**
 * Mines repeated phrases (n-grams) from a token list.
 * @param {object[]} tokens
 * @param {object} [options] see {@link resolveHintOptions} for the defaults
 * @returns {object[]} hints sorted by count, length, then phrase
 */
function generateHintsFromTokens(tokens, options) {
  const resolved = resolveHintOptions(options);
  if (tokens.length === 0 || resolved.minN < 1 || resolved.maxN < resolved.minN) {
    return [];
  }

  const normalizedWords = tokens.map((token) =>
    normalizeTokenText(token.text, resolved.normalization),
  );
  const counts = countNgrams(normalizedWords, resolved);
  const frequentKeys = selectFrequentKeys(counts, resolved.minCount);
  if (frequentKeys.size === 0) {
    return [];
  }

  const stats = collectPhraseStats(tokens, normalizedWords, resolved, frequentKeys, counts);
  const excluded = resolved.dedupe === 'closed' ? findSubsumedPhrases(stats) : new Set();
  return buildGeneratedHints(stats, excluded, resolved.topK);
}

/**
 * Mines repeated phrases from segments. With `boundaryStrategy: 'none'` all
 * tokens are mined as one stream; otherwise (default `'segment'`) each segment
 * is mined separately and results are merged by normalized phrase.
 * @param {Array<{tokens: object[]}>} segments
 * @param {object} [options]
 * @returns {object[]}
 */
function generateHintsFromSegments(segments, options) {
  if ((options?.boundaryStrategy ?? 'segment') === 'none') {
    return generateHintsFromTokens(
      segments.flatMap((segment) => segment.tokens),
      options,
    );
  }

  const merged = new Map();
  for (const segment of segments) {
    for (const hint of generateHintsFromTokens(segment.tokens, options)) {
      const existing = merged.get(hint.normalizedPhrase);
      if (!existing) {
        merged.set(hint.normalizedPhrase, { ...hint });
        continue;
      }

      existing.count += hint.count;
      existing.length = Math.max(existing.length, hint.length);
      existing.topSurfaceForms = Array.from(
        new Set([...(existing.topSurfaceForms ?? []), ...(hint.topSurfaceForms ?? [])]),
      ).slice(0, 3);
      if (hint.firstOccurrenceIndex !== undefined) {
        existing.firstOccurrenceIndex =
          existing.firstOccurrenceIndex === undefined
            ? hint.firstOccurrenceIndex
            : Math.min(existing.firstOccurrenceIndex, hint.firstOccurrenceIndex);
      }
    }
  }
  return Array.from(merged.values()).sort(compareGeneratedHints);
}

export {
  applyGroundTruthToSegment,
  cleanupIsolatedTokens,
  createHints,
  estimateSegmentFromToken,
  formatSecondsToTimestamp,
  formatSegmentsToTimestampedTranscript,
  generateHintsFromSegments,
  generateHintsFromTokens,
  getFirstMatchingToken,
  getFirstTokenForSelection,
  groupMarkedTokensIntoSegments,
  isEndingWithPunctuation,
  mapSegmentsIntoFormattedSegments,
  markAndCombineSegments,
  markTokensWithDividers,
  mergeSegments,
  mergeShortSegmentsWithPrevious,
  normalizeTokenText,
  normalizeWord,
  splitSegment,
  tokenizeGroundTruth,
  updateSegmentWithGroundTruth,
};
//# sourceMappingURL=index.mjs.map