UNPKG

paragrafs

Version:

A lightweight TypeScript library designed to reconstruct paragraphs from AI transcriptions.

4 lines 5.77 kB
/** Marker placed in a token stream where a segment or line may be split. */
const SEGMENT_BREAK = "SEGMENT_BREAK";

/** Marker placed before a token run that matched a hint; formatting always flushes on it. */
const ALWAYS_BREAK = "ALWAYS_BREAK";

/** True when `item` is one of the two marker strings rather than a token object. */
function isBreakMarker(item) {
  return item === SEGMENT_BREAK || item === ALWAYS_BREAK;
}

/**
 * Tests whether a string ends with `.`, `؟`, `!` or `?`.
 *
 * @param {string} text
 * @returns {boolean}
 */
function isEndingWithPunctuation(text) {
  return /[.؟!?]$/.test(text);
}

/**
 * Formats a number of seconds as `m:ss`, or `h:mm:ss` once at least one full hour has elapsed.
 *
 * @param {number} seconds
 * @returns {string}
 */
function formatSecondsToTimestamp(seconds) {
  const hours = Math.floor(seconds / 3600);
  const minutes = Math.floor((seconds % 3600) / 60);
  const paddedSeconds = Math.floor(seconds % 60)
    .toString()
    .padStart(2, "0");

  if (hours > 0) {
    return `${hours}:${minutes.toString().padStart(2, "0")}:${paddedSeconds}`;
  }
  return `${minutes}:${paddedSeconds}`;
}

/**
 * Normalizes a word for comparison: decomposes it (NFD), removes combining marks
 * (`\p{Mn}` and the U+064B–U+065F range), strips leading/trailing punctuation and
 * symbols, then recomposes it (NFC).
 *
 * @param {string} word
 * @returns {string}
 */
function normalizeWord(word) {
  return word
    .normalize("NFD")
    .replace(/\p{Mn}/gu, "")
    .replace(/[\u064B-\u065F]/g, "")
    .replace(/^[\p{P}\p{S}]+|[\p{P}\p{S}]+$/gu, "")
    .normalize("NFC");
}

/**
 * Builds a lookup of hint phrases keyed by their first word. Each phrase is split on
 * single spaces and stored as an array of words.
 *
 * @param {...string} hints
 * @returns {Object<string, string[][]>}
 */
function createHints(...hints) {
  const hintMap = {};
  for (const hint of hints) {
    const words = hint.split(" ");
    const firstWord = words[0];
    // Only own entries count: a first word such as "constructor" must not resolve to
    // a member inherited from Object.prototype.
    let phrases = Object.hasOwn(hintMap, firstWord) ? hintMap[firstWord] : undefined;
    if (!phrases) {
      phrases = [];
      // defineProperty (rather than assignment) so that even "__proto__" becomes a
      // plain own data property.
      Object.defineProperty(hintMap, firstWord, {
        configurable: true,
        enumerable: true,
        value: phrases,
        writable: true,
      });
    }
    phrases.push(words);
  }
  return hintMap;
}

/**
 * Fills the dynamic-programming table for the longest common subsequence of two
 * string arrays. `table[i][j]` is the LCS length of `a.slice(0, i)` and `b.slice(0, j)`.
 */
function buildLcsTable(a, b) {
  const rows = a.length;
  const cols = b.length;
  const table = Array.from({ length: rows + 1 }, () => Array(cols + 1).fill(0));
  for (let i = 0; i < rows; i++) {
    for (let j = 0; j < cols; j++) {
      table[i + 1][j + 1] =
        a[i] === b[j] ? table[i][j] + 1 : Math.max(table[i][j + 1], table[i + 1][j]);
    }
  }
  return table;
}

/**
 * Walks the LCS table backwards and records every matched pair as
 * `index in a -> index in b`.
 */
function extractLcsMatches(table, a, b) {
  const matches = new Map();
  let i = a.length;
  let j = b.length;
  while (i > 0 && j > 0) {
    if (a[i - 1] === b[j - 1]) {
      matches.set(i - 1, j - 1);
      i--;
      j--;
    } else if (table[i - 1][j] >= table[i][j - 1]) {
      i--;
    } else {
      j--;
    }
  }
  return matches;
}

/**
 * Reports whether any hint phrase registered under the text of `tokens[index]`
 * matches the tokens starting at `index`, word for word.
 */
function isHintAt(tokens, hints, index) {
  const firstWord = tokens[index].text;
  // Own-property check: token text like "constructor" or "toString" would otherwise
  // pick up an inherited function and crash the iteration below.
  if (!Object.hasOwn(hints, firstWord)) {
    return false;
  }
  const phrases = hints[firstWord];
  if (!phrases) {
    return false;
  }
  for (const words of phrases) {
    if (index + words.length > tokens.length) {
      continue;
    }
    if (words.every((word, offset) => tokens[index + offset].text === word)) {
      return true;
    }
  }
  return false;
}

/**
 * Creates a token for a ground-truth word that has no counterpart token, giving it an
 * even share of the time between the previous token's end (0 when there is none) and
 * the next token's start.
 */
function estimateMissingToken(text, { gtGap, gtGapIndex, nextToken, prevToken, tokenGap }) {
  const gapStart = prevToken?.end ?? 0;
  const gapDuration = Math.max(0, nextToken.start - gapStart);
  const missingCount = gtGap.length - tokenGap.length;
  const slot = missingCount > 0 ? gapDuration / missingCount : 0;
  const start = gapStart + (gtGapIndex - tokenGap.length) * slot;
  return { end: start + slot, start, text };
}

/**
 * Computes `[tokenIndex, gtIndex]` anchor pairs from the LCS of the normalized words.
 * The first pair (0, 0) is always forced, and the last pair too when both sides have
 * more than one element. Pairs are sorted by token index, and a pair is dropped when
 * its ground-truth index does not exceed that of the preceding sorted pair.
 */
function buildAnchors(tokens, gtWords) {
  const tokenWords = tokens.map((token) => normalizeWord(token.text));
  const normalizedGt = gtWords.map((word) => normalizeWord(word));
  const matches = extractLcsMatches(
    buildLcsTable(tokenWords, normalizedGt),
    tokenWords,
    normalizedGt,
  );

  matches.set(0, 0);
  if (tokens.length > 1 && gtWords.length > 1) {
    matches.set(tokens.length - 1, gtWords.length - 1);
  }

  return Array.from(matches.entries())
    .sort((a, b) => a[0] - b[0])
    .filter((pair, i, sorted) => !i || pair[1] > sorted[i - 1][1]);
}

/**
 * Rebuilds the token list anchor by anchor. Between anchors, tokens and ground-truth
 * words are paired positionally; leftover tokens are flagged `isUnknown`, leftover
 * words receive estimated timings. Each anchor token takes its ground-truth text.
 */
function alignAtAnchors(tokens, gtWords, anchors) {
  const result = [];
  let lastTokenIndex = -1;
  let lastGtIndex = -1;

  for (const [tokenIndex, gtIndex] of anchors) {
    const tokenGap = tokens.slice(lastTokenIndex + 1, tokenIndex);
    const gtGap = gtWords.slice(lastGtIndex + 1, gtIndex);
    const paired = Math.min(tokenGap.length, gtGap.length);

    for (let k = 0; k < paired; k++) {
      result.push({ ...tokenGap[k], text: gtGap[k] });
    }
    for (let k = paired; k < tokenGap.length; k++) {
      result.push({ ...tokenGap[k], isUnknown: true });
    }
    for (let k = paired; k < gtGap.length; k++) {
      result.push(
        estimateMissingToken(gtGap[k], {
          gtGap,
          gtGapIndex: k,
          nextToken: tokens[tokenIndex],
          prevToken: lastTokenIndex === -1 ? null : tokens[lastTokenIndex],
          tokenGap,
        }),
      );
    }

    result.push({ ...tokens[tokenIndex], text: gtWords[gtIndex] });
    lastTokenIndex = tokenIndex;
    lastGtIndex = gtIndex;
  }

  return { lastGtIndex, lastTokenIndex, result };
}

/**
 * Appends whatever follows the last anchor to `result` (mutated in place). Tokens are
 * paired positionally with remaining words; tokens without a word are flagged
 * `isUnknown`; remaining words without a token are dropped.
 */
function appendTrailing(result, tokens, gtWords, lastTokenIndex, lastGtIndex) {
  const restTokens = tokens.slice(lastTokenIndex + 1);
  const restWords = gtWords.slice(lastGtIndex + 1);
  restTokens.forEach((token, i) => {
    result.push(
      i < restWords.length ? { ...token, text: restWords[i] } : { ...token, isUnknown: true },
    );
  });
}

/**
 * Aligns timed tokens with a ground-truth string and returns a new token list carrying
 * the ground-truth words. Returns `[]` for no tokens, and every token flagged
 * `isUnknown` when the ground truth contains no words.
 */
function syncTokensWithGroundTruth(tokens, groundTruth) {
  if (tokens.length === 0) {
    return [];
  }
  const gtWords = groundTruth.trim().match(/[\w\u0600-\u06FF]+[؟،.]?|\S+/g) || [];
  if (gtWords.length === 0) {
    return tokens.map((token) => ({ ...token, isUnknown: true }));
  }
  const anchors = buildAnchors(tokens, gtWords);
  const { lastGtIndex, lastTokenIndex, result } = alignAtAnchors(tokens, gtWords, anchors);
  appendTrailing(result, tokens, gtWords, lastTokenIndex, lastGtIndex);
  return result;
}

/**
 * Turns a single timed text span into a segment whose words share the span's duration
 * equally.
 *
 * @param {{end: number, start: number, text: string}} token
 * @returns {{end: number, start: number, text: string, tokens: Array<{end: number, start: number, text: string}>}}
 *   `text` is returned unchanged; `tokens` holds one entry per whitespace-separated word.
 */
function estimateSegmentFromToken({ end, start, text }) {
  // Trim first so surrounding whitespace does not yield empty-string words.
  const words = text.trim().split(/\s+/);
  const step = (end - start) / words.length;
  const tokens = words.map((word, i) => ({
    end: start + (i + 1) * step,
    start: start + i * step,
    text: word,
  }));
  return { end, start, text, tokens };
}

/**
 * Produces a stream of tokens interleaved with break markers.
 *
 * - A token whose text is listed in `fillers` is replaced by SEGMENT_BREAK.
 * - ALWAYS_BREAK is emitted before a token at which a hint phrase matches.
 * - SEGMENT_BREAK is emitted before a token that starts more than `gapThreshold`
 *   after the previous kept token ended, and after a token ending in punctuation.
 *
 * @param {Array<{end: number, start: number, text: string}>} tokens
 * @param {{fillers?: string[], gapThreshold: number, hints?: Object<string, string[][]>}} options
 * @returns {Array<object|string>}
 */
function markTokensWithDividers(tokens, { fillers = [], gapThreshold, hints }) {
  const marked = [];
  let previousEnd = null;

  for (let index = 0; index < tokens.length; index++) {
    const token = tokens[index];
    if (fillers.includes(token.text)) {
      marked.push(SEGMENT_BREAK);
      continue;
    }
    if (hints && isHintAt(tokens, hints, index)) {
      marked.push(ALWAYS_BREAK);
    }
    if (previousEnd !== null && token.start - previousEnd > gapThreshold) {
      marked.push(SEGMENT_BREAK);
    }
    marked.push(token);
    if (isEndingWithPunctuation(token.text)) {
      marked.push(SEGMENT_BREAK);
    }
    previousEnd = token.end;
  }
  return marked;
}

/**
 * Groups a marked stream into segments. A segment is closed once its tokens span more
 * than `maxSecondsPerSegment` and the next stream element is a break marker; that
 * marker then opens the following segment. Leftover items form a final segment only
 * if they include at least one token.
 *
 * @param {Array<object|string>} markedTokens
 * @param {number} maxSecondsPerSegment
 * @returns {Array<{end: number, start: number, tokens: Array<object|string>}>}
 */
function groupMarkedTokensIntoSegments(markedTokens, maxSecondsPerSegment) {
  const segments = [];
  let current = [];
  let start = null;
  let end = null;

  for (let index = 0; index < markedTokens.length; index++) {
    const item = markedTokens[index];
    if (!isBreakMarker(item)) {
      if (start === null) {
        start = item.start;
      }
      end = item.end;
    }
    current.push(item);

    const duration = start !== null && end !== null ? end - start : 0;
    if (duration > maxSecondsPerSegment && isBreakMarker(markedTokens[index + 1])) {
      segments.push({ end, start, tokens: current });
      current = [];
      start = null;
      end = null;
    }
  }

  if (current.length > 0 && start !== null && end !== null) {
    segments.push({ end, start, tokens: current });
  }
  return segments;
}

/**
 * Folds every segment holding fewer than `minWordsPerSegment` real tokens (markers are
 * not counted) into the segment before it. The first segment is always kept. Input
 * segments and their token arrays are left untouched.
 *
 * @param {Array<{end: number, start: number, tokens: Array<object|string>}>} segments
 * @param {number} minWordsPerSegment
 * @returns {Array<{end: number, start: number, tokens: Array<object|string>}>}
 */
function mergeShortSegmentsWithPrevious(segments, minWordsPerSegment) {
  const merged = [];
  for (const segment of segments) {
    const wordCount = segment.tokens.filter((item) => !isBreakMarker(item)).length;
    const previous = merged[merged.length - 1];
    if (wordCount < minWordsPerSegment && previous) {
      previous.tokens.push(...segment.tokens);
      previous.end = segment.end;
    } else {
      // Copy the token array so later merges never write into the caller's data.
      merged.push({ ...segment, tokens: [...segment.tokens] });
    }
  }
  return merged;
}

/**
 * Renders marked segments as text lines. A line is flushed at every ALWAYS_BREAK, at
 * the end of each segment, and at a SEGMENT_BREAK when the line already spans at least
 * `maxSecondsPerLine` and its last token ends with punctuation.
 *
 * @param {Array<{tokens: Array<object|string>}>} segments
 * @param {number} maxSecondsPerLine
 * @param {(line: {end: number, start: number, text: string}) => string} [formatTokens]
 *   Optional line formatter; without it a line is `<timestamp>: <text>`.
 * @returns {string} All lines joined into one string.
 */
function formatSegmentsToTimestampedTranscript(segments, maxSecondsPerLine, formatTokens) {
  const lines = [];

  for (const segment of segments) {
    let line = [];
    let lineStart = null;

    const flush = () => {
      if (line.length === 0) {
        return;
      }
      const text = line.map((token) => token.text).join(" ");
      lines.push(
        formatTokens
          ? formatTokens({ end: line.at(-1).end, start: line[0].start, text })
          : `${formatSecondsToTimestamp(line[0].start)}: ${text}`,
      );
      line = [];
      lineStart = null;
    };

    for (const item of segment.tokens) {
      if (item === ALWAYS_BREAK) {
        flush();
      } else if (item === SEGMENT_BREAK) {
        const lineEnd = line.length > 0 ? line[line.length - 1].end : null;
        const duration = lineStart !== null && lineEnd !== null ? lineEnd - lineStart : 0;
        if (
          duration >= maxSecondsPerLine &&
          line.length > 0 &&
          isEndingWithPunctuation(line[line.length - 1].text)
        ) {
          flush();
        }
      } else {
        if (lineStart === null) {
          lineStart = item.start;
        }
        line.push(item);
      }
    }
    flush();
  }

  // NOTE(review): the separator is a single space exactly as this file has it; confirm
  // it was not a line break that got collapsed when the bundle was minified/extracted.
  return lines.join(" ");
}

/**
 * Converts marked segments into plain segments: markers are removed from `tokens` and
 * `text` is rebuilt from lines. A line is flushed at every ALWAYS_BREAK; at a
 * SEGMENT_BREAK it is flushed unconditionally when `maxSecondsPerLine` is falsy,
 * otherwise only when the line spans more than `maxSecondsPerLine`.
 *
 * @param {Array<{end: number, start: number, tokens: Array<object|string>}>} segments
 * @param {number} [maxSecondsPerLine]
 * @returns {Array<{end: number, start: number, text: string, tokens: object[]}>}
 */
function mapSegmentsIntoFormattedSegments(segments, maxSecondsPerLine) {
  return segments.map((segment) => {
    const lines = [];
    const tokens = [];
    let line = [];
    let lineStart = null;

    const flush = () => {
      if (line.length > 0) {
        lines.push(line.map((token) => token.text).join(" "));
        line = [];
        lineStart = null;
      }
    };

    for (const item of segment.tokens) {
      if (item === ALWAYS_BREAK) {
        flush();
      } else if (item === SEGMENT_BREAK) {
        if (!maxSecondsPerLine) {
          flush();
        } else {
          const lineEnd = line.length > 0 ? line[line.length - 1].end : null;
          const duration = lineStart !== null && lineEnd !== null ? lineEnd - lineStart : 0;
          if (duration > maxSecondsPerLine) {
            flush();
          }
        }
      } else {
        if (lineStart === null) {
          lineStart = item.start;
        }
        line.push(item);
        tokens.push(item);
      }
    }
    flush();

    // NOTE(review): single-space separator kept exactly as found in this file; confirm
    // it was not originally a line break.
    return { end: segment.end, start: segment.start, text: lines.join(" "), tokens };
  });
}

/**
 * Drops SEGMENT_BREAK markers that are redundant: one directly followed by another
 * marker, one whose element two positions ahead is a marker or missing, or one that
 * would directly follow a SEGMENT_BREAK already kept. Everything else is kept.
 *
 * @param {Array<object|string>} markedTokens
 * @returns {Array<object|string>}
 */
function cleanupIsolatedTokens(markedTokens) {
  const cleaned = [];
  for (let index = 0; index < markedTokens.length; index++) {
    const item = markedTokens[index];
    if (item !== SEGMENT_BREAK) {
      cleaned.push(item);
      continue;
    }
    const next = markedTokens[index + 1];
    const afterNext = markedTokens[index + 2];
    const isRedundant =
      isBreakMarker(next) ||
      isBreakMarker(afterNext) ||
      !afterNext ||
      cleaned.at(-1) === SEGMENT_BREAK;
    if (!isRedundant) {
      cleaned.push(item);
    }
  }
  return cleaned;
}

/**
 * Full pipeline over existing segments: flattens their tokens, marks dividers, removes
 * redundant breaks, regroups by `maxSecondsPerSegment` and merges segments shorter
 * than `minWordsPerSegment` into their predecessor.
 *
 * @param {Array<{tokens: object[]}>} segments
 * @param {{fillers?: string[], gapThreshold: number, hints?: object, maxSecondsPerSegment: number, minWordsPerSegment: number}} options
 * @returns {Array<{end: number, start: number, tokens: Array<object|string>}>}
 */
function markAndCombineSegments(segments, options) {
  const allTokens = segments.flatMap((segment) => segment.tokens);
  const marked = markTokensWithDividers(allTokens, {
    fillers: options.fillers,
    gapThreshold: options.gapThreshold,
    ...(options.hints && { hints: options.hints }),
  });
  const grouped = groupMarkedTokensIntoSegments(
    cleanupIsolatedTokens(marked),
    options.maxSecondsPerSegment,
  );
  return mergeShortSegmentsWithPrevious(grouped, options.minWordsPerSegment);
}

/**
 * Returns a segment with the same `start`/`end`, `text` set to the ground truth, and
 * tokens re-aligned to it. Tokens that could not be paired carry `isUnknown: true`.
 *
 * @param {{end: number, start: number, tokens: object[]}} segment
 * @param {string} groundTruth
 */
function updateSegmentWithGroundTruth(segment, groundTruth) {
  return {
    end: segment.end,
    start: segment.start,
    text: groundTruth,
    tokens: syncTokensWithGroundTruth(segment.tokens, groundTruth),
  };
}

/**
 * Same as `updateSegmentWithGroundTruth`, with the `isUnknown` tokens filtered out.
 *
 * @param {{end: number, start: number, tokens: object[]}} segment
 * @param {string} groundTruth
 */
function applyGroundTruthToSegment(segment, groundTruth) {
  const updated = updateSegmentWithGroundTruth(segment, groundTruth);
  return { ...updated, tokens: updated.tokens.filter((token) => !token.isUnknown) };
}

/**
 * Concatenates segments into one: texts joined with `delimiter`, tokens flattened,
 * `start` from the first segment and `end` from the last. Throws a TypeError when
 * `segments` is empty, since the first and last elements are dereferenced.
 *
 * @param {Array<{end: number, start: number, text: string, tokens: object[]}>} segments
 * @param {string} [delimiter=" "]
 */
function mergeSegments(segments, delimiter = " ") {
  const text = segments.map((segment) => segment.text).join(delimiter);
  const tokens = segments.flatMap((segment) => segment.tokens);
  return { end: segments.at(-1).end, start: segments[0].start, text, tokens };
}

/**
 * Splits a segment in two at `splitTime`: tokens starting before it go to the first
 * half, the others to the second. Throws a TypeError when either half would have no
 * tokens, since the boundary tokens are dereferenced.
 *
 * @param {{end: number, start: number, tokens: Array<{end: number, start: number, text: string}>}} segment
 * @param {number} splitTime
 * @returns {[object, object]}
 */
function splitSegment(segment, splitTime) {
  const before = segment.tokens.filter((token) => token.start < splitTime);
  const after = segment.tokens.filter((token) => token.start >= splitTime);
  const joinText = (tokens) => tokens.map((token) => token.text).join(" ");
  return [
    { end: before.at(-1).end, start: segment.start, text: joinText(before), tokens: before },
    { end: segment.end, start: after[0].start, text: joinText(after), tokens: after },
  ];
}

/**
 * Finds the first token at which the phrase `hint` matches word for word.
 *
 * @param {Array<{text: string}>} tokens
 * @param {string} hint
 * @returns {object|null} The token where the phrase begins, or `null`.
 */
function getFirstMatchingToken(tokens, hint) {
  const hints = createHints(hint);
  for (let index = 0; index < tokens.length; index++) {
    if (isHintAt(tokens, hints, index)) {
      return tokens[index];
    }
  }
  return null;
}

/**
 * Locates each token's text inside `segment.text` (searching left to right) and
 * returns the first token whose character range fully contains the selection.
 *
 * @param {{text: string, tokens: Array<{text: string}>}} segment
 * @param {number} selectionStart
 * @param {number} selectionEnd
 * @returns {object|null}
 */
function getFirstTokenForSelection(segment, selectionStart, selectionEnd) {
  const { text, tokens } = segment;
  let searchFrom = 0;
  for (const token of tokens) {
    const tokenStart = text.indexOf(token.text, searchFrom);
    if (tokenStart === -1) {
      continue;
    }
    const tokenEnd = tokenStart + token.text.length;
    searchFrom = tokenEnd + 1;
    if (selectionStart >= tokenStart && selectionEnd <= tokenEnd) {
      return token;
    }
  }
  return null;
}

export {
  applyGroundTruthToSegment,
  cleanupIsolatedTokens,
  createHints,
  estimateSegmentFromToken,
  formatSecondsToTimestamp,
  formatSegmentsToTimestampedTranscript,
  getFirstMatchingToken,
  getFirstTokenForSelection,
  groupMarkedTokensIntoSegments,
  isEndingWithPunctuation,
  mapSegmentsIntoFormattedSegments,
  markAndCombineSegments,
  markTokensWithDividers,
  mergeSegments,
  mergeShortSegmentsWithPrevious,
  normalizeWord,
  splitSegment,
  updateSegmentWithGroundTruth,
};