@doc-tools/sentenizer
Version:
text segmentation into sentences
3 lines (2 loc) • 6.79 kB
JavaScript
import {
  all, allPass, always, anyPass, call, compose, defaultTo, equals, filter,
  identity, invoker, join, juxt, length, lensIndex, map, match, not, replace,
  split, toLower, toUpper, trim, uniq, view, zipWith,
} from "ramda";

// ---------------------------------------------------------------------------
// Character classes (spliced into RegExp sources below)
// ---------------------------------------------------------------------------

const SENTENCE_END_MARKERS = ".?!\u2026";
const QUOTATION_GENERIC_MARKERS = `"\u201E'`;
const QUOTATION_CLOSE_MARKERS = "\xBB\u201D\u2019";
const BRACKETS_CLOSE_MARKERS = "\\)\\]\\}>";

// ---------------------------------------------------------------------------
// Small accessors
// ---------------------------------------------------------------------------

const firstLens = () => lensIndex(0);
const secondLens = () => lensIndex(1);

/** Element 0 of a list (or `""` when it is missing). */
const first = compose(defaultTo(""), view(firstLens()));
/** Element 1 of a list (or `""` when it is missing). */
const second = compose(defaultTo(""), view(secondLens()));

// ---------------------------------------------------------------------------
// Lexical helpers: every helper takes a string and returns a string
// ---------------------------------------------------------------------------

const CHUNK_RE = new RegExp(
  `([^${SENTENCE_END_MARKERS}]*?[${SENTENCE_END_MARKERS}]+)`,
  "gmu",
);

/**
 * Cuts text after every run of sentence-end markers. Because the pattern is a
 * capturing group, `split` keeps the matched chunks; the empty strings it
 * produces between them are filtered out.
 */
const splitIntoChunks = compose(filter(Boolean), split(CHUNK_RE));

const TRAILING_MARKERS_RE = new RegExp(`([${SENTENCE_END_MARKERS}]+)$`, "gmu");

/** Removes sentence-end markers found at the end of the string (or of a line). */
const stripTrailingMarkers = compose(
  defaultTo(""),
  replace(TRAILING_MARKERS_RE, ""),
);

/** First whitespace-delimited token, trailing punctuation included. */
const firstToken = compose(second, match(/^\s*([^\s]+?)(?=\s|$)/mu));
/** First token with trailing sentence-end markers stripped beforehand. */
const firstWord = compose(firstToken, stripTrailingMarkers);

/** Last whitespace-delimited token, trailing punctuation included. */
const lastToken = compose(second, match(/([^\s]+)\s*$/mu));
/** Last token with trailing sentence-end markers stripped beforehand. */
const lastWord = compose(lastToken, stripTrailingMarkers);

/**
 * Builds a function returning up to `size` leading characters.
 * @param {number} [size=10]
 */
const firstChars = (size = 10) =>
  compose(first, match(new RegExp(`^.{0,${size}}`, "gmu")));

/**
 * Builds a function returning up to `size` trailing characters.
 * NOTE(review): the pattern carries the `m` flag and only the first match is
 * used, so for a chunk containing line breaks the result comes from the first
 * line rather than from the very end of the chunk - confirm this is intended.
 * @param {number} [size=10]
 */
const lastChars = (size = 10) =>
  compose(first, match(new RegExp(`.{0,${size}}$`, "gmu")));

const spacePrefix = compose(first, match(/^\s/gmu));
const spaceSuffix = compose(first, match(/\s$/mu));
const quotationGenericPrefix = compose(
  second,
  match(new RegExp(`^([${QUOTATION_GENERIC_MARKERS}]+)`, "mu")),
);
const quotationClosePrefix = compose(
  second,
  match(new RegExp(`^([${QUOTATION_CLOSE_MARKERS}]+)`, "mu")),
);
const delimiterPrefix = compose(
  second,
  match(new RegExp(`^([${SENTENCE_END_MARKERS}]+)`, "mu")),
);
const bracketsClosePrefix = compose(
  second,
  match(new RegExp(`^([${BRACKETS_CLOSE_MARKERS}]+)`, "mu")),
);
const onlySpaces = compose(first, match(/^(\s+)$/gmu));
/** A single trailing "." that is not preceded by another "." (so not "..."). */
const dotSuffix = compose(second, match(/[^.](\.)$/mu));

// ---------------------------------------------------------------------------
// Generic predicates
// ---------------------------------------------------------------------------

const lengthAtMost = (limit) => compose((size) => size <= limit, length);
/** True when the list holds at most one distinct value. */
const allEqual = compose(lengthAtMost(1), uniq);
const notEmpty = compose(Boolean, length);

const firstChar = invoker(1, "charAt")(0);
/** True when lower- and upper-casing give the same string (no cased letters). */
const isCaseless = compose(allEqual, juxt([toLower, toUpper]));
const isCased = compose(not, isCaseless);
/** First character is a cased letter and is already lower case. */
const startsWithLowercase = allPass([
  compose(isCased, firstChar),
  compose(allEqual, juxt([identity, toLower]), firstChar),
]);
/** String equals its upper-cased form (also true for strings without letters). */
const isUppercase = compose(allEqual, juxt([toUpper, identity]));

// ---------------------------------------------------------------------------
// Join rules. Each rule receives `[left, right]` - the tail of the sentence
// being accumulated and the head of the next chunk - and returns true when
// the two must NOT be split.
// ---------------------------------------------------------------------------

/** Applies one check to `left`, another to `right`; true when both hold. */
const checkPair = (leftCheck, rightCheck) =>
  compose(all(Boolean), zipWith(call, [leftCheck, rightCheck]));

const endsWithSpace = compose(notEmpty, spaceSuffix);
const startsWithSpace = compose(notEmpty, spacePrefix);

const spaceBothSides = compose(
  checkPair(endsWithSpace, startsWithSpace),
  map(stripTrailingMarkers),
);
const rightLacksSpacePrefix = compose(
  checkPair(always(true), compose(not, startsWithSpace)),
  map(stripTrailingMarkers),
);
const rightStartsWithLowercase = checkPair(
  always(true),
  compose(startsWithLowercase, firstToken),
);
const rightDelimiterPrefix = checkPair(
  always(true),
  compose(notEmpty, delimiterPrefix, firstToken),
);
const rightQuotationGenericPrefix = checkPair(
  always(true),
  compose(notEmpty, quotationGenericPrefix),
);
const rightQuotationClosePrefix = checkPair(
  always(true),
  compose(notEmpty, quotationClosePrefix, firstToken),
);
const rightBracketsClosePrefix = checkPair(
  always(true),
  compose(notEmpty, bracketsClosePrefix, firstToken),
);
const rightOnlySpaces = checkPair(always(true), compose(notEmpty, onlySpaces));

/** `left` ends with a one-letter, upper-case word followed by a single dot. */
const endsWithInitial = allPass([
  compose(notEmpty, dotSuffix),
  compose(equals(1), length, lastWord),
  compose(isUppercase, lastWord),
  compose(isCased, lastWord),
]);
const leftInitials = checkPair(endsWithInitial, always(true));

// ---------------------------------------------------------------------------
// Abbreviation dictionaries. Keys are lower-case abbreviations written without
// the trailing dot (mostly Cyrillic, kept as \u escapes). The split into
// several objects is organisational only: every single-word dictionary is
// consulted the same way, and so is every pair dictionary.
// ---------------------------------------------------------------------------

const INITIAL_ABBREVIATIONS = {
  \u0434\u0436: true, ed: true, \u044D\u0434: true, \u0432\u0441: true, md: true, \u043C\u0434: true,
};

const HEAD_ABBREVIATIONS = {
  \u0431\u0443\u043A\u0432: true, \u0441\u0442: true, \u0442\u0440\u0430\u0434: true, \u043B\u0430\u0442: true,
  \u0432\u0435\u043D\u0433: true, \u0438\u0441\u043F: true, \u043A\u0430\u0442: true, \u0443\u043A\u0440: true,
  \u043D\u0435\u043C: true, \u0430\u043D\u0433\u043B: true, \u0444\u0440: true, \u0438\u0442\u0430\u043B: true,
  \u0433\u0440\u0435\u0447: true, \u0435\u0432\u0440: true, \u0430\u0440\u0430\u0431: true, \u044F\u043F: true,
  \u0441\u043B\u0430\u0432: true, \u043A\u0438\u0442: true, \u0440\u0443\u0441: true, \u0440\u0443\u0441\u0441\u043A: true,
  \u043B\u0430\u0442\u0432: true, \u0441\u043B\u043E\u0432\u0430\u0446\u043A: true, \u0445\u043E\u0440\u0432: true,
  mr: true, mrs: true, ms: true, dr: true, vs: true,
  \u0441\u0432: true, \u0430\u0440\u0445: true, \u0437\u0430\u0432: true, \u0437\u0430\u043C: true,
  \u043F\u0440\u043E\u0444: true, \u0430\u043A\u0430\u0434: true, \u043A\u043D: true, \u043A\u043E\u0440\u0440: true,
  \u0440\u0435\u0434: true, \u0433\u0440: true, \u0441\u0440: true, \u0447\u043B: true,
  \u0438\u043C: true, \u0442\u043E\u0432: true, \u043D\u0430\u0447: true, \u043F\u043E\u043B: true,
  chap: true, \u043F: true, \u043F\u043F: true, \u0447: true, \u0447\u0447: true,
  \u0433\u043B: true, \u0430\u0431\u0437: true, \u043F\u0442: true, no: true,
  \u043F\u0440\u043E\u0441\u043F: true, \u043F\u0440: true, \u0443\u043B: true, \u0448: true,
  \u0433: true, \u0433\u043E\u0440: true, \u0434: true, \u0441\u0442\u0440: true,
  \u043A: true, \u043A\u043E\u0440\u043F: true, \u043F\u0435\u0440: true, \u043E\u0431\u043B: true,
  \u044D\u0442: true, \u043F\u043E\u043C: true, \u0430\u0443\u0434: true, \u043E\u0444: true,
  \u043A\u043E\u043C: true, \u043A\u043E\u043C\u043D: true, \u043A\u0430\u0431: true,
  \u0434\u043E\u043C\u043E\u0432\u043B\u0430\u0434: true, \u043B\u0438\u0442: true, \u0442: true,
  \u0440\u043F: true, \u043F\u043E\u0441: true, \u0441: true, \u0445: true, \u043F\u043B: true,
  bd: true, \u043E: true, \u043E\u0437: true, \u0440: true, \u0430: true,
  \u043E\u0431\u0440: true, \u0443\u043C: true, \u043E\u043A: true, \u043E\u0442\u043A\u0440: true,
  \u043F\u0441: true, ps: true, upd: true, \u0441\u043C: true,
  \u043D\u0430\u043F\u0440: true, \u0434\u043E\u043F: true, \u044E\u0440: true, \u0444\u0438\u0437: true,
  \u0442\u0435\u043B: true, \u0441\u0431: true, \u0432\u043D\u0443\u0442\u0440: true, \u0434\u0438\u0444\u0444: true,
  \u0433\u043E\u0441: true, \u043E\u0442\u043C: true,
};

const TAIL_ABBREVIATIONS = {
  \u0434\u0435\u0441: true, \u0442\u044B\u0441: true, \u043C\u043B\u043D: true, \u043C\u043B\u0440\u0434: true,
  \u0434\u043E\u043B: true, \u0434\u043E\u043B\u043B: true, \u043A\u043E\u043F: true, \u0440\u0443\u0431: true,
  \u0440: true, \u043F\u0440\u043E\u0446: true, \u0433\u0430: true, \u0431\u0430\u0440\u0440: true,
  \u043A\u0443\u0431: true, \u043A\u0432: true, \u043A\u043C: true, \u0441\u043C: true,
  \u0447\u0430\u0441: true, \u043C\u0438\u043D: true, \u0441\u0435\u043A: true,
  \u0432: true, \u0432\u0432: true, \u0433: true, \u0433\u0433: true, \u0441: true,
  \u0441\u0442\u0440: true, co: true, corp: true, inc: true,
  \u0438\u0437\u0434: true, ed: true, \u0434\u0440: true, al: true,
};

const OTHER_ABBREVIATIONS = {
  \u0441\u043E\u043A\u0440: true, \u0440\u0438\u0441: true, \u0438\u0441\u043A\u043B: true, \u043F\u0440\u0438\u043C: true,
  \u044F\u0437: true, \u0443\u0441\u0442\u0430\u0440: true, \u0448\u0443\u0442\u043B: true,
};

const HEAD_PAIR_ABBREVIATIONS = {
  "\u0442.\u0435": true, "\u0442.\u043A": true, "\u0438.\u043E": true, "\u043A.\u043D": true,
  "\u043A.\u043F": true, "\u043F.\u043D": true, "\u043A.\u0442": true, "\u0442.\u043D": true,
  "\u043B.\u0434": true,
};

const TAIL_PAIR_ABBREVIATIONS = {
  "\u0442.\u043F": true, "\u0447.\u0442": true, "\u0442.\u0434": true, "\u0443.\u0435": true,
  "\u043D.\u044D": true, "p.m": true, "a.m": true, "\u0441.\u0433": true,
  "\u0440.\u0445": true, "\u0441.\u0448": true, "\u0437.\u0434": true, "\u043B.\u0441": true,
};

const OTHER_PAIR_ABBREVIATIONS = {
  "\u0435\u0434.\u0447": true, "\u043C\u043D.\u0447": true,
  "\u043F\u043E\u0432\u0435\u043B.\u043D\u0430\u043A\u043B": true,
  "\u0436\u0435\u043D.\u0440": true, "\u043C\u0443\u0436.\u0440": true,
};

/**
 * Builds a membership test for a dictionary object.
 *
 * Only OWN keys count. A plain property read (`dictionary[word]`) would also
 * find members inherited from Object.prototype, so a sentence ending in
 * "constructor." would be mistaken for one ending in an abbreviation and be
 * glued to the following sentence.
 */
const isListedIn = (dictionary) => (word) =>
  Object.prototype.hasOwnProperty.call(dictionary, word);

// ---------------------------------------------------------------------------
// Abbreviation rules
// ---------------------------------------------------------------------------

const hasDotSuffix = compose(notEmpty, dotSuffix);
const joinLowercased = compose(toLower, join("."));

const isPairAbbreviation = anyPass([
  isListedIn(HEAD_PAIR_ABBREVIATIONS),
  isListedIn(TAIL_PAIR_ABBREVIATIONS),
  isListedIn(OTHER_PAIR_ABBREVIATIONS),
]);

/**
 * Takes `[before, after]`, joins the last word of `before` and the first word
 * of `after` with a dot, and looks the result up in the pair dictionaries.
 */
const formsPairAbbreviation = compose(
  isPairAbbreviation,
  joinLowercased,
  zipWith(call, [
    compose(lastWord, lastToken),
    compose(firstWord, firstToken),
  ]),
);

/** `left` ends with "." and left-tail + right-head form a pair abbreviation. */
const leftPairsTailAbbreviation = allPass([
  compose(hasDotSuffix, lastToken, first),
  formsPairAbbreviation,
]);

const isWordAbbreviation = anyPass([
  isListedIn(INITIAL_ABBREVIATIONS),
  isListedIn(HEAD_ABBREVIATIONS),
  isListedIn(TAIL_ABBREVIATIONS),
  isListedIn(OTHER_ABBREVIATIONS),
]);

/** `left` ends with a listed single-word abbreviation followed by ".". */
const leftAbbreviation = compose(
  allPass([
    compose(hasDotSuffix, lastToken),
    compose(isWordAbbreviation, toLower, lastWord, lastToken),
  ]),
  first,
);

const rightLowercaseOrUppercase = compose(
  anyPass([startsWithLowercase, isUppercase]),
  firstWord,
  second,
);

/** Text preceding the first occurrence of `needle` (empty when not found). */
const textBefore = (text) => (needle) =>
  text.slice(0, Math.max(text.indexOf(needle), 0));

/**
 * True when `left` itself ends with a pair abbreviation, written either as two
 * tokens ("a. b.") or as a single dotted token ("a.b.").
 */
const endsWithPairAbbreviation = (left) => {
  const previousWord = compose(
    stripTrailingMarkers,
    lastWord,
    textBefore(left),
    lastWord,
    lastToken,
  );
  const final = lastWord(left);
  return (
    formsPairAbbreviation([previousWord(left), final]) ||
    formsPairAbbreviation(final.split("."))
  );
};

const leftPairsAbbreviation = allPass([
  compose(hasDotSuffix, lastToken, first),
  compose(endsWithPairAbbreviation, first),
  rightLowercaseOrUppercase,
]);

// ---------------------------------------------------------------------------
// Segmentation
// ---------------------------------------------------------------------------

// Rules only look at a 20-character window on each side of the boundary.
const leftWindow = lastChars(20);
const rightWindow = firstChars(20);

const matchesAnyJoinRule = anyPass([
  spaceBothSides,
  rightLacksSpacePrefix,
  rightStartsWithLowercase,
  rightDelimiterPrefix,
  rightQuotationGenericPrefix,
  rightQuotationClosePrefix,
  rightBracketsClosePrefix,
  rightOnlySpaces,
  leftInitials,
  leftAbbreviation,
  leftPairsTailAbbreviation,
  leftPairsAbbreviation,
]);

/** Takes `[accumulated, nextChunk]`; true when they belong to one sentence. */
const shouldJoin = compose(
  matchesAnyJoinRule,
  zipWith(call, [leftWindow, rightWindow]),
);

/**
 * Splits text into raw chunks and glues neighbouring chunks back together
 * whenever a join rule matches the boundary between them.
 * @param {string} text
 * @returns {string[]} untrimmed sentences
 */
function mergeChunks(text) {
  const sentences = [];
  let current = null;

  for (const chunk of splitIntoChunks(text)) {
    if (!current) {
      current = chunk;
      continue;
    }
    if (shouldJoin([current, chunk])) {
      current += chunk;
    } else {
      sentences.push(current);
      current = chunk;
    }
  }

  if (current) {
    sentences.push(current);
  }
  return sentences;
}

/**
 * Segments text into sentences.
 * @param {string} text
 * @returns {string[]} sentences with surrounding whitespace trimmed
 */
const sentenize = compose(map(trim), mergeChunks);

export { sentenize };
//# sourceMappingURL=index.js.map