string-fix-broken-named-entities
Version:
Finds and fixes common and not so common broken named HTML entities, returns ranges array of fixes
13 lines (11 loc) • 11.1 kB
JavaScript
/**
* @name string-fix-broken-named-entities
* @fileoverview Finds and fixes common and not so common broken named HTML entities, returns ranges array of fixes
* @version 7.0.20
* @author Roy Revelt, Codsen Ltd
* @license MIT
* {@link https://codsen.com/os/string-fix-broken-named-entities/}
*/
import X from "leven";
import {
  allNamedEntitiesSetOnly as w,
  allNamedEntitiesSetOnlyCaseInsensitive as K,
  entStartsWith as x,
  entEndsWith as q,
  brokenNamedEntities as J,
  decode as C,
  maxLength as M,
  uncertain as S,
} from "all-named-html-entities";
import { left as T, right as N, rightSeq as k, leftSeq as ee } from "string-left-right";
import { isPlainObject as te, isNumberChar as ne, hasOwnProp as O } from "codsen-utils";
import { right as P } from "string-left-right";
import { isNumberChar as B } from "codsen-utils";

/**
 * Tells whether `e` is a single character that is a lowercase ASCII letter,
 * an uppercase ASCII letter, an ASCII digit or a hash (`#`).
 *
 * @param {*} e - value to test
 * @returns {boolean} true only for a one-character string of the kinds above
 */
function I(e) {
  if (typeof e !== "string" || e.length !== 1) {
    return false;
  }
  const code = e.charCodeAt(0);
  return (
    (code > 96 && code < 123) || // a-z
    (code > 47 && code < 58) || // 0-9
    (code > 64 && code < 91) || // A-Z
    code === 35 // #
  );
}

/**
 * Tells whether the first character of `e` is an ASCII letter.
 * Unlike `I`, the length of the string is not checked.
 *
 * @param {*} e - value to test
 * @returns {boolean} true when `e` is a string starting with a-z or A-Z
 */
function G(e) {
  if (typeof e !== "string") {
    return false;
  }
  const code = e.charCodeAt(0);
  return (code > 96 && code < 123) || (code > 64 && code < 91);
}

/**
 * Tallies the characters of `e` between index `d` (inclusive) and index `h`
 * (exclusive) and guesses whether they look like a numeric entity body.
 *
 * Note: whitespace characters are counted both in `whitespaceCount` and in
 * `othersCount`, because the classification below only knows letters, digits
 * and `#`.
 *
 * @param {string} e - source string
 * @param {number} d - index to start from
 * @param {number} h - index to stop before
 * @returns {{probablyNumeric: (false|"deci"|"hexi"), lettersCount: number,
 *   numbersCount: number, numbersValue: string, hashesCount: number,
 *   othersCount: number, charTrimmed: string, whitespaceCount: number}}
 */
function U(e, d, h) {
  let lettersCount = 0;
  let numbersCount = 0;
  let othersCount = 0;
  let hashesCount = 0;
  let whitespaceCount = 0;
  let numbersValue = "";
  let charTrimmed = "";

  for (let y = d; y < h; y++) {
    if (e[y].trim().length) {
      charTrimmed += e[y];
    } else {
      whitespaceCount += 1;
    }

    if (G(e[y])) {
      lettersCount += 1;
    } else if (B(e[y])) {
      numbersCount += 1;
      numbersValue += String(e[y]);
    } else if (e[y] === "#") {
      hashesCount += 1;
    } else {
      othersCount += 1;
    }
  }

  let probablyNumeric = false;
  if (!lettersCount && numbersCount > othersCount) {
    probablyNumeric = "deci";
  } else if (
    (numbersCount || lettersCount) &&
    ((charTrimmed[0] === "#" &&
      charTrimmed[1].toLowerCase() === "x" &&
      (B(charTrimmed[2]) || G(charTrimmed[2]))) ||
      (charTrimmed[0].toLowerCase() === "x" && numbersCount && !othersCount))
  ) {
    probablyNumeric = "hexi";
  }

  return {
    probablyNumeric,
    lettersCount,
    numbersCount,
    numbersValue,
    hashesCount,
    othersCount,
    charTrimmed,
    whitespaceCount,
  };
}

/**
 * Picks the candidate with the longest `tempEnt`; on a tie the earlier
 * candidate wins. Anything that is not a non-empty array is returned as is.
 *
 * @param {*} e - array of `{tempEnt, tempRes}` candidates
 * @returns {*} the chosen candidate, or the input itself
 */
function z(e) {
  if (!Array.isArray(e) || !e.length) {
    return e;
  }
  if (e.length === 1) {
    return e[0];
  }
  return e.reduce((longest, candidate) =>
    candidate.tempEnt.length > longest.tempEnt.length ? candidate : longest,
  );
}

/**
 * Chooses one entity candidate out of `d`.
 *
 * If the candidates are a mix of "gapped" ones (non-empty `tempRes.gaps`)
 * and gapless ones, only the gapless are considered. Before that check,
 * when some candidates are followed by a semicolon and some are not, only
 * the semicolon-followed ones are kept. When there is no gapped/gapless mix,
 * the longest of the ORIGINAL candidates `d` is returned.
 *
 * @param {string} e - source string
 * @param {Array<{tempEnt: string, tempRes: object}>} d - candidates
 * @returns {*} result of `z()` on the chosen set
 * @throws {Error} when not called with exactly two arguments
 */
function j(e, d) {
  if (arguments.length !== 2) {
    throw new Error("removeGappedFromMixedCases(): wrong amount of inputs!");
  }

  if (Array.isArray(d) && d.length) {
    let candidates = Array.from(d);
    const followedBySemi = (n) => e[P(e, n.tempRes.rightmostChar)] === ";";

    if (
      candidates.length > 1 &&
      candidates.some(followedBySemi) &&
      candidates.some((n) => e[P(e, n.tempRes.rightmostChar)] !== ";")
    ) {
      candidates = candidates.filter(followedBySemi);
    }

    const allGapless = () =>
      candidates.every(
        (n) =>
          !n?.tempRes?.gaps ||
          !Array.isArray(n.tempRes.gaps) ||
          !n.tempRes.gaps.length,
      );
    const allGapped = () =>
      candidates.every(
        (n) =>
          n?.tempRes?.gaps &&
          Array.isArray(n.tempRes.gaps) &&
          n.tempRes.gaps.length,
      );

    if (!(allGapless() || allGapped())) {
      return z(
        candidates.filter(
          (n) =>
            !n.tempRes.gaps ||
            !Array.isArray(n.tempRes.gaps) ||
            !n.tempRes.gaps.length,
        ),
      );
    }
  }

  return z(d);
}

/**
 * Turns a parsed numeric-entity value into a string.
 *
 * `String.fromCharCode` keeps only the low 16 bits of its argument, so code
 * points above 0xFFFF must go through `String.fromCodePoint`. That function
 * throws a RangeError on NaN or out-of-range input, hence the guard; for
 * such input the result of `String.fromCharCode` is returned instead.
 *
 * @param {number} codePoint - result of `parseInt()` on the entity digits
 * @returns {string}
 */
function codePointToString(codePoint) {
  return Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff
    ? String.fromCodePoint(codePoint)
    : String.fromCharCode(codePoint);
}

const Q = "7.0.20";

// exported as `version`
const de = Q;

// exported as `allRules`: every rule name that `fixEnt` can put into `ruleName`
const he = [...w]
  .map((e) => `bad-html-entity-malformed-${e}`)
  .concat([...w].map((e) => `bad-html-entity-encoded-${e}`))
  .concat([
    "bad-html-entity-unrecognised",
    "bad-html-entity-multiple-encoding",
    "bad-html-entity-encoded-numeric",
    "bad-html-entity-malformed-numeric",
    "bad-html-entity-other",
  ]);

/**
 * Scans a string for broken, unrecognised, multiple-encoded and (optionally)
 * correctly encoded HTML entities. Exported as `fixEnt`.
 *
 * @param {string} e - the string to scan
 * @param {object} [d] - options, merged over the defaults:
 *   - `decode` (default `false`): when truthy, healthy entities are reported
 *     too (`bad-html-entity-encoded-*`) and the default `cb` emits decoded
 *     values instead of encoded ones;
 *   - `cb`: maps each finding object
 *     `{ruleName, entityName, rangeFrom, rangeTo, rangeValEncoded, rangeValDecoded}`
 *     to the returned value; the default yields `[from, to, value]`, or
 *     `[from, to]` when there is no replacement value. If falsy, raw finding
 *     objects are returned;
 *   - `textAmpersandCatcherCb(index)`: called with indexes of ampersands
 *     that were not consumed by a reported entity;
 *   - `progressFn(percentage)`: called when the integer percentage changes;
 *   - `entityCatcherCb(from, to)`: called for some entities that need no fix.
 * @returns {Array} empty array when nothing was found; otherwise the findings
 *   (minus those lying inside another finding), passed through `cb` if set
 * @throws {Error} THROW_ID_01 when `e` is not a string, THROW_ID_02 when `d`
 *   is truthy but not a plain object
 * @throws {TypeError} THROW_ID_03..06 when a callback option is truthy but
 *   not a function
 */
function pe(e, d) {
  if (typeof e !== "string") {
    throw new Error(`string-fix-broken-named-entities: [THROW_ID_01] the first input argument must be string! It was given as:
${JSON.stringify(e, null, 4)} (${typeof e}-type)`);
  }

  const h = {
    decode: false,
    cb: ({ rangeFrom: t, rangeTo: p, rangeValEncoded: o, rangeValDecoded: f }) =>
      f || o ? [t, p, d?.decode ? f : o] : [t, p],
    textAmpersandCatcherCb: null,
    progressFn: null,
    entityCatcherCb: null,
  };

  if (d && !te(d)) {
    throw new Error(`string-fix-broken-named-entities: [THROW_ID_02] the second input argument must be a plain object! I was given as:
${JSON.stringify(d, null, 4)} (${typeof d}-type)`);
  }

  // resolved options
  const n = { ...h, ...d };

  if (n.cb && typeof n.cb !== "function") {
    throw new TypeError(`string-fix-broken-named-entities: [THROW_ID_03] resolvedOpts.cb must be a function (or falsey)! Currently it's: ${typeof n.cb}, equal to: ${JSON.stringify(n.cb, null, 4)}`);
  }
  if (n.entityCatcherCb && typeof n.entityCatcherCb !== "function") {
    throw new TypeError(`string-fix-broken-named-entities: [THROW_ID_04] resolvedOpts.entityCatcherCb must be a function (or falsey)! Currently it's: ${typeof n.entityCatcherCb}, equal to: ${JSON.stringify(n.entityCatcherCb, null, 4)}`);
  }
  if (n.progressFn && typeof n.progressFn !== "function") {
    throw new TypeError(`string-fix-broken-named-entities: [THROW_ID_05] resolvedOpts.progressFn must be a function (or falsey)! Currently it's: ${typeof n.progressFn}, equal to: ${JSON.stringify(n.progressFn, null, 4)}`);
  }
  if (n.textAmpersandCatcherCb && typeof n.textAmpersandCatcherCb !== "function") {
    // kept on one line so the message matches THROW_ID_03..05
    throw new TypeError(`string-fix-broken-named-entities: [THROW_ID_06] resolvedOpts.textAmpersandCatcherCb must be a function (or falsey)! Currently it's: ${typeof n.textAmpersandCatcherCb}, equal to: ${JSON.stringify(n.textAmpersandCatcherCb, null, 4)}`);
  }

  const m = []; // collected findings
  let V; // current progress percentage
  let R; // last reported progress percentage
  const F = e.length + 1; // the loop deliberately runs past the string end
  let A = 0; // iteration counter used for progress reporting
  let E = null; // index until which iterations are skipped
  let s = null; // start index of the current run of letter/digit/# characters
  let y = null; // index of a "#" followed by "x" that has no "&" before it
  const L = []; // indexes of ampersands seen but not yet reported

  /**
   * Empties the ampersand queue `L`. Each queued index is passed to
   * `textAmpersandCatcherCb` when no range is given, when it lies before
   * `t`, or when it equals `p`; the rest are dropped.
   */
  function g(t, p) {
    if (typeof n.textAmpersandCatcherCb === "function" && L.length) {
      while (L.length) {
        const o = L.shift();
        if (t === undefined || o < t || o === p) {
          n.textAmpersandCatcherCb(o);
        }
      }
    }
  }

  for (let t = 0; t <= F; t++) {
    if (n.progressFn) {
      V = Math.floor((A / F) * 100);
      if (V !== R) {
        R = V;
        n.progressFn(V);
      }
    }

    if (E) {
      if (typeof E === "number" && t >= E) {
        E = null;
      } else {
        A += 1;
        continue;
      }
    }

    // give up on runs which started more than 50 characters ago
    if (s !== null && t - s > 50) {
      s = null;
    }

    // the run ends at the string end or at a non-whitespace character
    // which is not a letter, digit or hash
    if (s !== null && (!e[t] || (e[t].trim().length && !I(e[t])))) {
      if (t > s + 1) {
        const p = e.slice(s, t); // the run itself
        const o = T(e, s); // first non-whitespace index to the left of the run
        const f = o ? T(e, o) : null; // and the one before that

        if (e[o] === "&" && (!e[t] || e[t] !== ";")) {
          // 1. ampersand is present but the semicolon is not
          const b = s;
          const l = s ? N(e, s) : null;
          if (O(x, e[b]) && O(x[e[b]], e[l])) {
            let r = "";
            let u;
            let a = x[e[b]][e[l]].reduce((i, c) => {
              u = k(e, s - 1, ...c.split(""));
              return u ? i.concat([{ tempEnt: c, tempRes: u }]) : i;
            }, []);
            a = j(e, a);
            if (a) {
              ({ tempEnt: r, tempRes: u } = a);
            }
            if (
              r &&
              (!Object.keys(S).includes(r) ||
                !e[u.rightmostChar + 1] ||
                ["&"].includes(e[u.rightmostChar + 1]) ||
                ((S[r].addSemiIfAmpPresent === true ||
                  (S[r].addSemiIfAmpPresent &&
                    !e[u.rightmostChar + 1]?.trim().length)) &&
                  e[u.leftmostChar - 1] === "&"))
            ) {
              const i = C(`&${r};`);
              m.push({
                ruleName: `bad-html-entity-malformed-${r}`,
                entityName: r,
                rangeFrom: o || 0,
                rangeTo: u.rightmostChar + 1,
                rangeValEncoded: `&${r};`,
                rangeValDecoded: i,
              });
              g(o || 0, t);
            }
          }
        } else if (e[o] !== "&" && e[f] !== "&" && e[t] === ";") {
          // 2. semicolon is present but the ampersand is not
          const b = T(e, t);
          const l = T(e, b);
          if (l !== null && O(q, e[b]) && O(q[e[b]], e[l])) {
            let r = "";
            let u;
            let a = q[e[b]][e[l]].reduce((i, c) => {
              u = ee(e, t, ...c.split(""));
              return u && !(c === "block" && e[T(e, s)] === ":")
                ? i.concat([{ tempEnt: c, tempRes: u }])
                : i;
            }, []);
            a = j(e, a);
            if (a) {
              ({ tempEnt: r, tempRes: u } = a);
            }
            if (
              r &&
              (!Object.keys(S).includes(r) ||
                S[r].addAmpIfSemiPresent === true ||
                (S[r].addAmpIfSemiPresent &&
                  (!u.leftmostChar ||
                    (typeof e[u.leftmostChar - 1] === "string" &&
                      !e[u.leftmostChar - 1].trim().length))))
            ) {
              const i = C(`&${r};`);
              m.push({
                ruleName: `bad-html-entity-malformed-${r}`,
                entityName: r,
                rangeFrom: u.leftmostChar,
                rangeTo: t + 1,
                rangeValEncoded: `&${r};`,
                rangeValDecoded: i,
              });
              g(u.leftmostChar, t);
            }
          } else if (y !== null) {
            // "#x..." seen earlier without an ampersand in front of it
            m.push({
              ruleName: "bad-html-entity-malformed-numeric",
              entityName: null,
              rangeFrom: y,
              rangeTo: t + 1,
              rangeValEncoded: null,
              rangeValDecoded: null,
            });
            g(y, t);
            y = null;
          }
        } else if (
          e[t] === ";" &&
          (e[o] === "&" || (e[o] === ";" && e[f] === "&"))
        ) {
          // 3. both the ampersand and the semicolon are present
          if (e.slice(o + 1, t).trim().length > 1) {
            const l = U(e, o + 1, t);

            if (l.probablyNumeric) {
              if (
                l.probablyNumeric &&
                l.charTrimmed[0] === "#" &&
                !l.whitespaceCount &&
                ((!l.lettersCount && l.numbersCount > 0 && !l.othersCount) ||
                  ((l.numbersCount || l.lettersCount) &&
                    l.charTrimmed[1] === "x" &&
                    !l.othersCount))
              ) {
                const r = codePointToString(
                  parseInt(
                    l.charTrimmed.slice(l.probablyNumeric === "deci" ? 1 : 2),
                    l.probablyNumeric === "deci" ? 10 : 16,
                  ),
                );
                if (
                  l.probablyNumeric === "deci" &&
                  parseInt(l.numbersValue, 10) > 918015
                ) {
                  // decimal values above 918015 are reported as malformed
                  m.push({
                    ruleName: "bad-html-entity-malformed-numeric",
                    entityName: null,
                    rangeFrom: o || 0,
                    rangeTo: t + 1,
                    rangeValEncoded: null,
                    rangeValDecoded: null,
                  });
                } else if (n.decode) {
                  m.push({
                    ruleName: "bad-html-entity-encoded-numeric",
                    entityName: l.charTrimmed,
                    rangeFrom: o || 0,
                    rangeTo: t + 1,
                    rangeValEncoded: `&${l.charTrimmed};`,
                    rangeValDecoded: r,
                  });
                }
                g(o || 0, t);
              } else {
                m.push({
                  ruleName: "bad-html-entity-malformed-numeric",
                  entityName: null,
                  rangeFrom: o || 0,
                  rangeTo: t + 1,
                  rangeValEncoded: null,
                  rangeValDecoded: null,
                });
                g(o || 0, t);
              }
              if (n.entityCatcherCb) {
                n.entityCatcherCb(o, t + 1);
              }
            } else {
              // the run with all whitespace removed
              const r = Array.from(p)
                .filter(($) => $.trim().length)
                .join("");

              if (r.length <= M && K.has(r.toLowerCase())) {
                if (typeof r === "string" && !w.has(r)) {
                  // known entity, but in the wrong letter case
                  const $ = [...w].filter(
                    (H) => H.toLowerCase() === r.toLowerCase(),
                  );
                  if ($.length === 1) {
                    m.push({
                      ruleName: `bad-html-entity-malformed-${$[0]}`,
                      entityName: $[0],
                      rangeFrom: o,
                      rangeTo: t + 1,
                      rangeValEncoded: `&${$[0]};`,
                      rangeValDecoded: C(`&${$[0]};`),
                    });
                    g(o, t);
                  } else {
                    m.push({
                      ruleName: "bad-html-entity-unrecognised",
                      entityName: null,
                      rangeFrom: o,
                      rangeTo: t + 1,
                      rangeValEncoded: null,
                      rangeValDecoded: null,
                    });
                    g(o, t);
                  }
                } else if (t - o - 1 !== r.length || e[o] !== "&") {
                  // known entity with stray characters or whitespace inside
                  const $ = e[o] === "&" ? o : f;
                  if (Object.keys(S).includes(r) && !e[$ + 1].trim().length) {
                    s = null;
                    continue;
                  }
                  m.push({
                    ruleName: `bad-html-entity-malformed-${r}`,
                    entityName: r,
                    rangeFrom: $,
                    rangeTo: t + 1,
                    rangeValEncoded: `&${r};`,
                    rangeValDecoded: C(`&${r};`),
                  });
                  g($, t);
                } else if (n.decode) {
                  // healthy entity, reported only when decoding is requested
                  m.push({
                    ruleName: `bad-html-entity-encoded-${r}`,
                    entityName: r,
                    rangeFrom: o,
                    rangeTo: t + 1,
                    rangeValEncoded: `&${r};`,
                    rangeValDecoded: C(`&${r};`),
                  });
                  g(o, t);
                } else if (n.entityCatcherCb || n.textAmpersandCatcherCb) {
                  if (n.entityCatcherCb) {
                    n.entityCatcherCb(o, t + 1);
                  }
                  if (n.textAmpersandCatcherCb) {
                    g(o, t);
                  }
                }
                s = null;
                continue;
              }

              let i = ""; // entity name resolved by the fallbacks below
              let c; // candidate entity names from the Levenshtein search

              if (O(J, l.charTrimmed.toLowerCase())) {
                // listed among the known broken entities
                i = l.charTrimmed;
                const $ = C(`&${J[l.charTrimmed.toLowerCase()]};`);
                m.push({
                  ruleName: `bad-html-entity-malformed-${J[l.charTrimmed.toLowerCase()]}`,
                  entityName: J[l.charTrimmed.toLowerCase()],
                  rangeFrom: o,
                  rangeTo: t + 1,
                  rangeValEncoded: `&${J[l.charTrimmed.toLowerCase()]};`,
                  rangeValDecoded: $,
                });
                g(o, t);
              } else if (
                p.length < M + 2 &&
                (((c = [...w].filter(($) => X($, p) === 1)) && c.length) ||
                  ((c = [...w].filter(
                    ($) => X($, p) === 2 && p.length > 3,
                  )) &&
                    c.length))
              ) {
                // entities at edit distance 1, or 2 for runs longer than 3
                if (c.length === 1) {
                  [i] = c;
                  m.push({
                    ruleName: `bad-html-entity-malformed-${i}`,
                    entityName: i,
                    rangeFrom: o,
                    rangeTo: t + 1,
                    rangeValEncoded: `&${i};`,
                    rangeValDecoded: C(`&${i};`),
                  });
                  g(o, t);
                } else if (c) {
                  // several candidates: count, for each, how many of its
                  // characters can be found in the input string, and accept
                  // the candidate only if it alone has the highest count
                  const $ = c.map((D) => {
                    const v = e.split("");
                    return D.split("").reduce((_, Y) => {
                      if (v.includes(Y)) {
                        v.splice(v.indexOf(Y), 1);
                        return _ + 1;
                      }
                      return _;
                    }, 0);
                  });
                  const H = Math.max(...$);
                  if (H && $.filter((D) => D === H).length === 1) {
                    for (let D = 0, v = $.length; D < v; D++) {
                      if ($[D] === H) {
                        i = c[D];
                        m.push({
                          ruleName: `bad-html-entity-malformed-${i}`,
                          entityName: i,
                          rangeFrom: o,
                          rangeTo: t + 1,
                          rangeValEncoded: `&${i};`,
                          rangeValDecoded: C(`&${i};`),
                        });
                        g(o, t);
                        break;
                      }
                    }
                  }
                }
              }

              if (!i) {
                m.push({
                  ruleName: "bad-html-entity-unrecognised",
                  entityName: null,
                  rangeFrom: o,
                  rangeTo: t + 1,
                  rangeValEncoded: null,
                  rangeValDecoded: null,
                });
                g(o, t);
              }
            }
          }
        } else if (e[f] === "&" && e[t] === ";" && t - f < M) {
          // 4. "&", one foreign character, then the run and a semicolon
          const b = U(e, f + 1, t);
          m.push({
            ruleName: `${
              b.probablyNumeric
                ? "bad-html-entity-malformed-numeric"
                : "bad-html-entity-unrecognised"
            }`,
            entityName: null,
            rangeFrom: f,
            rangeTo: t + 1,
            rangeValEncoded: null,
            rangeValDecoded: null,
          });
          g(f, t);
        }
      }
      s = null;
    }

    // start a new run on a letter/digit/hash which is not the last character
    if (s === null && I(e[t]) && e[t + 1]) {
      s = t;
    }

    // multiple encoding: "amp;" (optionally repeated) followed by an entity
    if (e[t] === "a") {
      const p = k(e, t, "m", "p", ";");
      if (p) {
        let o = p.rightmostChar + 1;
        const f = k(e, p.rightmostChar, "a", "m", "p", ";");
        if (f) {
          o = f.rightmostChar + 1;
          let a;
          do {
            a = k(e, o - 1, "a", "m", "p", ";");
            if (a) {
              o = a.rightmostChar + 1;
            }
          } while (a);
        }

        const b = N(e, o - 1);
        const l = b ? N(e, b) : null;
        let r = "";
        if (
          l &&
          O(x, e[b]) &&
          O(x[e[b]], e[l]) &&
          x[e[b]][e[l]].some((a) => {
            if (k(e, o - 1, ...a.split(""))) {
              r = a;
              return true;
            }
            return false;
          })
        ) {
          E = b + r.length + 1;
          const a = T(e, t) || 0;
          if (e[a] === "&") {
            m.push({
              ruleName: "bad-html-entity-multiple-encoding",
              entityName: r,
              rangeFrom: a,
              rangeTo: E,
              rangeValEncoded: `&${r};`,
              rangeValDecoded: C(`&${r};`),
            });
            g(a, t);
          } else if (a) {
            if (typeof n.cb === "function") {
              m.push({
                ruleName: "bad-html-entity-multiple-encoding",
                entityName: r,
                rangeFrom: t,
                rangeTo: E,
                rangeValEncoded: `&${r};`,
                rangeValDecoded: `${C(`&${r};`)}`,
              });
              g(t, t);
            }
          }
        }
      }
    }

    // remember a "#x<digit>" sequence which has no ampersand in front
    if (
      e[t] === "#" &&
      N(e, t) &&
      e[N(e, t)].toLowerCase() === "x" &&
      (!e[t - 1] || e[T(e, t)] !== "&") &&
      ne(e[N(e, N(e, t))])
    ) {
      y = t;
    }

    if (e[t] === "&") {
      L.push(t);
    }

    // past the end of the string: report whatever ampersands are left
    if (!e[t] && typeof n.textAmpersandCatcherCb === "function" && L.length) {
      g();
    }

    A += 1;
  }

  if (!m.length) {
    return [];
  }

  // drop findings that start at or after another finding's start and end
  // before that finding's end
  const W = m.filter((t, p) =>
    m.every(
      (o, f) =>
        p === f || !(t.rangeFrom >= o.rangeFrom && t.rangeTo < o.rangeTo),
    ),
  );

  return typeof n.cb === "function" ? W.map(n.cb) : W;
}

export { he as allRules, pe as fixEnt, de as version };