UNPKG

detect-file-encoding-and-language

Version:

Charset Detector - Detect the encoding and language of text files - Use it in the browser, with Node.js, or via CLI

1 lines 16 kB
!function(f){"object"==typeof exports&&"undefined"!=typeof module?module.exports=f():"function"==typeof define&&define.amd?define([],f):("undefined"!=typeof window?window:"undefined"!=typeof global?global:"undefined"!=typeof self?self:this).languageEncoding=f()}(function(){return function r(e,n,t){function o(i,f){if(!n[i]){if(!e[i]){var c="function"==typeof require&&require;if(!f&&c)return c(i,!0);if(u)return u(i,!0);throw(f=new Error("Cannot find module '"+i+"'")).code="MODULE_NOT_FOUND",f}c=n[i]={exports:{}},e[i][0].call(c.exports,function(r){return o(e[i][1][r]||r)},c,c.exports,r,e,n,t)}return n[i].exports}for(var u="function"==typeof require&&require,i=0;i<t.length;i++)o(t[i]);return o}({1:[function(require,module,exports){const byteOrderMarks=require("../config/byteOrderMarkObject.js");module.exports=uInt8Start=>{for(const element of byteOrderMarks)if(element.regex.test(uInt8Start))return element.encoding;return null}},{"../config/byteOrderMarkObject.js":6}],2:[function(require,module,exports){module.exports=content=>{for(let b=0;b<content.length;b++)if("�"===content[b])return!1;return!0}},{}],3:[function(require,module,exports){const countAllMatches=require("./processing-content/countAllMatches.js"),calculateConfidenceScore=require("./processing-content/calculateConfidenceScore.js"),byteOrderMarkObject=require("../config/byteOrderMarkObject.js");module.exports=(data,fileInfo)=>{data.languageArr=countAllMatches(data,fileInfo.encoding),fileInfo.language=data.languageArr.reduce((acc,val)=>acc.count>val.count?acc:val).name,data.pos=data.languageArr.findIndex(elem=>elem.name===fileInfo.language),fileInfo.encoding||(fileInfo.encoding=data.languageArr[data.pos].encoding);var calculations=calculateConfidenceScore(data,fileInfo);return fileInfo.confidence.encoding||(fileInfo.confidence.encoding=calculations),fileInfo.confidence.language=calculations,data.languageArr[data.pos].count||(fileInfo.language=null,fileInfo.confidence.language=null,byteOrderMarkObject.some(obj=>obj.encoding===fileInfo.encoding))||(fileInfo.encoding=null,fileInfo.confidence.encoding=null),fileInfo}},{"../config/byteOrderMarkObject.js":6,"./processing-content/calculateConfidenceScore.js":4,"./processing-content/countAllMatches.js":5}],4:[function(require,module,exports){module.exports=(data,fileInfo)=>{var charRegex=new RegExp(/\d|\n|\s|\-|\.|\,|\:|\;|\?|\!|\<|\>|\[|\]|\{|\}|\&|\=|\|/,"g"),charRegex=data.content.replace(charRegex,"").length,langArr=data.languageArr,data=data.pos,secondLanguage=langArr.reduce((acc,val)=>acc.name!==fileInfo.language&&(val.name===fileInfo.language||acc.count>=val.count)?acc:val),secondLanguage=langArr[data].count/(secondLanguage.count+langArr[data].count),charRegex=langArr[data].count/charRegex;let lowerLimit=null,upperLimit=null;upperLimit="UTF-8"===fileInfo.encoding||"UTF-16LE"===fileInfo.encoding?(lowerLimit=langArr[data].utfFrequency?.8*langArr[data].utfFrequency.low:null,langArr[data].utfFrequency?(langArr[data].utfFrequency.low+langArr[data].utfFrequency.high)/2:null):(lowerLimit=langArr[data].isoFrequency?.8*langArr[data].isoFrequency.low:null,langArr[data].isoFrequency?(langArr[data].isoFrequency.low+langArr[data].isoFrequency.high)/2:null);let confidenceScore;return confidenceScore=lowerLimit&&upperLimit?charRegex>=upperLimit?1:charRegex>lowerLimit?(langArr=upperLimit-lowerLimit,data=charRegex-lowerLimit,Number((secondLanguage+(1-secondLanguage)*(data/langArr)).toFixed(2))):Number((secondLanguage*(charRegex/lowerLimit)).toFixed(2)):null}},{}],5:[function(require,module,exports){const languageArr=require("../../config/languageObject.js");module.exports=(data,encoding)=>{const newLanguageArr=[],regex=(languageArr.forEach(obj=>{const updatedLangObj={};Object.keys(obj).forEach(key=>{"count"!==key?updatedLangObj[key]=obj[key]:updatedLangObj.count=0}),newLanguageArr.push(updatedLangObj)}),encoding?"utfRegex":"isoRegex");return newLanguageArr.forEach(lang=>{var matches;lang[regex]&&(matches=data.content.match(lang[regex]))&&(lang.count=matches.length)}),newLanguageArr}},{"../../config/languageObject.js":7}],6:[function(require,module,exports){module.exports=[{encoding:"UTF-EBCDIC",regex:new RegExp("221 115 102 115")},{encoding:"GB-18030",regex:new RegExp("132 49 149 51")},{encoding:"UTF-32LE",regex:new RegExp("255 254 0 0")},{encoding:"UTF-32BE",regex:new RegExp("0 0 254 255")},{encoding:"UTF-8",regex:new RegExp("239 187 191")},{encoding:"UTF-7",regex:new RegExp("43 47 118")},{encoding:"UTF-1",regex:new RegExp("247 100 76")},{encoding:"SCSU",regex:new RegExp("14 254 255")},{encoding:"BOCU-1",regex:new RegExp("251 238 40")},{encoding:"UTF-16BE",regex:new RegExp("254 255")},{encoding:"UTF-16LE",regex:new RegExp("255 254")}]},{}],7:[function(require,module,exports){var sharedRegex={czech:new RegExp(/jsem|jsi/,"gi"),hungarian:new RegExp(/\snem\s/,"gi"),slovak:new RegExp(/poriadku|myslím|\ssme\s/,"gi"),slovenian:new RegExp(/\skaj\s|lahko|zdaj/,"gi"),albanian:new RegExp(/nuk/,"gi"),english:new RegExp(/ the /,"gi"),french:new RegExp(/c'est/,"gi"),portuguese:new RegExp(/ não /,"gi"),spanish:new RegExp(/estaba|\smuy\s|siempre|ahora/,"gi"),german:new RegExp(/\sdas\s/,"gi"),italian:new RegExp(/\sche\s/,"gi"),danish:new RegExp(/hvad|noget/,"gi"),norwegian:new RegExp(/deg/,"gi"),swedish:new RegExp(/ jag /,"gi"),dutch:new RegExp(/ het /,"gi"),finnish:new RegExp(/hän/,"gi"),"serbo-croatian":new RegExp(/ sam | kako /,"gi"),estonian:new RegExp(/\sseda\s|\spole\s|midagi/,"gi"),icelandic:new RegExp(/Það/,"gi"),"malay-indonesian":new RegExp(/tidak/,"gi"),turkish:new RegExp(/ bir /,"gi"),lithuanian:new RegExp(/taip|\stai\s/,"gi"),bengali:new RegExp(/এটা/,"gi"),hindi:new RegExp(/हैं/,"gi"),urdu:new RegExp(/ایک/,"gi"),vietnamese:new RegExp(/ không /,"gi")},sharedFrequency={polish:{low:.004355,high:.005102},czech:{low:.004433,high:.007324},hungarian:{low:.004994,high:.005183},romanian:{low:.003319,high:.00419},slovak:{low:.001736,high:.002557},slovenian:{low:.004111,high:.004959},albanian:{low:.003773,high:.007313},ukrainian:{low:.002933,high:.005389},english:{low:.004679,high:.00758},french:{low:.003016,high:.004825},portuguese:{low:.003406,high:.005032},spanish:{low:.002348,high:.002881},german:{low:.004044,high:.004391},italian:{low:.003889,high:.005175},danish:{low:.00363,high:.004189},norwegian:{low:.00241,high:.003918},swedish:{low:.004916,high:.007221},dutch:{low:.003501,high:.00415},finnish:{low:.003308,high:.005135},"serbo-croatian":{low:.002568,high:.005182},estonian:{low:.002892,high:.003963},icelandic:{low:.004366,high:.004366},"malay-indonesian":{low:.002825,high:.003932},greek:{low:.00344,high:.004862},turkish:{low:.002915,high:.004588},hebrew:{low:.003663,high:.004666},lithuanian:{low:.003277,high:.003768},bengali:{low:.003155,high:.005236},hindi:{low:.004159,high:.006478},urdu:{low:.004118,high:.005851},vietnamese:{low:.003387,high:.005191}};module.exports=[{name:"polish",count:0,utfRegex:new RegExp(/się/,"gi"),isoRegex:new RegExp(/siê/,"gi"),encoding:"CP1250",utfFrequency:sharedFrequency.polish,isoFrequency:sharedFrequency.polish},{name:"czech",count:0,utfRegex:sharedRegex.czech,isoRegex:sharedRegex.czech,encoding:"CP1250",utfFrequency:sharedFrequency.czech,isoFrequency:sharedFrequency.czech},{name:"hungarian",count:0,utfRegex:sharedRegex.hungarian,isoRegex:sharedRegex.hungarian,encoding:"CP1250",utfFrequency:sharedFrequency.hungarian,isoFrequency:sharedFrequency.hungarian},{name:"romanian",count:0,utfRegex:new RegExp(/sunt|eşti/,"gi"),isoRegex:new RegExp(/sunt|eºti/,"gi"),encoding:"CP1250",utfFrequency:sharedFrequency.romanian,isoFrequency:sharedFrequency.romanian},{name:"slovak",count:0,utfRegex:sharedRegex.slovak,isoRegex:sharedRegex.slovak,encoding:"CP1250",utfFrequency:sharedFrequency.slovak,isoFrequency:sharedFrequency.slovak},{name:"slovenian",count:0,utfRegex:sharedRegex.slovenian,isoRegex:sharedRegex.slovenian,encoding:"CP1250",utfFrequency:sharedFrequency.slovenian,isoFrequency:sharedFrequency.slovenian},{name:"albanian",count:0,utfRegex:sharedRegex.albanian,isoRegex:sharedRegex.albanian,encoding:"CP1250",utfFrequency:sharedFrequency.albanian,isoFrequency:sharedFrequency.albanian},{name:"russian",count:0,utfRegex:new RegExp(/что/,"gi"),isoRegex:new RegExp(/÷òî/,"gi"),encoding:"CP1251",utfFrequency:{low:.004965,high:.005341},isoFrequency:{low:.003884,high:.003986}},{name:"ukrainian",count:0,utfRegex:new RegExp(/він|але/,"gi"),isoRegex:new RegExp(/â³í|àëå/,"gi"),encoding:"CP1251",utfFrequency:sharedFrequency.ukrainian,isoFrequency:sharedFrequency.ukrainian},{name:"bulgarian",count:0,utfRegex:new RegExp(/това|какво/,"gi"),isoRegex:new RegExp(/òîâà|äîáðå|êaêâo/,"gi"),encoding:"CP1251",utfFrequency:{low:.005225,high:.005628},isoFrequency:{low:.002767,high:.004951}},{name:"english",count:0,utfRegex:sharedRegex.english,isoRegex:sharedRegex.english,encoding:"CP1252",utfFrequency:sharedFrequency.english,isoFrequency:sharedFrequency.english},{name:"french",count:0,utfRegex:sharedRegex.french,isoRegex:sharedRegex.french,encoding:"CP1252",utfFrequency:sharedFrequency.french,isoFrequency:sharedFrequency.french},{name:"portuguese",count:0,utfRegex:sharedRegex.portuguese,isoRegex:sharedRegex.portuguese,encoding:"CP1252",utfFrequency:sharedFrequency.portuguese,isoFrequency:sharedFrequency.portuguese},{name:"spanish",count:0,utfRegex:sharedRegex.spanish,isoRegex:sharedRegex.spanish,encoding:"CP1252",utfFrequency:sharedFrequency.spanish,isoFrequency:sharedFrequency.spanish},{name:"german",count:0,utfRegex:sharedRegex.german,isoRegex:sharedRegex.german,encoding:"CP1252",utfFrequency:sharedFrequency.german,isoFrequency:sharedFrequency.german},{name:"italian",count:0,utfRegex:sharedRegex.italian,isoRegex:sharedRegex.italian,encoding:"CP1252",utfFrequency:sharedFrequency.italian,isoFrequency:sharedFrequency.italian},{name:"danish",count:0,utfRegex:sharedRegex.danish,isoRegex:sharedRegex.danish,encoding:"CP1252",utfFrequency:sharedFrequency.danish,isoFrequency:sharedFrequency.danish},{name:"norwegian",count:0,utfRegex:sharedRegex.norwegian,isoRegex:sharedRegex.norwegian,encoding:"CP1252",utfFrequency:sharedFrequency.norwegian,isoFrequency:sharedFrequency.norwegian},{name:"swedish",count:0,utfRegex:sharedRegex.swedish,isoRegex:sharedRegex.swedish,encoding:"CP1252",utfFrequency:sharedFrequency.swedish,isoFrequency:sharedFrequency.swedish},{name:"dutch",count:0,utfRegex:sharedRegex.dutch,isoRegex:sharedRegex.dutch,encoding:"CP1252",utfFrequency:sharedFrequency.dutch,isoFrequency:sharedFrequency.dutch},{name:"finnish",count:0,utfRegex:sharedRegex.finnish,isoRegex:sharedRegex.finnish,encoding:"CP1252",utfFrequency:sharedFrequency.finnish,isoFrequency:sharedFrequency.finnish},{name:"serbo-croatian",count:0,utfRegex:sharedRegex["serbo-croatian"],isoRegex:sharedRegex["serbo-croatian"],encoding:"CP1252",utfFrequency:sharedFrequency["serbo-croatian"],isoFrequency:sharedFrequency["serbo-croatian"]},{name:"estonian",count:0,utfRegex:sharedRegex.estonian,isoRegex:sharedRegex.estonian,encoding:"CP1252",utfFrequency:sharedFrequency.estonian,isoFrequency:sharedFrequency.estonian},{name:"icelandic",count:0,utfRegex:sharedRegex.icelandic,isoRegex:sharedRegex.icelandic,encoding:"CP1252",utfFrequency:sharedFrequency.icelandic,isoFrequency:sharedFrequency.icelandic},{name:"malay-indonesian",count:0,utfRegex:sharedRegex["malay-indonesian"],isoRegex:sharedRegex["malay-indonesian"],encoding:"CP1252",utfFrequency:sharedFrequency["malay-indonesian"],isoFrequency:sharedFrequency["malay-indonesian"]},{name:"greek",count:0,utfRegex:new RegExp(/είναι/,"gi"),isoRegex:new RegExp(/åßíáé/,"gi"),encoding:"CP1253",utfFrequency:sharedFrequency.greek,isoFrequency:sharedFrequency.greek},{name:"turkish",count:0,utfRegex:sharedRegex.turkish,isoRegex:sharedRegex.turkish,encoding:"CP1254",utfFrequency:sharedFrequency.turkish,isoFrequency:sharedFrequency.turkish},{name:"hebrew",count:0,utfRegex:new RegExp(/אתה/,"gi"),isoRegex:new RegExp(/àúä/,"gi"),encoding:"CP1255",utfFrequency:sharedFrequency.hebrew,isoFrequency:sharedFrequency.hebrew},{name:"arabic",count:0,utfRegex:new RegExp(/هذا/,"gi"),isoRegex:new RegExp(/åðç/,"gi"),encoding:"CP1256",utfFrequency:{low:.003522,high:.004348},isoFrequency:{low:.003773,high:.005559}},{name:"farsi-persian",count:0,utfRegex:new RegExp(/اون/,"gi"),isoRegex:new RegExp(/çíä/,"gi"),encoding:"CP1256",utfFrequency:{low:.002761,high:.004856},isoFrequency:{low:.00301,high:.006646}},{name:"lithuanian",count:0,utfRegex:sharedRegex.lithuanian,isoRegex:sharedRegex.lithuanian,encoding:"CP1257",utfFrequency:sharedFrequency.lithuanian,isoFrequency:sharedFrequency.lithuanian},{name:"chinese-simplified",count:0,utfRegex:new RegExp(/么/,"gi"),isoRegex:new RegExp(/´ó|¶¯|Å®/,"gi"),encoding:"GB18030",utfFrequency:{low:.009567,high:.011502},isoFrequency:{low:.003137,high:.005009}},{name:"chinese-traditional",count:0,utfRegex:new RegExp(/們/,"gi"),isoRegex:new RegExp(/¦b/,"gi"),encoding:"BIG5",utfFrequency:{low:.012484,high:.014964},isoFrequency:{low:.005063,high:.005822}},{name:"japanese",count:0,utfRegex:new RegExp(/ど/,"gi"),isoRegex:new RegExp(/‚»|‚Á‚Ä/,"gi"),encoding:"Shift-JIS",utfFrequency:{low:.004257,high:.006585},isoFrequency:{low:.004286,high:.004653}},{name:"korean",count:0,utfRegex:new RegExp(/도/,"gi"),isoRegex:new RegExp(/àö¾î|å¾ß|¡¼­/,"gi"),encoding:"EUC-KR",utfFrequency:{low:.01091,high:.01367},isoFrequency:{low:.004118,high:.004961}},{name:"thai",count:0,utfRegex:new RegExp(/แฮร์รี่|พอตเตอร์/,"gi"),isoRegex:new RegExp(/áîãìãõè|¾íµàµíãì­/,"gi"),encoding:"TIS-620",utfFrequency:{low:.003194,high:.003468},isoFrequency:{low:.002091,high:.002303}},{name:"bengali",count:0,utfRegex:sharedRegex.bengali,isoRegex:sharedRegex.bengali,utfFrequency:sharedFrequency.bengali,isoFrequency:sharedFrequency.bengali},{name:"hindi",count:0,utfRegex:sharedRegex.hindi,isoRegex:sharedRegex.hindi,utfFrequency:sharedFrequency.hindi,isoFrequency:sharedFrequency.hindi},{name:"urdu",count:0,utfRegex:sharedRegex.urdu,isoRegex:sharedRegex.urdu,utfFrequency:sharedFrequency.urdu,isoFrequency:sharedFrequency.urdu},{name:"vietnamese",count:0,utfRegex:sharedRegex.vietnamese,isoRegex:sharedRegex.vietnamese,utfFrequency:sharedFrequency.vietnamese,isoFrequency:sharedFrequency.vietnamese}]},{}],8:[function(require,module,exports){const checkUTF=require("./components/checkUTF.js"),processContent=require("./components/processContent.js"),checkByteOrderMark=require("./components/checkByteOrderMark.js");module.exports=file=>new Promise((resolve,reject)=>{const fileInfo={encoding:null,language:null,confidence:{encoding:null,language:null}},data={},byteOrderMarkBuffer=new FileReader;byteOrderMarkBuffer.onload=()=>{var uInt8String=new Uint8Array(byteOrderMarkBuffer.result).slice(0,4).join(" "),uInt8String=checkByteOrderMark(uInt8String);if(uInt8String){fileInfo.encoding=uInt8String,fileInfo.confidence.encoding=1;const byteOrderMarkReader=new FileReader;byteOrderMarkReader.onload=()=>{data.content=byteOrderMarkReader.result,resolve(processContent(data,fileInfo))},byteOrderMarkReader.onerror=err=>{reject(err)},byteOrderMarkReader.readAsText(file,fileInfo.encoding)}else{const utfReader=new FileReader;utfReader.onload=()=>{var utfContent=utfReader.result,utf8=checkUTF(utfContent);if(utf8&&(fileInfo.encoding="UTF-8",fileInfo.confidence.encoding=1),utf8)data.content=utfContent,resolve(processContent(data,fileInfo));else{const isoReader=new FileReader;isoReader.onload=()=>{data.content=isoReader.result,resolve(processContent(data,fileInfo))},isoReader.readAsText(file,"ISO-8859-1")}},utfReader.onerror=err=>{reject(err)},utfReader.readAsText(file,"UTF-8")}},byteOrderMarkBuffer.onerror=err=>{reject(err)},byteOrderMarkBuffer.readAsArrayBuffer(file)})},{"./components/checkByteOrderMark.js":1,"./components/checkUTF.js":2,"./components/processContent.js":3}]},{},[8])(8)});