UNPKG

@inst/vscode-bin-darwin

Version:

BINARY ONLY - VSCode binary deployment for macOS

323 lines (295 loc) 14.6 kB
/*
 * The Original Code is Mozilla Universal charset detector code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 2001
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   António Afonso (antonio.afonso gmail.com) - port to JavaScript
 *   Mark Pilgrim - port to Python
 *   Shy Shalom - original C code
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301  USA
 */

!function(jschardet) {

// This prober doesn't actually recognize a language or a charset.
// It is a helper prober for the use of the Hebrew model probers.

////// General ideas of the Hebrew charset recognition //////
//
// Four main charsets exist in Hebrew:
// "ISO-8859-8" - Visual Hebrew
// "windows-1255" - Logical Hebrew
// "ISO-8859-8-I" - Logical Hebrew
// "x-mac-hebrew" - ?? Logical Hebrew ??
//
// Both "ISO" charsets use a completely identical set of code points, whereas
// "windows-1255" and "x-mac-hebrew" are two different proper supersets of
// these code points. windows-1255 defines additional characters in the range
// 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
// diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
// x-mac-hebrew defines similar additional code points but with a different
// mapping.
//
// As far as an average Hebrew text with no diacritics is concerned, all four
// charsets are identical with respect to code points. Meaning that for the
// main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
// (including final letters).
//
// The dominant difference between these charsets is their directionality.
// "Visual" directionality means that the text is ordered as if the renderer is
// not aware of a BIDI rendering algorithm. The renderer sees the text and
// draws it from left to right. The text itself when ordered naturally is read
// backwards. A buffer of Visual Hebrew generally looks like so:
// "[last word of first line spelled backwards] [whole line ordered backwards
// and spelled backwards] [first word of first line spelled backwards]
// [end of line] [last word of second line] ... etc' "
// adding punctuation marks, numbers and English text to visual text is
// naturally also "visual" and from left to right.
//
// "Logical" directionality means the text is ordered "naturally" according to
// the order it is read. It is the responsibility of the renderer to display
// the text from right to left. A BIDI algorithm is used to place general
// punctuation marks, numbers and English text in the text.
//
// Texts in x-mac-hebrew are almost impossible to find on the Internet. From
// what little evidence I could find, it seems that its general directionality
// is Logical.
//
// To sum up all of the above, the Hebrew probing mechanism knows about two
// charsets:
// Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
//    backwards while line order is natural. For charset recognition purposes
//    the line order is unimportant (In fact, for this implementation, even
//    word order is unimportant).
// Logical Hebrew - "windows-1255" - normal, naturally ordered text.
//
// "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
//    specifically identified.
// "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
//    that contains special punctuation marks or diacritics is displayed with
//    some unconverted characters showing as question marks. This problem might
//    be corrected using another model prober for x-mac-hebrew. Due to the fact
//    that x-mac-hebrew texts are so rare, writing another model prober isn't
//    worth the effort and performance hit.
//
//////// The Prober ////////
//
// The prober is divided between two SBCharSetProbers and a HebrewProber,
// all of which are managed, created, fed data, inquired and deleted by the
// SBCSGroupProber. The two SBCharSetProbers identify that the text is in
// fact some kind of Hebrew, Logical or Visual. The final decision about which
// one it is is made by the HebrewProber by combining final-letter scores
// with the scores of the two SBCharSetProbers to produce a final answer.
//
// The SBCSGroupProber is responsible for stripping the original text of HTML
// tags, English characters, numbers, low-ASCII punctuation characters, spaces
// and new lines. It reduces any sequence of such characters to a single space.
// The buffer fed to each prober in the SBCS group prober is pure text in
// high-ASCII.
// The two SBCharSetProbers (model probers) share the same language model:
// Win1255Model.
// The first SBCharSetProber uses the model normally as any other
// SBCharSetProber does, to recognize windows-1255, upon which this model was
// built. The second SBCharSetProber is told to make the pair-of-letter
// lookup in the language model backwards. This in practice exactly simulates
// a visual Hebrew model using the windows-1255 logical Hebrew model.
//
// The HebrewProber is not using any language model. All it does is look for
// final-letter evidence suggesting the text is either logical Hebrew or visual
// Hebrew. Disjointed from the model probers, the results of the HebrewProber
// alone are meaningless. HebrewProber always returns 0.00 as confidence
// since it never identifies a charset by itself. Instead, the pointer to the
// HebrewProber is passed to the model probers as a helper "Name Prober".
// When the Group prober receives a positive identification from any prober,
// it asks for the name of the charset identified. If the prober queried is a
// Hebrew model prober, the model prober forwards the call to the
// HebrewProber to make the final decision. In the HebrewProber, the
// decision is made according to the final-letter scores maintained and both
// model probers' scores. The answer is returned in the form of the name of the
// charset identified, either "windows-1255" or "ISO-8859-8".

/**
 * Helper prober that decides between Logical ("windows-1255") and Visual
 * ("ISO-8859-8") Hebrew by scoring where final-form letters appear in words.
 *
 * setModelProbers() must be called before feed(), getState() or
 * getCharsetName() (the latter only when the final-letter score is not
 * decisive), because those methods dereference the two model probers, which
 * start out as null.
 *
 * @constructor
 */
jschardet.HebrewProber = function() {
    jschardet.CharSetProber.apply(this);

    // windows-1255 / ISO-8859-8 code points of interest
    var FINAL_KAF = '\xea';
    var NORMAL_KAF = '\xeb';
    var FINAL_MEM = '\xed';
    var NORMAL_MEM = '\xee';
    var FINAL_NUN = '\xef';
    var NORMAL_NUN = '\xf0';
    var FINAL_PE = '\xf3';
    var NORMAL_PE = '\xf4';
    var FINAL_TSADI = '\xf5';
    // Kept for documentation of the code-point table; deliberately absent from
    // NON_FINAL_LETTERS below.
    var NORMAL_TSADI = '\xf6';

    // Lookup tables, built once per prober instead of on every isFinal() /
    // isNonFinal() call (those run inside the per-character loop of feed()).
    var FINAL_LETTERS = [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI];

    // The normal Tsadi is not a good Non-Final letter due to words like
    // 'lechotet' (to chat) containing an apostrophe after the tsadi. This
    // apostrophe is converted to a space in FilterWithoutEnglishLetters causing
    // the Non-Final tsadi to appear at an end of a word even though this is not
    // the case in the original text.
    // The letters Pe and Kaf rarely display a related behavior of not being a
    // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
    // example legally end with a Non-Final Pe or Kaf. However, the benefit of
    // these letters as Non-Final letters outweighs the damage since these words
    // are quite rare.
    var NON_FINAL_LETTERS = [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE];

    // Minimum Visual vs Logical final letter score difference.
    // If the difference is below this, don't rely solely on the final letter score distance.
    var MIN_FINAL_CHAR_DISTANCE = 5;

    // Minimum Visual vs Logical model score difference.
    // If the difference is below this, don't rely at all on the model score distance.
    var MIN_MODEL_DISTANCE = 0.01;

    var VISUAL_HEBREW_NAME = "ISO-8859-8";
    var LOGICAL_HEBREW_NAME = "windows-1255";

    var self = this;

    // Clears the model-prober references and puts the scores in their initial state.
    function init() {
        self._mLogicalProber = null;
        self._mVisualProber = null;
        self.reset();
    }

    /**
     * Resets both final-letter scores and the two-character look-behind window.
     * The model prober references are left untouched.
     */
    this.reset = function() {
        this._mFinalCharLogicalScore = 0;
        this._mFinalCharVisualScore = 0;
        // The two last characters seen in the previous buffer,
        // mPrev and mBeforePrev are initialized to space in order to simulate a word
        // delimiter at the beginning of the data
        this._mPrev = " ";
        this._mBeforePrev = " ";
        // These probers are owned by the group prober.
    };

    /**
     * Registers the two model probers whose state and confidence this helper consults.
     *
     * @param {Object} logicalProber - prober for logical Hebrew; must expose
     *     getState() and getConfidence().
     * @param {Object} visualProber - prober for visual Hebrew; same interface.
     */
    this.setModelProbers = function(logicalProber, visualProber) {
        this._mLogicalProber = logicalProber;
        this._mVisualProber = visualProber;
    };

    /**
     * @param {string} c - a single character.
     * @returns {boolean} true when c is the final form of Kaf, Mem, Nun, Pe or Tsadi.
     */
    this.isFinal = function(c) {
        return FINAL_LETTERS.indexOf(c) !== -1;
    };

    /**
     * @param {string} c - a single character.
     * @returns {boolean} true when c is the normal (non-final) form of Kaf, Mem,
     *     Nun or Pe. Normal Tsadi is intentionally not counted; see the note on
     *     NON_FINAL_LETTERS.
     */
    this.isNonFinal = function(c) {
        return NON_FINAL_LETTERS.indexOf(c) !== -1;
    };

    /**
     * Scans a buffer and accumulates final-letter evidence for logical vs. visual Hebrew.
     *
     * @param {string} aBuf - the next chunk of text to analyse.
     * @returns {*} jschardet.Constants.notMe when both model probers have given
     *     up, otherwise jschardet.Constants.detecting.
     */
    this.feed = function(aBuf) {
        // Final letter analysis for logical-visual decision.
        // Look for evidence that the received buffer is either logical Hebrew or
        // visual Hebrew.
        // The following cases are checked:
        // 1) A word longer than 1 letter, ending with a final letter. This is an
        //    indication that the text is laid out "naturally" since the final letter
        //    really appears at the end. +1 for logical score.
        // 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
        //    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
        //    the Non-Final form of that letter. Exceptions to this rule are mentioned
        //    above in isNonFinal(). This is an indication that the text is laid out
        //    backwards. +1 for visual score
        // 3) A word longer than 1 letter, starting with a final letter. Final letters
        //    should not appear at the beginning of a word. This is an indication that
        //    the text is laid out backwards. +1 for visual score.
        //
        // The visual score and logical score are accumulated throughout the text and
        // are finally checked against each other in GetCharSetName().
        // No checking for final letters in the middle of words is done since that case
        // is not an indication for either Logical or Visual text.
        //
        // We automatically filter out all 7-bit characters (replace them with spaces)
        // so the word boundary detection works properly. [MAP]

        if( this.getState() === jschardet.Constants.notMe ) {
            // Both model probers say it's not them. No reason to continue.
            return jschardet.Constants.notMe;
        }

        // filterHighBitOnly is not defined in this file; presumably inherited
        // from jschardet.CharSetProber.
        aBuf = this.filterHighBitOnly(aBuf);

        for( var i = 0, cur; i < aBuf.length; i++ ) {
            cur = aBuf[i];
            if( cur === " " ) {
                // We stand on a space - a word just ended
                if( this._mBeforePrev !== " " ) {
                    // next-to-last char was not a space so self._mPrev is not a 1 letter word
                    if( this.isFinal(this._mPrev) ) {
                        // case (1) [-2:not space][-1:final letter][cur:space]
                        this._mFinalCharLogicalScore++;
                    } else if( this.isNonFinal(this._mPrev) ) {
                        // case (2) [-2:not space][-1:Non-Final letter][cur:space]
                        this._mFinalCharVisualScore++;
                    }
                }
            } else if( this._mBeforePrev === " " && this.isFinal(this._mPrev) ) {
                // Not standing on a space, so cur is known to be a non-space here.
                // case (3) [-2:space][-1:final letter][cur:not space]
                this._mFinalCharVisualScore++;
            }
            // Slide the two-character look-behind window.
            this._mBeforePrev = this._mPrev;
            this._mPrev = cur;
        }

        // Forever detecting, till the end or until both model probers return eNotMe (handled above)
        return jschardet.Constants.detecting;
    };

    /**
     * Makes the decision: is the text Logical or Visual Hebrew?
     *
     * @returns {string} "windows-1255" for logical Hebrew or "ISO-8859-8" for
     *     visual Hebrew. Defaults to "windows-1255" when the evidence is tied.
     */
    this.getCharsetName = function() {
        // If the final letter score distance is dominant enough, rely on it.
        var finalsub = this._mFinalCharLogicalScore - this._mFinalCharVisualScore;
        if( finalsub >= MIN_FINAL_CHAR_DISTANCE ) {
            return LOGICAL_HEBREW_NAME;
        }
        if( finalsub <= -MIN_FINAL_CHAR_DISTANCE ) {
            return VISUAL_HEBREW_NAME;
        }

        // It's not dominant enough, try to rely on the model scores instead.
        var modelsub = this._mLogicalProber.getConfidence() - this._mVisualProber.getConfidence();
        if( modelsub > MIN_MODEL_DISTANCE ) {
            return LOGICAL_HEBREW_NAME;
        }
        if( modelsub < -MIN_MODEL_DISTANCE ) {
            return VISUAL_HEBREW_NAME;
        }

        // Still no good, back to final letter distance, maybe it'll save the day.
        if( finalsub < 0 ) {
            return VISUAL_HEBREW_NAME;
        }

        // (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
        return LOGICAL_HEBREW_NAME;
    };

    /**
     * Remains active as long as any of the model probers is active.
     *
     * @returns {*} jschardet.Constants.notMe only when both model probers report
     *     notMe; jschardet.Constants.detecting otherwise.
     */
    this.getState = function() {
        if( this._mLogicalProber.getState() === jschardet.Constants.notMe &&
            this._mVisualProber.getState() === jschardet.Constants.notMe ) {
            return jschardet.Constants.notMe;
        }
        return jschardet.Constants.detecting;
    };

    init();
};
jschardet.HebrewProber.prototype = new jschardet.CharSetProber();

// Fallback for engines that lack Array.prototype.indexOf; installed only when
// the native method is missing.
// https://developer.mozilla.org/En/Core_JavaScript_1.5_Reference/Objects/Array/IndexOf
if (!Array.prototype.indexOf) {
    Array.prototype.indexOf = function(elt /*, from*/) {
        var len = this.length >>> 0;

        // Optional start index: coerce to an integer, truncating toward zero.
        var from = Number(arguments[1]) || 0;
        from = (from < 0) ? Math.ceil(from) : Math.floor(from);
        if (from < 0) {
            // A negative start index counts back from the end of the array.
            from += len;
        }

        for (; from < len; from++) {
            // "from in this" skips holes in sparse arrays.
            if (from in this && this[from] === elt) {
                return from;
            }
        }
        return -1;
    };
}

}(require('./init'));