@inst/vscode-bin-darwin
Version:
BINARY ONLY - VSCode binary deployment for macOS
323 lines (295 loc) • 14.6 kB
JavaScript
/*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* António Afonso (antonio.afonso gmail.com) - port to JavaScript
* Mark Pilgrim - port to Python
* Shy Shalom - original C code
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA
*/
!function(jschardet) {
// This prober doesn't actually recognize a language or a charset.
// It is a helper prober for the use of the Hebrew model probers.
////// General ideas of the Hebrew charset recognition //////
//
// Four main charsets exist in Hebrew:
// "ISO-8859-8" - Visual Hebrew
// "windows-1255" - Logical Hebrew
// "ISO-8859-8-I" - Logical Hebrew
// "x-mac-hebrew" - ?? Logical Hebrew ??
//
// Both "ISO" charsets use a completely identical set of code points, whereas
// "windows-1255" and "x-mac-hebrew" are two different proper supersets of
// these code points. windows-1255 defines additional characters in the range
// 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
// diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
// x-mac-hebrew defines similar additional code points but with a different
// mapping.
//
// As far as an average Hebrew text with no diacritics is concerned, all four
// charsets are identical with respect to code points. Meaning that for the
// main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
// (including final letters).
//
// The dominant difference between these charsets is their directionality.
// "Visual" directionality means that the text is ordered as if the renderer is
// not aware of a BIDI rendering algorithm. The renderer sees the text and
// draws it from left to right. The text itself when ordered naturally is read
// backwards. A buffer of Visual Hebrew generally looks like so:
// "[last word of first line spelled backwards] [whole line ordered backwards
// and spelled backwards] [first word of first line spelled backwards]
// [end of line] [last word of second line] ... etc' "
// adding punctuation marks, numbers and English text to visual text is
// naturally also "visual" and from left to right.
//
// "Logical" directionality means the text is ordered "naturally" according to
// the order it is read. It is the responsibility of the renderer to display
// the text from right to left. A BIDI algorithm is used to place general
// punctuation marks, numbers and English text in the text.
//
// Texts in x-mac-hebrew are almost impossible to find on the Internet. From
// what little evidence I could find, it seems that its general directionality
// is Logical.
//
// To sum up all of the above, the Hebrew probing mechanism knows about two
// charsets:
// Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
// backwards while line order is natural. For charset recognition purposes
// the line order is unimportant (In fact, for this implementation, even
// word order is unimportant).
// Logical Hebrew - "windows-1255" - normal, naturally ordered text.
//
// "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
// specifically identified.
// "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
// that contains special punctuation marks or diacritics is displayed with
// some unconverted characters showing as question marks. This problem might
// be corrected using another model prober for x-mac-hebrew. Due to the fact
// that x-mac-hebrew texts are so rare, writing another model prober isn't
// worth the effort and performance hit.
//
//////// The Prober ////////
//
// The prober is divided between two SBCharSetProbers and a HebrewProber,
// all of which are managed, created, fed data, inquired and deleted by the
// SBCSGroupProber. The two SBCharSetProbers identify that the text is in
// fact some kind of Hebrew, Logical or Visual. The final decision between the
// two is made by the HebrewProber by combining final-letter scores
// with the scores of the two SBCharSetProbers to produce a final answer.
//
// The SBCSGroupProber is responsible for stripping the original text of HTML
// tags, English characters, numbers, low-ASCII punctuation characters, spaces
// and new lines. It reduces any sequence of such characters to a single space.
// The buffer fed to each prober in the SBCS group prober is pure text in
// high-ASCII.
// The two SBCharSetProbers (model probers) share the same language model:
// Win1255Model.
// The first SBCharSetProber uses the model normally as any other
// SBCharSetProber does, to recognize windows-1255, upon which this model was
// built. The second SBCharSetProber is told to make the pair-of-letter
// lookup in the language model backwards. This in practice exactly simulates
// a visual Hebrew model using the windows-1255 logical Hebrew model.
//
// The HebrewProber is not using any language model. All it does is look for
// final-letter evidence suggesting the text is either logical Hebrew or visual
// Hebrew. Disjointed from the model probers, the results of the HebrewProber
// alone are meaningless. HebrewProber always returns 0.00 as confidence
// since it never identifies a charset by itself. Instead, the pointer to the
// HebrewProber is passed to the model probers as a helper "Name Prober".
// When the Group prober receives a positive identification from any prober,
// it asks for the name of the charset identified. If the prober queried is a
// Hebrew model prober, the model prober forwards the call to the
// HebrewProber to make the final decision. In the HebrewProber, the
// decision is made according to the final-letter scores it maintains and both
// model probers' scores. The answer is returned in the form of the name of the
// charset identified, either "windows-1255" or "ISO-8859-8".
/**
 * Helper prober that chooses between Logical Hebrew ("windows-1255") and
 * Visual Hebrew ("ISO-8859-8").
 *
 * It keeps two counters fed by final-letter evidence found in the text and,
 * when asked for a charset name, combines them with the confidences of the
 * two model probers registered through setModelProbers().
 *
 * @constructor
 */
jschardet.HebrewProber = function() {
    jschardet.CharSetProber.apply(this);

    // Code points of the letters that have a distinct word-final form
    // (identical in windows-1255 and ISO-8859-8).
    var FINAL_KAF = '\xea';
    var NORMAL_KAF = '\xeb';
    var FINAL_MEM = '\xed';
    var NORMAL_MEM = '\xee';
    var FINAL_NUN = '\xef';
    var NORMAL_NUN = '\xf0';
    var FINAL_PE = '\xf3';
    var NORMAL_PE = '\xf4';
    var FINAL_TSADI = '\xf5';
    // Declared for completeness; deliberately not treated as a Non-Final
    // letter (see isNonFinal).
    var NORMAL_TSADI = '\xf6';

    // Smallest lead (logical minus visual final-letter score, either sign)
    // that is trusted on its own, without consulting the model probers.
    var MIN_FINAL_CHAR_DISTANCE = 5;

    // Smallest lead in model confidence that is trusted at all; anything
    // closer than this is treated as a tie between the two models.
    var MIN_MODEL_DISTANCE = 0.01;

    var VISUAL_HEBREW_NAME = "ISO-8859-8";
    var LOGICAL_HEBREW_NAME = "windows-1255";

    // Word delimiter as seen by feed().
    var SPACE = " ";

    /**
     * Clears the accumulated final-letter scores and the two-character
     * look-behind. The model probers are left untouched: they are owned by
     * the group prober.
     */
    this.reset = function() {
        this._mFinalCharLogicalScore = 0;
        this._mFinalCharVisualScore = 0;
        // Pretend the data is preceded by spaces so that the very first
        // character is treated as the start of a word.
        this._mPrev = SPACE;
        this._mBeforePrev = SPACE;
    };

    /**
     * Registers the two model probers consulted by getCharsetName() and
     * getState().
     *
     * @param logicalProber model prober for logical (windows-1255) Hebrew
     * @param visualProber model prober for visual (ISO-8859-8) Hebrew
     */
    this.setModelProbers = function(logicalProber, visualProber) {
        this._mLogicalProber = logicalProber;
        this._mVisualProber = visualProber;
    };

    /**
     * @param c a single character
     * @returns {boolean} true when c is the final form of Kaf, Mem, Nun, Pe
     *     or Tsadi
     */
    this.isFinal = function(c) {
        return c === FINAL_KAF ||
            c === FINAL_MEM ||
            c === FINAL_NUN ||
            c === FINAL_PE ||
            c === FINAL_TSADI;
    };

    /**
     * @param c a single character
     * @returns {boolean} true when c is the regular (non-final) form of Kaf,
     *     Mem, Nun or Pe
     */
    this.isNonFinal = function(c) {
        // The regular Tsadi is left out on purpose: words such as 'lechotet'
        // (to chat) carry an apostrophe right after the tsadi, and once the
        // input filtering has turned that apostrophe into a space the tsadi
        // looks like it ends a word although it does not in the original
        // text.
        // Pe and Kaf occasionally misbehave in a similar way ('Pop', 'Winamp'
        // and 'Mubarak' legitimately end with the regular form), but such
        // words are rare enough that keeping the two letters helps more than
        // it hurts.
        return c === NORMAL_KAF ||
            c === NORMAL_MEM ||
            c === NORMAL_NUN ||
            c === NORMAL_PE;
    };

    /**
     * Scans a buffer for final-letter evidence and updates the logical and
     * visual scores. The look-behind state survives between calls, so words
     * may span buffer boundaries.
     *
     * Evidence counted (only for words longer than one letter):
     *   1) word ends with a final letter              -> +1 logical
     *   2) word ends with a Non-Final letter          -> +1 visual
     *   3) word starts with a final letter            -> +1 visual
     * Final letters in the middle of a word say nothing either way and are
     * ignored. The two scores are compared later, in getCharsetName().
     *
     * @param aBuf the data to analyse
     * @returns jschardet.Constants.notMe when both model probers have given
     *     up, jschardet.Constants.detecting otherwise
     */
    this.feed = function(aBuf) {
        var Constants = jschardet.Constants;

        if( this.getState() == Constants.notMe ) {
            // Neither model prober believes this is Hebrew; nothing to do.
            return Constants.notMe;
        }

        // Run the inherited filter first so that word boundaries show up as
        // spaces in the text scanned below.
        var text = this.filterHighBitOnly(aBuf);

        for( var pos = 0; pos < text.length; pos++ ) {
            var ch = text[pos];
            var prevChar = this._mPrev;
            // True when prevChar is the first character of its word.
            var prevStartsWord = this._mBeforePrev === SPACE;

            if( ch === SPACE ) {
                // A word just ended on prevChar; skip one-letter words.
                if( !prevStartsWord ) {
                    if( this.isFinal(prevChar) ) {
                        // case (1) [not space][final letter][space]
                        this._mFinalCharLogicalScore++;
                    } else if( this.isNonFinal(prevChar) ) {
                        // case (2) [not space][Non-Final letter][space]
                        this._mFinalCharVisualScore++;
                    }
                }
            } else if( prevStartsWord && this.isFinal(prevChar) ) {
                // case (3) [space][final letter][not space]
                this._mFinalCharVisualScore++;
            }

            this._mBeforePrev = prevChar;
            this._mPrev = ch;
        }

        // Keep detecting until the data ends or both model probers give up
        // (checked at the top of the next call).
        return Constants.detecting;
    };

    /**
     * Decides whether the text seen so far is Logical or Visual Hebrew.
     *
     * @returns {string} "windows-1255" (logical) or "ISO-8859-8" (visual)
     */
    this.getCharsetName = function() {
        var finalLetterLead =
            this._mFinalCharLogicalScore - this._mFinalCharVisualScore;

        // A clear final-letter lead settles the matter without asking the
        // model probers.
        if( Math.abs(finalLetterLead) >= MIN_FINAL_CHAR_DISTANCE ) {
            return finalLetterLead > 0 ? LOGICAL_HEBREW_NAME : VISUAL_HEBREW_NAME;
        }

        // Otherwise see whether one language model is clearly ahead.
        var modelLead =
            this._mLogicalProber.getConfidence() - this._mVisualProber.getConfidence();
        if( modelLead > MIN_MODEL_DISTANCE ) {
            return LOGICAL_HEBREW_NAME;
        }
        if( modelLead < -MIN_MODEL_DISTANCE ) {
            return VISUAL_HEBREW_NAME;
        }

        // Models are tied: fall back to whatever small final-letter lead
        // exists, defaulting to Logical when there is none.
        return finalLetterLead < 0 ? VISUAL_HEBREW_NAME : LOGICAL_HEBREW_NAME;
    };

    /**
     * @returns jschardet.Constants.notMe once both model probers report
     *     notMe, jschardet.Constants.detecting as long as at least one of
     *     them is still active
     */
    this.getState = function() {
        var Constants = jschardet.Constants;
        var bothGaveUp =
            this._mLogicalProber.getState() == Constants.notMe &&
            this._mVisualProber.getState() == Constants.notMe;
        return bothGaveUp ? Constants.notMe : Constants.detecting;
    };

    // Initial state: no model probers attached yet, scores cleared.
    this._mLogicalProber = null;
    this._mVisualProber = null;
    this.reset();
};
// Inherit from CharSetProber: anything a HebrewProber instance does not define
// itself is looked up on this CharSetProber instance.
jschardet.HebrewProber.prototype = new jschardet.CharSetProber();
// Fallback for engines without a native Array.prototype.indexOf, adapted from
// https://developer.mozilla.org/En/Core_JavaScript_1.5_Reference/Objects/Array/IndexOf
/**
 * Returns the first index at which elt is found (compared with ===) in the
 * array-like `this`, or -1 when it is absent. Holes in sparse arrays are
 * skipped.
 *
 * An optional second argument gives the index to start from. A negative value
 * counts back from the end; if it reaches before the start of the array the
 * whole array is searched.
 *
 * @param elt the value to look for
 * @returns {number} the index of the first match, or -1
 */
function arrayIndexOfFallback(elt /*, from*/) {
    var len = this.length >>> 0;

    // Missing or non-numeric start positions fall back to 0.
    var from = Number(arguments[1]) || 0;
    // Truncate towards zero.
    from = (from < 0)
        ? Math.ceil(from)
        : Math.floor(from);

    if (from < 0) {
        // Clamp at 0: without this a start before the beginning of the array
        // makes the loop walk over non-existent negative indices, and never
        // terminate for -Infinity.
        from = Math.max(from + len, 0);
    }

    for (; from < len; from++) {
        if (from in this && this[from] === elt) {
            return from;
        }
    }
    return -1;
}

if (!Array.prototype.indexOf) {
    Array.prototype.indexOf = arrayIndexOfFallback;
}
}(require('./init'));