UNPKG

@makakwastaken/ts-edifact

Version:
1,327 lines (1,326 loc) 25.3 kB
/**
 * @author Roman Vottner
 * @copyright 2020 Roman Vottner
 * @license Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import { Separators } from './edi/separators';

/**
 * A named set of four global regular expressions (`alpha`, `alphanumeric`,
 * `numeric`, `decimal`) built from a table of admissible characters.
 *
 * The table is an array of entries, each being either a single character
 * (`['x']`) or an inclusive range (`['a', 'z']`).
 */
export class Charset {
    name;
    alpha;
    alphanumeric;
    numeric;
    decimal;

    /**
     * @param {string} name - Identifier of the character set, e.g. `'UNOA'`.
     * @param {object} configuration - Must provide `delimiters()`; its result is
     *   handed to `compile` as the list of code points to exclude.
     * @param {string[][]} admissibleAlphabet - Single characters and inclusive ranges.
     * @param {boolean} [unicode=false] - Compile the expressions with the `u` flag
     *   and match digits via `\p{Nd}` instead of `0-9`.
     */
    constructor(name, configuration, admissibleAlphabet, unicode = false) {
        this.name = name;

        const exclude = configuration.delimiters();
        const [alpha, alphanumeric] = this.compile(admissibleAlphabet, exclude, unicode);
        this.alpha = alpha;
        this.alphanumeric = alphanumeric;

        // Parsing decimals is a multiple step process. First, the numeric part will be parsed, then the
        // decimal separator added and last the decimal part of the value added to the end of that value.
        // So there is no need to catch the decimal value with a single regular expression.
        if (unicode) {
            this.numeric = /[-]?[\p{Nd}]*/gu;
            this.decimal = /[\p{Nd}]*/gu;
        } else {
            this.numeric = /[-]?[0-9]*/g;
            this.decimal = /[0-9]*/g;
        }
    }

    /**
     * Builds the alpha and alphanumeric expressions for the given alphabet.
     *
     * Every code point of a range entry that is not contained in `excludes` is
     * added to the character class. Single code point entries are added as-is.
     *
     * NOTE(review): single code point entries are not checked against
     * `excludes`, only ranges are. This looks unintentional, but the contents
     * of `delimiters()` are not visible from this file - confirm before changing.
     *
     * @param {string[][]} admissibleAlphabet - Single characters and inclusive ranges.
     * @param {number[]} excludes - Code points that must not be part of a range
     *   (used via `excludes.includes(codePoint)`).
     * @param {boolean} [unicode=false] - Adds the `u` flag to both expressions.
     * @returns {RegExp[]} `[alpha, alphanumeric]`; the latter additionally accepts `0-9`.
     */
    compile(admissibleAlphabet, excludes, unicode = false) {
        // String.fromCharCode(parseInt("\u002F".codePointAt(0).toString(16), 16)) --> '/'
        const flag = unicode ? 'gu' : 'g';
        let output = '';
        // Characters are escaped by the project's Separators helper before they
        // are placed inside the character class.
        const append = (codePoint) => {
            output += Separators.escapeIfNeeded(String.fromCodePoint(codePoint));
        };

        for (const seq of admissibleAlphabet) {
            if (seq.length > 1) {
                // '/' --> 47 as the 47th character in the codepage
                const start = seq[0].codePointAt(0);
                const end = seq[1].codePointAt(0);
                if (start && end) {
                    for (let codePoint = start; codePoint <= end; codePoint++) {
                        if (!excludes.includes(codePoint)) {
                            append(codePoint);
                        }
                    }
                }
            } else {
                const codePoint = seq[0].codePointAt(0);
                if (codePoint) {
                    append(codePoint);
                }
            }
        }

        return [
            new RegExp(`[${output}]*`, flag),
            new RegExp(`[0-9${output}]*`, flag),
        ];
    }
}

/*
enum Modes {
    alphanumeric = 0,
    alpha = 1,
    numeric = 2,
    decimal = 3
}
*/

/** Character table registered under the name 'UNOA'. */
class UNOA extends Charset {
    static charset = [
        ['\u0020'],
        ['\u0028', '\u0029'],
        ['\u002C', '\u002F'],
        ['\u003D'],
        ['\u0041', '\u005A'],
    ];

    constructor(config) {
        super('UNOA', config, UNOA.charset);
    }
}

/** Character table registered under the name 'UNOB'. */
class UNOB extends Charset {
    static charset = [
        ['\u0020', '\u0022'],
        ['\u0041', '\u005A'],
        ['\u0025', '\u002F'],
        ['\u003A', '\u003F'],
        ['\u0061', '\u007A'],
        ['\u005E', '\u005F'],
    ];

    constructor(config) {
        super('UNOB', config, UNOB.charset);
    }
}

/** Character table for 'UNOC' (ISO 8859-1). */
class UNOC extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Codepage_layout
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        ['\u00A0', '\u00FF'],
    ];

    constructor(config) {
        super('UNOC', config, UNOC.charset);
    }
}

/** Character table for 'UNOD' (ISO 8859-2). */
class UNOD extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-2
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        // 0xA0 - 0xAF. Position 0xA8 is U+00A8 (DIAERESIS); it used to be
        // mistyped as U+0048 ('H'), which left the diaeresis out of the set.
        ['\u00A0'], ['\u0104'], ['\u02D8'], ['\u0141'], ['\u00A4'], ['\u013D'], ['\u015A'], ['\u00A7'],
        ['\u00A8'], ['\u0160'], ['\u015E'], ['\u0164'], ['\u0179'], ['\u00AD'], ['\u017D'], ['\u017B'],
        // 0xB0 - 0xBF
        ['\u00B0'], ['\u0105'], ['\u02DB'], ['\u0142'], ['\u00B4'], ['\u013E'], ['\u015B'], ['\u02C7'],
        ['\u00B8'], ['\u0161'], ['\u015F'], ['\u0165'], ['\u017A'], ['\u02DD'], ['\u017E'], ['\u017C'],
        // 0xC0 - 0xCF
        ['\u0154'], ['\u00C1'], ['\u00C2'], ['\u0102'], ['\u00C4'], ['\u0139'], ['\u0106'], ['\u00C7'],
        ['\u010C'], ['\u00C9'], ['\u0118'], ['\u00CB'], ['\u011A'], ['\u00CD'], ['\u00CE'], ['\u010E'],
        // 0xD0 - 0xDF
        ['\u0110'], ['\u0143'], ['\u0147'], ['\u00D3'], ['\u00D4'], ['\u0150'], ['\u00D6'], ['\u00D7'],
        ['\u0158'], ['\u016E'], ['\u00DA'], ['\u0170'], ['\u00DC'], ['\u00DD'], ['\u0162'], ['\u00DF'],
        // 0xE0 - 0xEF
        ['\u0155'], ['\u00E1'], ['\u00E2'], ['\u0103'], ['\u00E4'], ['\u013A'], ['\u0107'], ['\u00E7'],
        ['\u010D'], ['\u00E9'], ['\u0119'], ['\u00EB'], ['\u011B'], ['\u00ED'], ['\u00EE'], ['\u010F'],
        // 0xF0 - 0xFF
        ['\u0111'], ['\u0144'], ['\u0148'], ['\u00F3'], ['\u00F4'], ['\u0151'], ['\u00F6'], ['\u00F7'],
        ['\u0159'], ['\u016F'], ['\u00FA'], ['\u0171'], ['\u00FC'], ['\u00FD'], ['\u0163'], ['\u02D9'],
    ];

    constructor(config) {
        super('UNOD', config, UNOD.charset);
    }
}

/** Character table for 'UNOE' (ISO 8859-5). */
class UNOE extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-5
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        ['\u00A0'],
        ['\u0401', '\u040C'],
        ['\u00AD'],
        ['\u040E', '\u044F'],
        ['\u2116'],
        ['\u0451', '\u045C'],
        ['\u00A7'],
        ['\u045E'],
        ['\u045F'],
    ];

    constructor(config) {
        super('UNOE', config, UNOE.charset);
    }
}

/** Character table for 'UNOF' (ISO 8859-7). */
class UNOF extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-7
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        ['\u00A0'],
        ['\u2018', '\u2019'],
        ['\u00A3'],
        ['\u20AC'],
        ['\u20AF'],
        ['\u00A6', '\u00A9'],
        ['\u037A'],
        ['\u00AB', '\u00AD'],
        ['\u2015'],
        ['\u00B0', '\u00B3'],
        ['\u0384', '\u0386'],
        ['\u00B7'],
        ['\u0388', '\u038A'],
        ['\u00BB'],
        // 0xBC - 0xBF: U+038C, U+00BD (VULGAR FRACTION ONE HALF), U+038E, U+038F.
        // A single range U+038C..U+03A1 would include the unassigned U+038D
        // and miss U+00BD, so the positions are listed separately.
        ['\u038C'],
        ['\u00BD'],
        ['\u038E', '\u03A1'],
        ['\u03A3', '\u03CE'],
    ];

    constructor(config) {
        super('UNOF', config, UNOF.charset);
    }
}

/** Character table for 'UNOG' (ISO 8859-3). */
class UNOG extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-3
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        // 0xA0 - 0xAF
        ['\u00A0'], ['\u0126'], ['\u02D8'], ['\u00A3'], ['\u00A4'], ['\u0124'], ['\u00A7'],
        ['\u00A8'], ['\u0130'], ['\u015E'], ['\u011E'], ['\u0134'], ['\u00AD'], ['\u017B'],
        // 0xB0 - 0xBF
        ['\u00B0'], ['\u0127'], ['\u00B2'], ['\u00B3', '\u00B5'], ['\u0125'], ['\u00B7'],
        ['\u00B8'], ['\u0131'], ['\u015F'], ['\u011F'], ['\u0135'], ['\u00BD'], ['\u017C'],
        // 0xC0 - 0xCF
        ['\u00C0', '\u00C2'], ['\u00C4'], ['\u010A'], ['\u0108'], ['\u00C7', '\u00CF'],
        // 0xD0 - 0xDF (the last range continues up to 0xE2)
        ['\u00D1', '\u00D4'], ['\u0120'], ['\u00D6'], ['\u00D7'], ['\u011C'], ['\u00D9', '\u00DC'],
        ['\u016C'], ['\u015C'], ['\u00DF', '\u00E2'],
        // 0xE4 - 0xEF
        ['\u00E4'], ['\u010B'], ['\u0109'], ['\u00E7', '\u00EF'],
        // 0xF0 - 0xFF
        ['\u00F1', '\u00F4'], ['\u0121'], ['\u00F6'], ['\u00F7'], ['\u011D'], ['\u00F9', '\u00FC'],
        ['\u016D'], ['\u015D'], ['\u02D9'],
    ];

    constructor(config) {
        super('UNOG', config, UNOG.charset);
    }
}

/** Character table for 'UNOH' (ISO 8859-4). */
class UNOH extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-4
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        // 0xA0 - 0xAF
        ['\u00A0'], ['\u0104'], ['\u0138'], ['\u0156'], ['\u00A4'], ['\u0128'], ['\u013B'], ['\u00A7'],
        ['\u00A8'], ['\u0160'], ['\u0112'], ['\u0122'], ['\u0166'], ['\u00AD'], ['\u017D'], ['\u00AF'],
        // 0xB0 - 0xBF
        ['\u00B0'], ['\u0105'], ['\u02DB'], ['\u0157'], ['\u00B4'], ['\u0129'], ['\u013C'], ['\u02C7'],
        ['\u00B8'], ['\u0161'], ['\u0113'], ['\u0123'], ['\u0167'], ['\u014A'], ['\u017E'], ['\u014B'],
        // 0xC0 - 0xCF
        ['\u0100'], ['\u00C1', '\u00C6'], ['\u012E'], ['\u010C'], ['\u00C9'], ['\u0118'], ['\u00CB'],
        ['\u0116'], ['\u00CD'], ['\u00CE'], ['\u012A'],
        // 0xD0 - 0xDF
        ['\u0110'], ['\u0145'], ['\u014C'], ['\u0136'], ['\u00D4', '\u00D8'], ['\u0172'],
        ['\u00DA', '\u00DC'], ['\u0168'], ['\u016A'], ['\u00DF'],
        // 0xE0 - 0xEF. Position 0xEF is U+012B (LATIN SMALL LETTER I WITH MACRON);
        // it used to repeat U+0128 from position 0xA5.
        ['\u0101'], ['\u00E1', '\u00E6'], ['\u012F'], ['\u010D'], ['\u00E9'], ['\u0119'], ['\u00EB'],
        ['\u0117'], ['\u00ED'], ['\u00EE'], ['\u012B'],
        // 0xF0 - 0xFF. Position 0xFE is U+016B (LATIN SMALL LETTER U WITH MACRON);
        // it used to repeat U+0168 from position 0xDD.
        ['\u0111'], ['\u0146'], ['\u014D'], ['\u0137'], ['\u00F4', '\u00F8'], ['\u0173'],
        ['\u00FA', '\u00FC'], ['\u0169'], ['\u016B'], ['\u02D9'],
    ];

    constructor(config) {
        super('UNOH', config, UNOH.charset);
    }
}

/** Character table for 'UNOI' (ISO 8859-6). */
class UNOI extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-6
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        ['\u00A0'],
        ['\u00A4'],
        ['\u060C'],
        ['\u00AD'],
        ['\u061B'],
        ['\u061F'],
        ['\u0621', '\u063A'],
        ['\u0640', '\u0652'],
    ];

    constructor(config) {
        super('UNOI', config, UNOI.charset);
    }
}

/** Character table for 'UNOJ' (ISO 8859-8). */
class UNOJ extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-8
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        ['\u00A0'],
        ['\u00A2', '\u00A9'],
        ['\u00D7'],
        ['\u00AB', '\u00B9'],
        ['\u00F7'],
        ['\u00BB', '\u00BE'],
        ['\u2017'],
        ['\u05D0', '\u05EA'],
        ['\u200E'],
        ['\u200F'],
    ];

    constructor(config) {
        super('UNOJ', config, UNOJ.charset);
    }
}

/** Character table for 'UNOK' (ISO 8859-9). */
class UNOK extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-9
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        ['\u00A0', '\u00CF'],
        ['\u011E'],
        ['\u00D1', '\u00DC'],
        ['\u0130'],
        ['\u015E'],
        ['\u00DF', '\u00EF'],
        ['\u011F'],
        ['\u00F1', '\u00FC'],
        ['\u0131'],
        ['\u015F'],
        ['\u00FF'],
    ];

    constructor(config) {
        super('UNOK', config, UNOK.charset);
    }
}

// // TODO
// class UNOX extends Charset {
//     // https://en.wikipedia.org/wiki/ISO/IEC_2022#ISO-2022-JP
//     private static charset: string[][] = [];
//     constructor(config: Configuration) {
//         super("UNOX", config, UNOX.charset);
//     }
// }
// class UNOY extends Charset {
//     // https://en.wikipedia.org/wiki/Universal_Coded_Character_Set
//     private static charset: string[][] = [
//         ["\u0020", "\u002F"],
//         ["\u003A", "\u007E"],
//         ["\u00A0", "\u{FFFFF}"]
//     ];
//     constructor(config: Configuration) {
//         super("UNOY", config, UNOY.charset, true);
//     }
// }
// class UCS2 extends Charset {
//     // https://en.wikipedia.org/wiki/Universal_Coded_Character_Set
//     private static charset: string[][] = [
//         ["\u0020", "\u002F"],
//         ["\u003A", "\u007E"],
//         ["\u00A0", "\uD800"],
//         ["\uE000", "\uFFFF"]
//     ];
//     constructor(config: Configuration) {
//         super("UCS2", config, UCS2.charset, true);
//     }
// }
// class KECA extends Charset {
//     private static charset: string[][] = [
//         // . , – ( ) / = ! ” % & * ; < >
//         ["\u0021", "\u0022"], // ! "
//         ["\u0041", "\u005A"], // A-Z
//         ["\u0025", "\u0026"], // % &
//         ["\u0028", "\u002A"], // ( ) *
//         ["\u002B", "\u002F"], // , - . /
//         ["\u003B", "\u003E"]  // ; < = >
//         // Korean Syllables (2350 characters)
//         // Korean Hanja (4888 characters)
//         // Korean Alphabets
//         // Characters and numbers enclosed in a circle
//         // The length of the strings are counted by bytes instead of characters.
//         // So if you have a data element of length 3, you can have 3 latin characters,
//         // 1 Korean character or 1 Korean and 1 Latin character!
//     ];
//     constructor(config: Configuration) {
//         super("KECA", config, KECA.charset);
//     }
// }

/**
 * Supported values of `config.charset`, mapped to the Charset implementation
 * that provides the matching regular expressions.
 */
const SUPPORTED_CHARSETS = new Map([
    ['UNOA', UNOA], // ISO 646 without lowercase letters and a couple of symbols
    ['UNOB', UNOB], // ISO 646
    ['UNOC', UNOC], // ISO 8859-1: Latin alphabet No. 1
    ['UNOD', UNOD], // ISO 8859-2: Latin alphabet No. 2
    ['UNOE', UNOE], // ISO 8859-5: Latin/Cyrillic alphabet
    ['UNOF', UNOF], // ISO 8859-7: Latin/Greek alphabet
    ['UNOG', UNOG], // ISO 8859-3: Latin alphabet
    ['UNOH', UNOH], // ISO 8859-4: Latin alphabet
    ['UNOI', UNOI], // ISO 8859-6: Latin/Arabic alphabet
    ['UNOJ', UNOJ], // ISO 8859-8: Latin/Hebrew alphabet
    ['UNOK', UNOK], // ISO 8859-9: Latin alphabet
    // TODO:
    // "UNOX": ISO 2022-JP: Japanese; escape techniques in accordance with ISO 2375
    // "UNOY": ISO 10646-1 without code extension technique
    // "KECA"
]);

/**
 * Accumulates characters of an input chunk into an internal buffer, using the
 * regular expression of the currently selected mode (alpha, alphanumeric,
 * numeric or decimal) to decide how far to read.
 */
export class Tokenizer {
    regexes;
    regex;
    buffer;

    errors = {
        secondDecimalMark: () => new Error('Cannot accept a second decimal mark while parsing a number'),
    };

    /**
     * @param {object} config - Needs a `charset` name and whatever the selected
     *   Charset reads from it (`delimiters()`).
     * @throws {Error} If `config.charset` is not a supported character set.
     */
    constructor(config) {
        this.regexes = this.setCharsetBasedOnConfig(config);
        this.regex = this.regexes.alphanumeric;
        this.buffer = '';
    }

    /** Switches to alpha mode. */
    alpha() {
        this.regex = this.regexes.alpha;
    }

    /** Switches to alphanumeric mode. */
    alphanumeric() {
        this.regex = this.regexes.alphanumeric;
    }

    /** Switches to numeric mode. */
    numeric() {
        this.regex = this.regexes.numeric;
    }

    /**
     * Handles a decimal mark located at `chunk[index]`.
     *
     * - numeric mode: switches to decimal mode and appends `'.'` to the buffer.
     * - alpha/alphanumeric mode: appends the character found at `index` unchanged.
     * - decimal mode: a mark was already consumed, so an error is thrown.
     *
     * @param {string} chunk - Input currently being read.
     * @param {number} index - Position of the decimal mark within `chunk`.
     * @throws {Error} When called while already in decimal mode.
     */
    decimal(chunk, index) {
        const { regex, regexes } = this;
        let result = '.';

        if (regex === regexes.numeric) {
            this.regex = regexes.decimal;
        } else if (regex === regexes.alpha || regex === regexes.alphanumeric) {
            result = chunk.charAt(index);
        } else if (regex === regexes.decimal) {
            throw this.errors.secondDecimalMark();
        }

        this.buffer += result;
    }

    /**
     * Instantiates the Charset named by `config.charset`, stores it in
     * `this.regexes` and returns it.
     *
     * @param {object} config - Configuration carrying the `charset` name.
     * @returns {Charset} The charset that was selected.
     * @throws {Error} If the charset name is not supported.
     */
    setCharsetBasedOnConfig(config) {
        const CharsetClass = SUPPORTED_CHARSETS.get(config.charset);
        if (CharsetClass === undefined) {
            throw new Error(`Unsupported charset encoding '${config.charset}'`);
        }
        this.regexes = new CharsetClass(config);
        return this.regexes;
    }

    /**
     * Reads a run of ASCII uppercase letters (A-Z) starting at `index` and
     * appends it to the buffer.
     *
     * @param {string} chunk - Input currently being read.
     * @param {number} index - Position to start reading at.
     * @returns {number} Index of the first character that was not consumed.
     */
    segment(chunk, index) {
        // charCodeAt yields NaN outside of the string, which fails both
        // comparisons and therefore ends the loop.
        const isUppercase = (code) => code > 64 && code < 91;

        let end = index;
        while (isUppercase(chunk.charCodeAt(end))) {
            end++;
        }

        this.buffer += chunk.slice(index, end);
        return end;
    }

    /**
     * Applies the current mode's expression at `index` and appends whatever
     * it matched to the buffer.
     *
     * @param {string} chunk - Input currently being read.
     * @param {number} index - Position to start matching at.
     * @returns {number} The expression's `lastIndex` after the match attempt.
     */
    data(chunk, index) {
        const { regex } = this;
        // The expressions are global, so matching starts at lastIndex.
        regex.lastIndex = index;
        regex.test(chunk);
        this.buffer += chunk.slice(index, regex.lastIndex);
        return regex.lastIndex;
    }

    /**
     * Appends the character at `index` to the buffer without validating it.
     *
     * @param {string} chunk - Input currently being read.
     * @param {number} index - Position of the character to take over.
     */
    release(chunk, index) {
        this.buffer += chunk.charAt(index);
    }

    /**
     * @returns {number} Length of the buffer; in decimal mode one character
     *   (the decimal mark added by `decimal`) is not counted.
     */
    length() {
        const decimalMarkLength = this.regex === this.regexes.decimal ? 1 : 0;
        return this.buffer.length - decimalMarkLength;
    }

    /** @returns {string} The characters accumulated so far. */
    content() {
        return this.buffer;
    }
}
//# sourceMappingURL=tokenizer.js.map