UNPKG

@makakwastaken/ts-edifact

Version:

Edifact parser library

github.com/MakakWasTaken/ts-edifact/README.md

MakakWasTaken/ts-edifact

649 lines • 17.9 kB

JavaScript

/**
 * @author Roman Vottner
 * @copyright 2020 Roman Vottner
 * @license Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import { Separators } from './edi/separators';

/**
 * A named set of global regular expressions used to consume alpha,
 * alphanumeric, numeric and decimal data from a chunk of text.
 */
export class Charset {
    name;
    alpha;
    alphanumeric;
    numeric;
    decimal;

    /**
     * @param {string} name - Identifier of the charset (e.g. 'UNOA').
     * @param {object} configuration - Must expose `delimiters()`; its result is
     *   passed to `compile` as the exclusion list.
     * @param {string[][]} admissibleAlphabet - Entries of one character
     *   (`[char]`) or an inclusive range (`[first, last]`).
     * @param {boolean} [unicode=false] - Build the expressions with the `u` flag
     *   and match digits via `\p{Nd}` instead of `0-9`.
     */
    constructor(name, configuration, admissibleAlphabet, unicode = false) {
        this.name = name;
        const exclude = configuration.delimiters();
        const alphas = this.compile(admissibleAlphabet, exclude, unicode);
        this.alpha = alphas[0];
        this.alphanumeric = alphas[1];
        // Parsing decimals is a multiple step process. First, the numeric part will be parsed, then the decimal
        // separator added and last the decimal part of the value added to the end of that value. So there is no
        // need to catch the whole decimal value with a single regular expression.
        if (unicode) {
            this.numeric = /[-]?[\p{Nd}]*/gu;
            this.decimal = /[\p{Nd}]*/gu;
        } else {
            this.numeric = /[-]?[0-9]*/g;
            this.decimal = /[0-9]*/g;
        }
    }

    /**
     * Builds the alpha and alphanumeric expressions for an alphabet.
     *
     * @param {string[][]} admissibleAlphabet - Single characters or inclusive ranges.
     * @param {number[]} excludes - Values compared against code points with
     *   `includes`; presumably the code points of the configured delimiters
     *   (verify against the configuration class).
     * @param {boolean} [unicode=false] - Adds the `u` flag to both expressions.
     * @returns {RegExp[]} `[alpha, alphanumeric]`; the second additionally accepts `0-9`.
     */
    compile(admissibleAlphabet, excludes, unicode = false) {
        // String.fromCharCode(parseInt("\u002F".codePointAt(0).toString(16), 16)) --> '/'
        const flag = unicode ? 'gu' : 'g';
        let output = '';
        // Excluded code points are filtered for single entries as well as for
        // ranges, so an excluded character never ends up in the character class.
        const append = (codePoint) => {
            if (!excludes.includes(codePoint)) {
                output += Separators.escapeIfNeeded(String.fromCodePoint(codePoint));
            }
        };
        for (const seq of admissibleAlphabet) {
            if (seq.length > 1) {
                const start = seq[0].codePointAt(0); // '/' --> 47 as the 47 character in the codepage
                const end = seq[1].codePointAt(0);
                // Truthiness check: empty strings (undefined) and code point 0 are skipped.
                if (start && end) {
                    for (let i = start; i <= end; i++) {
                        append(i);
                    }
                }
            } else {
                const idx = seq[0].codePointAt(0);
                if (idx) {
                    append(idx);
                }
            }
        }
        return [
            new RegExp(`[${output}]*`, flag),
            new RegExp(`[0-9${output}]*`, flag),
        ];
    }
}

/*
enum Modes {
    alphanumeric = 0,
    alpha = 1,
    numeric = 2,
    decimal = 3
}
*/

/** Charset 'UNOA': uppercase letters plus a small set of symbols. */
class UNOA extends Charset {
    static charset = [
        ['\u0020'], // (space)
        ['\u0028', '\u0029'], // ( )
        ['\u002C', '\u002F'], // , - . /
        ['\u003D'], // =
        ['\u0041', '\u005A'], // A-Z
    ];

    constructor(config) {
        super('UNOA', config, UNOA.charset);
    }
}

/** Charset 'UNOB': upper- and lowercase letters plus common punctuation. */
class UNOB extends Charset {
    static charset = [
        ['\u0020', '\u0022'], // (space) ! "
        ['\u0041', '\u005A'], // A-Z
        ['\u0025', '\u002F'], // % & ' ( ) * + , - . /
        ['\u003A', '\u003F'], // : ; < = > ?
        ['\u0061', '\u007A'], // a-z
        ['\u005E', '\u005F'], // ^ _
    ];

    constructor(config) {
        super('UNOB', config, UNOB.charset);
    }
}

/** Charset 'UNOC' (ISO 8859-1). */
class UNOC extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Codepage_layout
    static charset = [
        ['\u0020', '\u002F'], // basic symbols and punctuation characters
        ['\u003A', '\u007E'], // basic characters
        ['\u00A0', '\u00FF'], // special characters
    ];

    constructor(config) {
        super('UNOC', config, UNOC.charset);
    }
}

/** Charset 'UNOD' (ISO 8859-2). */
class UNOD extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-2
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        // 0xA0 - 0xAF (0xA8 is the diaeresis U+00A8; it was previously mistyped as U+0048)
        ['\u00A0'], ['\u0104'], ['\u02D8'], ['\u0141'], ['\u00A4'], ['\u013D'], ['\u015A'], ['\u00A7'],
        ['\u00A8'], ['\u0160'], ['\u015E'], ['\u0164'], ['\u0179'], ['\u00AD'], ['\u017D'], ['\u017B'],
        // 0xB0 - 0xBF
        ['\u00B0'], ['\u0105'], ['\u02DB'], ['\u0142'], ['\u00B4'], ['\u013E'], ['\u015B'], ['\u02C7'],
        ['\u00B8'], ['\u0161'], ['\u015F'], ['\u0165'], ['\u017A'], ['\u02DD'], ['\u017E'], ['\u017C'],
        // 0xC0 - 0xCF
        ['\u0154'], ['\u00C1'], ['\u00C2'], ['\u0102'], ['\u00C4'], ['\u0139'], ['\u0106'], ['\u00C7'],
        ['\u010C'], ['\u00C9'], ['\u0118'], ['\u00CB'], ['\u011A'], ['\u00CD'], ['\u00CE'], ['\u010E'],
        // 0xD0 - 0xDF
        ['\u0110'], ['\u0143'], ['\u0147'], ['\u00D3'], ['\u00D4'], ['\u0150'], ['\u00D6'], ['\u00D7'],
        ['\u0158'], ['\u016E'], ['\u00DA'], ['\u0170'], ['\u00DC'], ['\u00DD'], ['\u0162'], ['\u00DF'],
        // 0xE0 - 0xEF
        ['\u0155'], ['\u00E1'], ['\u00E2'], ['\u0103'], ['\u00E4'], ['\u013A'], ['\u0107'], ['\u00E7'],
        ['\u010D'], ['\u00E9'], ['\u0119'], ['\u00EB'], ['\u011B'], ['\u00ED'], ['\u00EE'], ['\u010F'],
        // 0xF0 - 0xFF
        ['\u0111'], ['\u0144'], ['\u0148'], ['\u00F3'], ['\u00F4'], ['\u0151'], ['\u00F6'], ['\u00F7'],
        ['\u0159'], ['\u016F'], ['\u00FA'], ['\u0171'], ['\u00FC'], ['\u00FD'], ['\u0163'], ['\u02D9'],
    ];

    constructor(config) {
        super('UNOD', config, UNOD.charset);
    }
}

/** Charset 'UNOE' (ISO 8859-5). */
class UNOE extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-5
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        ['\u00A0'],
        ['\u0401', '\u040C'],
        ['\u00AD'],
        ['\u040E', '\u044F'], // spans 5 lines
        ['\u2116'],
        ['\u0451', '\u045C'],
        ['\u00A7'],
        ['\u045E'],
        ['\u045F'],
    ];

    constructor(config) {
        super('UNOE', config, UNOE.charset);
    }
}

/** Charset 'UNOF' (ISO 8859-7). */
class UNOF extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-7
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        ['\u00A0'],
        ['\u2018', '\u2019'],
        ['\u00A3'],
        ['\u20AC'],
        ['\u20AF'],
        ['\u00A6', '\u00A9'],
        ['\u037A'],
        ['\u00AB', '\u00AD'],
        ['\u2015'],
        ['\u00B0', '\u00B3'],
        ['\u0384', '\u0386'],
        ['\u00B7'],
        ['\u0388', '\u038A'],
        ['\u00BB'],
        ['\u00BD'], // 0xBD (one half) sits between the Greek letters in the code page
        ['\u038C', '\u03A1'],
        ['\u03A3', '\u03CE'],
    ];

    constructor(config) {
        super('UNOF', config, UNOF.charset);
    }
}

/** Charset 'UNOG' (ISO 8859-3). */
class UNOG extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-3
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        ['\u00A0'], ['\u0126'], ['\u02D8'], ['\u00A3'], ['\u00A4'], ['\u0124'], ['\u00A7'], ['\u00A8'],
        ['\u0130'], ['\u015E'], ['\u011E'], ['\u0134'], ['\u00AD'], ['\u017B'],
        ['\u00B0'], ['\u0127'], ['\u00B2'], ['\u00B3', '\u00B5'], ['\u0125'], ['\u00B7'], ['\u00B8'],
        ['\u0131'], ['\u015F'], ['\u011F'], ['\u0135'], ['\u00BD'], ['\u017C'],
        ['\u00C0', '\u00C2'], ['\u00C4'], ['\u010A'], ['\u0108'], ['\u00C7', '\u00CF'],
        ['\u00D1', '\u00D4'], ['\u0120'], ['\u00D6'], ['\u00D7'], ['\u011C'], ['\u00D9', '\u00DC'],
        ['\u016C'], ['\u015C'],
        ['\u00DF', '\u00E2'], ['\u00E4'], ['\u010B'], ['\u0109'], ['\u00E7', '\u00EF'],
        ['\u00F1', '\u00F4'], ['\u0121'], ['\u00F6'], ['\u00F7'], ['\u011D'], ['\u00F9', '\u00FC'],
        ['\u016D'], ['\u015D'], ['\u02D9'],
    ];

    constructor(config) {
        super('UNOG', config, UNOG.charset);
    }
}

/** Charset 'UNOH' (ISO 8859-4). */
class UNOH extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-4
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        // 0xA0 - 0xAF
        ['\u00A0'], ['\u0104'], ['\u0138'], ['\u0156'], ['\u00A4'], ['\u0128'], ['\u013B'], ['\u00A7'],
        ['\u00A8'], ['\u0160'], ['\u0112'], ['\u0122'], ['\u0166'], ['\u00AD'], ['\u017D'], ['\u00AF'],
        // 0xB0 - 0xBF
        ['\u00B0'], ['\u0105'], ['\u02DB'], ['\u0157'], ['\u00B4'], ['\u0129'], ['\u013C'], ['\u02C7'],
        ['\u00B8'], ['\u0161'], ['\u0113'], ['\u0123'], ['\u0167'], ['\u014A'], ['\u017E'], ['\u014B'],
        // 0xC0 - 0xCF
        ['\u0100'], ['\u00C1', '\u00C6'], ['\u012E'],
        ['\u010C'], ['\u00C9'], ['\u0118'], ['\u00CB'], ['\u0116'], ['\u00CD'], ['\u00CE'], ['\u012A'],
        // 0xD0 - 0xDF
        ['\u0110'], ['\u0145'], ['\u014C'], ['\u0136'], ['\u00D4', '\u00D8'],
        ['\u0172'], ['\u00DA', '\u00DC'], ['\u0168'], ['\u016A'], ['\u00DF'],
        // 0xE0 - 0xEF (0xEF is U+012B; it was previously a duplicate of U+0128)
        ['\u0101'], ['\u00E1', '\u00E6'], ['\u012F'],
        ['\u010D'], ['\u00E9'], ['\u0119'], ['\u00EB'], ['\u0117'], ['\u00ED'], ['\u00EE'], ['\u012B'],
        // 0xF0 - 0xFF (0xFE is U+016B; it was previously a duplicate of U+0168)
        ['\u0111'], ['\u0146'], ['\u014D'], ['\u0137'], ['\u00F4', '\u00F8'],
        ['\u0173'], ['\u00FA', '\u00FC'], ['\u0169'], ['\u016B'], ['\u02D9'],
    ];

    constructor(config) {
        super('UNOH', config, UNOH.charset);
    }
}

/** Charset 'UNOI' (ISO 8859-6). */
class UNOI extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-6
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        ['\u00A0'],
        ['\u00A4'],
        ['\u060C'],
        ['\u00AD'],
        ['\u061B'],
        ['\u061F'],
        ['\u0621', '\u063A'],
        ['\u0640', '\u0652'],
    ];

    constructor(config) {
        super('UNOI', config, UNOI.charset);
    }
}

/** Charset 'UNOJ' (ISO 8859-8). */
class UNOJ extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-8
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        ['\u00A0'],
        ['\u00A2', '\u00A9'],
        ['\u00D7'],
        ['\u00AB', '\u00B9'],
        ['\u00F7'],
        ['\u00BB', '\u00BE'],
        ['\u2017'],
        ['\u05D0', '\u05EA'],
        ['\u200E'],
        ['\u200F'],
    ];

    constructor(config) {
        super('UNOJ', config, UNOJ.charset);
    }
}

/** Charset 'UNOK' (ISO 8859-9). */
class UNOK extends Charset {
    // https://en.wikipedia.org/wiki/ISO/IEC_8859-9
    static charset = [
        ['\u0020', '\u002F'],
        ['\u003A', '\u007E'],
        ['\u00A0', '\u00CF'], // covers 3 lines
        ['\u011E'],
        ['\u00D1', '\u00DC'],
        ['\u0130'],
        ['\u015E'],
        ['\u00DF', '\u00EF'], // covers 2 lines
        ['\u011F'],
        ['\u00F1', '\u00FC'],
        ['\u0131'],
        ['\u015F'],
        ['\u00FF'],
    ];

    constructor(config) {
        super('UNOK', config, UNOK.charset);
    }
}

// // TODO
// class UNOX extends Charset {
//     // https://en.wikipedia.org/wiki/ISO/IEC_2022#ISO-2022-JP
//     private static charset: string[][] = [];
//     constructor(config: Configuration) {
//         super("UNOX", config, UNOX.charset);
//     }
// }
// class UNOY extends Charset {
//     // https://en.wikipedia.org/wiki/Universal_Coded_Character_Set
//     private static charset: string[][] = [
//         ["\u0020", "\u002F"],
//         ["\u003A", "\u007E"],
//         ["\u00A0", "\u{FFFFF}"]
//     ];
//     constructor(config: Configuration) {
//         super("UNOY", config, UNOY.charset, true);
//     }
// }
// class UCS2 extends Charset {
//     // https://en.wikipedia.org/wiki/Universal_Coded_Character_Set
//     private static charset: string[][] = [
//         ["\u0020", "\u002F"],
//         ["\u003A", "\u007E"],
//         ["\u00A0", "\uD800"],
//         ["\uE000", "\uFFFF"]
//     ];
//     constructor(config: Configuration) {
//         super("UCS2", config, UCS2.charset, true);
//     }
// }
// class KECA extends Charset {
//     private static charset: string[][] = [
//         // . , – ( ) / = ! ” % & * ; < >
//         ["\u0021", "\u0022"], // ! "
//         ["\u0041", "\u005A"], // A-Z
//         ["\u0025", "\u0026"], // % &
//         ["\u0028", "\u002A"], // ( ) *
//         ["\u002B", "\u002F"], // , - . /
//         ["\u003B", "\u003E"] // ; < = >
//         // Korean Syllables (2350 characters)
//         // Korean Hanja (4888 characters)
//         // Korean Alphabets
//         // Characters and numbers enclosed in a circle
//         // The length of the strings are counted by bytes instead of characters.
//         // So if you have a data element of length 3, you can have 3 latin characters,
//         // 1 Korean character or 1 Korean and 1 Latin character!
//     ];
//     constructor(config: Configuration) {
//         super("KECA", config, KECA.charset);
//     }
// }

/**
 * Accumulates token text in an internal buffer. The active regular expression
 * (selected via `alpha()`, `alphanumeric()`, `numeric()` or `decimal()`)
 * decides which characters `data()` consumes.
 */
export class Tokenizer {
    regexes;
    regex;
    buffer;

    /** Switches `data()` to the alpha expression. */
    alpha() {
        this.regex = this.regexes.alpha;
    }

    /** Switches `data()` to the alphanumeric expression. */
    alphanumeric() {
        this.regex = this.regexes.alphanumeric;
    }

    /** Switches `data()` to the numeric expression. */
    numeric() {
        this.regex = this.regexes.numeric;
    }

    /**
     * Handles a decimal mark found at `chunk[index]`.
     *
     * In numeric mode a '.' is appended to the buffer and the tokenizer
     * switches to the decimal expression. In alpha or alphanumeric mode the
     * character itself is appended unchanged.
     *
     * @param {string} chunk - Text currently being tokenized.
     * @param {number} index - Position of the decimal mark in `chunk`.
     * @throws {Error} If the tokenizer is already in decimal mode.
     */
    decimal(chunk, index) {
        let result = '.';
        switch (this.regex) {
            case this.regexes.numeric:
                this.regex = this.regexes.decimal;
                break;
            case this.regexes.alpha:
            case this.regexes.alphanumeric:
                result = chunk.charAt(index);
                break;
            case this.regexes.decimal:
                throw this.errors.secondDecimalMark();
        }
        this.buffer += result;
    }

    /**
     * @param {object} config - Needs a `charset` name and is handed on to the
     *   selected Charset subclass.
     * @throws {Error} If `config.charset` is not supported.
     */
    constructor(config) {
        this.regexes = this.setCharsetBasedOnConfig(config);
        this.regex = this.regexes.alphanumeric;
        this.buffer = '';
    }

    /**
     * Instantiates the charset named by `config.charset`, stores it in
     * `this.regexes` and returns it.
     *
     * @param {object} config - Configuration with a `charset` property.
     * @returns {Charset} The created charset.
     * @throws {Error} For an unsupported charset name.
     */
    setCharsetBasedOnConfig(config) {
        switch (config.charset) {
            case 'UNOA':
                // ISO 646 without lowercase letters and a couple of symbols
                this.regexes = new UNOA(config);
                break;
            case 'UNOB':
                // ISO 646
                this.regexes = new UNOB(config);
                break;
            case 'UNOC':
                // ISO 8859-1: Latin alphabet No. 1
                this.regexes = new UNOC(config);
                break;
            case 'UNOD':
                // ISO 8859-2: Latin alphabet No. 2
                this.regexes = new UNOD(config);
                break;
            case 'UNOE':
                // ISO 8859-5: Latin/Cyrillic alphabet
                this.regexes = new UNOE(config);
                break;
            case 'UNOF':
                // ISO 8859-7: Latin/Greek alphabet
                this.regexes = new UNOF(config);
                break;
            case 'UNOG':
                // ISO 8859-3: Latin alphabet
                this.regexes = new UNOG(config);
                break;
            case 'UNOH':
                // ISO 8859-4: Latin alphabet
                this.regexes = new UNOH(config);
                break;
            case 'UNOI':
                // ISO 8859-6: Latin/Arabic alphabet
                this.regexes = new UNOI(config);
                break;
            case 'UNOJ':
                // ISO 8859-8: Latin/Hebrew alphabet
                this.regexes = new UNOJ(config);
                break;
            case 'UNOK':
                // ISO 8859-9: Latin alphabet
                this.regexes = new UNOK(config);
                break;
            // TODO:
            // case "UNOX":
            //     // ISO 2022-JP: Japanese; escape techniques in accordance with ISO 2375
            //     break;
            // case "UNOY":
            //     // ISO 10646-1 without code extension technique
            //     break;
            // case "KECA":
            //     break;
            default:
                throw new Error(`Unsupported charset encoding '${config.charset}'`);
        }
        return this.regexes;
    }

    /**
     * Appends the run of ASCII uppercase letters starting at `index` to the buffer.
     *
     * @param {string} chunk - Text currently being tokenized.
     * @param {number} index - Position to start reading from.
     * @returns {number} Index of the first character that was not consumed.
     */
    segment(chunk, index) {
        let code;
        // Read segment name data from the buffer
        const start = index;
        // Consume available ASCII uppercase characters; past the end of the
        // chunk charCodeAt yields NaN, which `|| 0` turns into a terminator.
        while ((code = chunk.charCodeAt(index) || 0) < 91 && code > 64) {
            index++;
        }
        this.buffer += chunk.slice(start, index);
        return index;
    }

    /**
     * Appends the characters matched by the active expression, starting at
     * `index`, to the buffer.
     *
     * @param {string} chunk - Text currently being tokenized.
     * @param {number} index - Position to start matching at.
     * @returns {number} The expression's `lastIndex` after the match attempt.
     */
    data(chunk, index) {
        // The expressions are global, so the match starts at lastIndex and
        // lastIndex afterwards marks the end of the consumed text.
        this.regex.lastIndex = index;
        this.regex.test(chunk);
        this.buffer += chunk.slice(index, this.regex.lastIndex);
        return this.regex.lastIndex;
    }

    /**
     * Appends the single character at `index` to the buffer without matching it.
     *
     * @param {string} chunk - Text currently being tokenized.
     * @param {number} index - Position of the character to append.
     */
    release(chunk, index) {
        this.buffer += chunk.charAt(index);
    }

    /**
     * @returns {number} Buffer length; one less while in decimal mode, where
     *   the buffer contains the decimal mark appended by `decimal()`.
     */
    length() {
        return this.buffer.length - (this.regex === this.regexes.decimal ? 1 : 0);
    }

    /** @returns {string} The buffered text. */
    content() {
        return this.buffer;
    }

    errors = {
        secondDecimalMark: () => new Error('Cannot accept a second decimal mark while parsing a number'),
    };
}
//# sourceMappingURL=tokenizer.js.map