UNPKG

@islah/math-emo

Version:

MATH-EMO is a Node.js project that abstracts high-level code down to an assembly-like layer, so people can easily create their own programming languages. See the README for more details.

443 lines (406 loc) 11.3 kB
// ---------------------------------------------------------------------------
// @islah/math-emo — configurable tokenizer pipeline.
//
// Structure:
//   createUtilities  – logging + trie helpers
//   GlobalCache      – process-wide memo for compiled tokenizer configs
//   *Rule functions  – dynamic (function-based) token matchers
//   create_Tokenizer – compiles a config and tokenizes source text
//   cleanTheTokens   – post-pass that merges runs of single word characters
//   create           – pipeline entry point (module export)
// ---------------------------------------------------------------------------

// Define the Utilities module
function createUtilities(logs) {
  return {
    /**
     * Record `data` on the `type` channel and in the catch-all `all` channel.
     * Channels are created lazily as arrays on the shared `logs` object.
     * @param {*} data - payload to record
     * @param {string} type - channel name ("info", "warn", "verbose", "all", ...)
     * @returns {*} `data`, so calls can be chained/passed through
     */
    log: function (data, type) {
      if (type === "all") {
        if (!Array.isArray(logs.all)) logs.all = [];
        logs.all.push(data);
        return data;
      }
      if (!Array.isArray(logs[type])) logs[type] = [];
      logs[type].push(data);
      if (!Array.isArray(logs.all)) logs.all = [];
      logs.all.push(data);
      return data;
    },

    /**
     * Build a character trie from a hardcoded token list. Items are either
     * plain strings (type defaults to "token") or { value, type } objects.
     * Terminal nodes carry `_end: { value, type }`.
     */
    buildTrie: function (hardcodedList) {
      const root = {};
      for (const item of hardcodedList) {
        const val = typeof item === "string" ? item : item.value;
        const type = typeof item === "string" ? "token" : item.type || "token";
        let node = root;
        for (const char of val) {
          if (!node[char]) node[char] = {};
          node = node[char];
        }
        node._end = { value: val, type: type };
      }
      return root;
    },

    /**
     * Longest-prefix match of `str` starting at `index` against `trie`.
     * @returns {{value: string, type: string} | null}
     */
    matchFromTrie: function (trie, str, index) {
      let node = trie;
      let lastMatch = null;
      for (let i = index; i < str.length; i++) {
        const char = str[i];
        if (!node[char]) break;
        node = node[char];
        if (node._end) lastMatch = node._end;
      }
      return lastMatch;
    },
  };
}

let Logs = {};
let Utilities = createUtilities(Logs);

// Process-wide memo cache for compiled tokenizer configurations.
const GlobalCache = (function () {
  const cache = new Map();
  Utilities.log("GlobalCache initialized", "info");
  return {
    get: (key) => {
      const result = cache.get(key);
      // FIX: previously logged "MISS" for any falsy cached value; presence,
      // not truthiness, decides hit vs. miss.
      Utilities.log(
        `Cache GET: ${key} -> ${result !== undefined ? "HIT" : "MISS"}`,
        "verbose"
      );
      return result;
    },
    set: (key, value) => {
      Utilities.log(`Cache SET: ${key}`, "verbose");
      return cache.set(key, value);
    },
    has: (key) => {
      const result = cache.has(key);
      Utilities.log(`Cache HAS: ${key} -> ${result}`, "verbose");
      return result;
    },
    clear: () => {
      Utilities.log("Cache cleared", "info");
      return cache.clear();
    },
  };
})();

// Rule functions for dynamic token matching --------------------------------
// Each rule is (str, index) => match | null, where match is either
// { skip: true, length } (consumed, produces no token) or
// { type, value, length } (produces a token).

/** Skip runs of spaces/tabs/CRs. "\n" is excluded: it is a real token. */
function whitespaceRule(str, index) {
  const char = str[index];
  if (char !== " " && char !== "\t" && char !== "\r") return null;
  let len = 0;
  while (
    index + len < str.length &&
    (str[index + len] === " " ||
      str[index + len] === "\t" ||
      str[index + len] === "\r")
  ) {
    len++;
  }
  return { skip: true, length: len };
}

/** Skip "//" line comments up to (not including) the terminating newline. */
function commentRule(str, index) {
  if (str.slice(index, index + 2) !== "//") return null;
  let len = 2;
  while (index + len < str.length && str[index + len] !== "\n") {
    len++;
  }
  return { skip: true, length: len };
}

/**
 * Match a single- or double-quoted string literal, honoring backslash
 * escapes. Returns null for unclosed strings.
 * FIX: closing is now tracked with an explicit flag; previously a string
 * ending in an *escaped* quote (e.g. `"\"`) was accepted as closed because
 * only the final character was inspected.
 */
function stringRule(str, index) {
  const quote = str[index];
  if (quote !== '"' && quote !== "'") return null;
  let len = 1;
  let value = quote;
  let closed = false;
  while (index + len < str.length) {
    const char = str[index + len];
    if (char === quote) {
      value += char;
      len++;
      closed = true;
      break;
    }
    if (char === "\\" && index + len + 1 < str.length) {
      // Consume the escape and the escaped character together.
      value += char + str[index + len + 1];
      len += 2;
    } else {
      value += char;
      len++;
    }
  }
  if (closed) {
    return { type: "string", value, length: len };
  }
  return null; // Unclosed string
}

/** Match [A-Za-z_][A-Za-z0-9_]* identifiers. */
function identifierRule(str, index) {
  const char = str[index];
  if (
    !(
      (char >= "A" && char <= "Z") ||
      (char >= "a" && char <= "z") ||
      char === "_"
    )
  )
    return null;
  let len = 1;
  while (
    index + len < str.length &&
    ((str[index + len] >= "A" && str[index + len] <= "Z") ||
      (str[index + len] >= "a" && str[index + len] <= "z") ||
      (str[index + len] >= "0" && str[index + len] <= "9") ||
      str[index + len] === "_")
  ) {
    len++;
  }
  const value = str.slice(index, index + len);
  return { type: "identifier", value, length: len };
}

/** Match digit runs with at most one decimal point (e.g. "42", "3.14"). */
function numberRule(str, index) {
  const char = str[index];
  if (!(char >= "0" && char <= "9")) return null;
  let len = 0;
  let hasDot = false;
  while (
    index + len < str.length &&
    ((str[index + len] >= "0" && str[index + len] <= "9") ||
      str[index + len] === ".")
  ) {
    if (str[index + len] === ".") {
      if (hasDot) return null; // More than one dot is invalid
      hasDot = true;
    }
    len++;
  }
  const value = str.slice(index, index + len);
  return { type: "number", value, length: len };
}

var create_Tokenizer = function () {
  Utilities.log("Creating tokenizer", "info");

  // Hardcoded (trie) matches rank below the built-in rules (registered at
  // priority 1100 in OAS_TOKobj) but above rules added with the default 0.
  const HARDCODED_PRIORITY = 1000;

  /**
   * Serialize a tokenizer config into a cache key.
   * FIX: plain JSON.stringify drops function properties, so two configs with
   * different rule functions but identical priorities collided on the same
   * cache entry. Functions are now folded into the key via their source text.
   */
  function cacheKeyFor(tokenOBJ) {
    return JSON.stringify(tokenOBJ, (key, value) =>
      typeof value === "function" ? value.toString() : value
    );
  }

  /**
   * Compile (and memoize in GlobalCache) a tokenizer config: build the
   * hardcoded-token trie, the value->type map, and the priority-sorted
   * rule list.
   */
  function CacheTokenOBJ(tokenOBJ) {
    Utilities.log("Caching token object", "info");
    const cacheKey = cacheKeyFor(tokenOBJ);
    if (GlobalCache.has(cacheKey)) {
      Utilities.log("Using cached token object", "info");
      return GlobalCache.get(cacheKey);
    }
    Utilities.log("Building new cached token object", "info");
    const Cached = {
      rules: [],
      trie: Utilities.buildTrie(tokenOBJ.hardcoded || []),
      hardcoded: {},
    };
    for (const item of tokenOBJ.hardcoded || []) {
      const val = typeof item === "string" ? item : item.value;
      const type = typeof item === "string" ? "token" : item.type || "token";
      Cached.hardcoded[val] = type;
    }
    for (const rule of tokenOBJ.rules || []) {
      if (typeof rule === "function") {
        // Bare functions get the default priority.
        Cached.rules.push({ fn: rule, priority: 0 });
        Utilities.log(`Added rule function with priority 0`, "verbose");
      } else if (rule && typeof rule.fn === "function") {
        Cached.rules.push(rule);
        Utilities.log(
          `Added rule function with priority ${rule.priority || 0}`,
          "verbose"
        );
      }
    }
    Cached.rules.sort((a, b) => (b.priority || 0) - (a.priority || 0));
    Utilities.log(`Sorted ${Cached.rules.length} rules by priority`, "info");
    GlobalCache.set(cacheKey, Cached);
    return Cached;
  }

  /**
   * Tokenize `str` using the compiled form of `tokensOBJ`. At each position
   * the highest-priority match wins (trie vs. rule functions);
   * `tokensOBJ.easeFN`, when present, may adjust a rule's priority per
   * match. Unmatched characters become { type: "unknown" } tokens and a
   * warning is recorded.
   * @returns {{tokens: Array<{type:string,value:string}>, logs: Array}}
   */
  function tokenize(str, tokensOBJ) {
    Utilities.log(`Starting tokenization of string: "${str}"`, "info");
    const Cache = CacheTokenOBJ(tokensOBJ);
    const tokens = [];
    let STRindex = 0;
    const LOGs = [];
    while (STRindex < str.length) {
      Utilities.log(
        `Tokenizing at index ${STRindex}: "${str[STRindex]}"`,
        "verbose"
      );
      let bestMatch = null;
      let bestPri = -Infinity;
      // Check trie match
      const trieMatch = Utilities.matchFromTrie(Cache.trie, str, STRindex);
      if (trieMatch) {
        const match = {
          type: trieMatch.type,
          value: trieMatch.value,
          length: trieMatch.value.length,
          priority: HARDCODED_PRIORITY,
        };
        if (match.priority > bestPri) {
          bestMatch = match;
          bestPri = match.priority;
        }
      }
      // Check rules
      for (const rule of Cache.rules) {
        const result = rule.fn(str, STRindex);
        if (result) {
          let pri = rule.priority || 0;
          if (tokensOBJ.easeFN) {
            pri = tokensOBJ.easeFN(
              str,
              STRindex,
              STRindex + result.length,
              result.length,
              pri
            );
          }
          if (pri > bestPri) {
            bestMatch = { ...result, priority: pri };
            bestPri = pri;
          }
        }
      }
      if (bestMatch) {
        if (bestMatch.skip) {
          STRindex += bestMatch.length;
          continue;
        } else {
          const token = { type: bestMatch.type, value: bestMatch.value };
          tokens.push(token);
          Utilities.log(`Token created: ${JSON.stringify(token)}`, "info");
          STRindex += bestMatch.length;
          continue;
        }
      }
      // If no match, treat as unknown
      const chunk = str[STRindex];
      const unknownToken = { type: "unknown", value: chunk };
      tokens.push(unknownToken);
      const warning = "Unrecognized character: " + chunk;
      LOGs.push({ warn: warning });
      Utilities.log(warning, "warn");
      Utilities.log(
        `Unknown token created: ${JSON.stringify(unknownToken)}`,
        "warn"
      );
      STRindex++;
    }
    LOGs.push({ info: "tokenization completed" });
    Utilities.log(
      `Tokenization completed. Generated ${tokens.length} tokens`,
      "info"
    );
    return { tokens, logs: LOGs };
  }

  return { tokenize };
};

let tokenizer = create_Tokenizer();

// Default tokenizer configuration: punctuation/keywords as hardcoded tokens,
// plus the dynamic rules above (rules outrank hardcoded matches at 1100).
let OAS_TOKobj = {
  hardcoded: [
    { value: "\n", type: "newline" },
    "!", "@", "#", "$", "%", "^", "&", "*", "(", ")",
    "-", "_", "=", "+", "{", "}", "[", "]", "|", "\\",
    ";", "'", '"', "<", ">", ",", ".", "/", "?", "`",
    "~", ":", "::", "@import", "import",
  ],
  rules: [
    { fn: whitespaceRule, priority: 1100 },
    { fn: commentRule, priority: 1100 },
    { fn: stringRule, priority: 1100 },
    { fn: identifierRule, priority: 1100 },
    { fn: numberRule, priority: 1100 },
  ],
};

/**
 * Post-pass over raw tokens: merges runs of single word characters (letters,
 * digits, "_", quotes, backticks) into one token, flushing the accumulated
 * run whenever a space token, a multi-character token, or any other single
 * character is encountered.
 * NOTE(review): a flushed run is labeled with the type of the token that
 * *triggers* the flush (or, at end of input, the final input token), not the
 * type of the run's own characters — this looks unintentional, but callers
 * may depend on it; confirm before changing.
 */
function cleanTheTokens(tokens) {
  const cleanedTokens = [];
  let currentValue = "";
  function determineType(token) {
    return token.type;
  }
  for (let i = 0; i < tokens.length; i++) {
    const token = tokens[i];
    if (token.value === " ") {
      if (currentValue !== "") {
        cleanedTokens.push({
          type: determineType(token),
          value: currentValue,
        });
        currentValue = "";
      }
      continue;
    } else if (token.value.length > 1) {
      if (currentValue !== "") {
        cleanedTokens.push({
          type: determineType(token),
          value: currentValue,
        });
        currentValue = "";
      }
      cleanedTokens.push(token);
    } else if (
      token.value.length === 1 &&
      /[a-zA-Z0-9_"'`]/.test(token.value)
    ) {
      currentValue += token.value;
    } else {
      if (currentValue !== "") {
        cleanedTokens.push({
          type: determineType(token),
          value: currentValue,
        });
        currentValue = "";
      }
      cleanedTokens.push(token);
    }
  }
  if (currentValue !== "") {
    cleanedTokens.push({
      type: determineType(tokens[tokens.length - 1]),
      value: currentValue,
    });
  }
  return cleanedTokens;
}

/** Tokenize `code` with the default (OAS) config and clean the result. */
function defaultTokenizer(code) {
  const tokens = cleanTheTokens(tokenizer.tokenize(code, OAS_TOKobj).tokens);
  return tokens;
}

/**
 * Pipeline factory (module entry point). `globalPipeLineMEM` must expose
 * `pipelineData.rawText` — the source text to tokenize. Returns an object
 * whose `defaultTokenizer()` yields the cleaned token list, or undefined
 * (with a console warning) when rawText is missing/empty.
 */
function create(globalPipeLineMEM) {
  return {
    defaultTokenizer: function () {
      if (!globalPipeLineMEM.pipelineData.rawText) {
        console.warn(
          "rawText data is invalid",
          globalPipeLineMEM.pipelineData.rawText
        );
        return;
      }
      return defaultTokenizer(globalPipeLineMEM.pipelineData.rawText);
    },
  };
}

// Guarded so the file can also be loaded in ESM contexts where `module` is
// not defined; under CommonJS this behaves exactly as `module.exports = ...`.
if (typeof module !== "undefined" && module.exports) {
  module.exports = { create };
}