// @dhiraj2105/dlang
// A basic programming language built for learning and experimenting
// (npm package page metadata: 101 lines (85 loc) • 3.15 kB, JavaScript)
// ==========================
// lexer.js
// This file converts source code into a list of tokens
// Each token is a meaningful unit: keywords, variables, numbers, operators, etc.
//
// Example:
// Input : let x = 5 + 10
// Output: [ { type: 'KEYWORD', value: 'let' }, { type: 'IDENTIFIER', value: 'x' }, ... ]
//
// Each part of the code is matched using regex
// ==========================
// List of reserved keywords in dlang
// Reserved keywords in dlang. Frozen so the shared table cannot be
// mutated at runtime; `includes` still works on a frozen array.
const KEYWORDS = Object.freeze([
  "let",
  "print",
  "if",
  "else",
  "while",
  "break",
  "continue",
  "function",
  "dede",
]);

// Token definitions, tried in order — the ORDER IS LOAD-BEARING:
// WHITESPACE must come first so runs of spaces are consumed before
// anything else, and IDENTIFIER must come last so it does not shadow
// NUMBER. All regexes are anchored with ^ so they only match at the
// start of the remaining input.
const TOKEN_TYPES = Object.freeze([
  { type: "WHITESPACE", regex: /^\s+/ }, // Spaces and tabs (skipped, not emitted)
  { type: "NUMBER", regex: /^\d+/ }, // Integers only (no decimals/negatives)
  { type: "STRING", regex: /^"([^"]*)"/ }, // Double-quoted literals, no escape support
  { type: "OPERATOR", regex: /^[+\-*/=<>!]+/ }, // One or more operator chars (so == and <= lex as one token)
  { type: "PUNCTUATION", regex: /^[(){};,]/ }, // Single punctuation mark
  { type: "IDENTIFIER", regex: /^[a-zA-Z_][a-zA-Z0-9_]*/ }, // Names; reclassified as KEYWORD below
]);

/**
 * Strips an inline `#` comment from a line, ignoring `#` characters that
 * appear inside double-quoted string literals.
 *
 * A naive `line.split("#")[0]` would truncate `print "a#b"` to `print "a`
 * and then fail to tokenize the unterminated string — this scan tracks
 * whether we are inside quotes before treating `#` as a comment start.
 *
 * @param {string} line - A single source line
 * @returns {string} The line with any trailing comment removed
 */
const stripComment = (line) => {
  let inString = false;
  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (ch === '"') {
      inString = !inString; // Toggle on every quote (no escapes in dlang strings)
    } else if (ch === "#" && !inString) {
      return line.slice(0, i);
    }
  }
  return line;
};

/**
 * Tokenizes dlang source code into an array of token objects.
 * Each token has a `type` (e.g. KEYWORD, IDENTIFIER, NUMBER, STRING,
 * OPERATOR, PUNCTUATION) and a `value` (the matched text — STRING values
 * keep their surrounding quotes).
 *
 * @param {string} input - The source code to tokenize
 * @returns {Array<{type: string, value: string}>} The token stream
 * @throws {Error} When a character sequence matches no token type
 */
export const tokenize = (input) => {
  const tokens = [];
  // Process line by line so `#` comments terminate at end of line.
  for (const rawLine of input.split("\n")) {
    // Drop the inline comment (quote-aware), then skip blank lines.
    const line = stripComment(rawLine).trim();
    if (line === "") continue;

    let code = line; // Remaining unconsumed text of this line
    while (code.length > 0) {
      let matched = false;
      for (const { type, regex } of TOKEN_TYPES) {
        const match = regex.exec(code);
        if (!match) continue;

        matched = true;
        const value = match[0];
        // Whitespace is consumed but never emitted as a token.
        if (type !== "WHITESPACE") {
          // Identifiers that collide with a reserved word become KEYWORDs.
          const tokenType =
            type === "IDENTIFIER" && KEYWORDS.includes(value)
              ? "KEYWORD"
              : type;
          tokens.push({ type: tokenType, value });
        }
        // Consume the matched text and restart the type scan.
        code = code.slice(value.length);
        break;
      }
      // Nothing matched: the input contains a character dlang cannot lex.
      if (!matched) {
        console.error(`Failed to tokenize: ${code}`);
        throw new Error(`Unexpected token near: ${code.slice(0, 10)}`);
      }
    }
  }
  return tokens;
};