UNPKG

@atomic-ehr/ucum

Version:

TypeScript implementation of UCUM (Unified Code for Units of Measure)

143 lines (142 loc) 5.36 kB
/** Token type for every character that forms a complete token on its own. */
const SINGLE_CHAR_TOKEN_TYPES = new Map([
  ['.', 'DOT'],
  ['/', 'SLASH'],
  ['(', 'LPAREN'],
  [')', 'RPAREN'],
  ['{', 'LBRACE'],
  ['}', 'RBRACE'],
  ['^', 'CARET'],
  ['+', 'PLUS'],
  ['-', 'MINUS'],
  ['*', 'STAR'],
]);

/**
 * Splits a unit-expression string into a flat list of tokens.
 *
 * Every token is a plain object `{ type, value, position, length }` where
 * `position` is the index in the input at which the token starts and
 * `length` equals `value.length`. The list always ends with a single
 * zero-length `EOF` token.
 */
export class Lexer {
  input;
  position = 0;
  tokens = [];

  /**
   * @param {string} input - The expression to tokenize.
   */
  constructor(input) {
    this.input = input;
  }

  /**
   * Scans the whole input and returns its tokens.
   *
   * Each call starts from a clean state, so calling it repeatedly on the same
   * instance yields an equal token list every time rather than appending an
   * extra `EOF` to the array returned earlier.
   *
   * @returns {Array<{type: string, value: string, position: number, length: number}>}
   *   The tokens in input order, terminated by an `EOF` token.
   * @throws {Error} If a character cannot start any token.
   */
  tokenize() {
    // Start over: without this a second call would skip scanning entirely
    // (position is already at the end) and push a duplicate EOF.
    this.position = 0;
    this.tokens = [];

    while (this.position < this.input.length) {
      this.skipWhitespace();
      if (this.position >= this.input.length) break;

      const char = this.input[this.position];
      const singleCharType = SINGLE_CHAR_TOKEN_TYPES.get(char);

      if (singleCharType !== undefined) {
        this.addToken(singleCharType, char);
      } else if (this.isDigit(char)) {
        this.readNumber();
      } else if (this.isLetter(char) || char === '[' || char === '%' || char === "'") {
        this.readAtomOrPrefix();
      } else {
        throw new Error(`Unexpected character '${char}' at position ${this.position}`);
      }
    }

    this.tokens.push({ type: 'EOF', value: '', position: this.position, length: 0 });
    return this.tokens;
  }

  /** Advances `position` past any run of whitespace characters. */
  skipWhitespace() {
    while (this.position < this.input.length) {
      const char = this.input[this.position];
      if (!char || !/\s/.test(char)) break;
      this.position++;
    }
  }

  /**
   * @param {string} char - Candidate character.
   * @returns {boolean} `true` when `char` contains an ASCII digit; `false` for empty/missing input.
   */
  isDigit(char) {
    if (!char) return false;
    return /[0-9]/.test(char);
  }

  /**
   * @param {string} char - Candidate character.
   * @returns {boolean} `true` when `char` contains an ASCII letter; `false` for empty/missing input.
   */
  isLetter(char) {
    if (!char) return false;
    return /[a-zA-Z]/.test(char);
  }

  /** Consumes a maximal run of digits and emits it as one `DIGIT` token. */
  readNumber() {
    const start = this.position;
    while (this.position < this.input.length && this.isDigit(this.input[this.position])) {
      this.position++;
    }
    const value = this.input.slice(start, this.position);
    this.tokens.push({ type: 'DIGIT', value, position: start, length: value.length });
  }

  /**
   * Consumes a bracketed group starting at the current `[`.
   *
   * Reads through the first `]` that follows (inclusive). When no closing
   * bracket exists the rest of the input is consumed instead of raising an
   * error; the unterminated text is left for later stages to judge.
   *
   * Must only be called while `input[position]` is `[`.
   *
   * @returns {string} The consumed text, beginning with `[`.
   */
  readBracketed() {
    const start = this.position;
    const closeIndex = this.input.indexOf(']', start + 1);
    this.position = closeIndex === -1 ? this.input.length : closeIndex + 1;
    return this.input.slice(start, this.position);
  }

  /**
   * Consumes one atom-like lexeme and emits it as an `ATOM` token.
   *
   * Recognised shapes, chosen by the first character:
   * - `[`      : a bracketed group, e.g. `[...]`
   * - `%`      : the percent sign, optionally fused with a bracketed group (`%[slope]`)
   * - `'`      : the single quote on its own
   * - a letter : a run of letters/underscores, optionally fused with a
   *              directly following bracketed group (`B[W]`, `B[SPL]`)
   *
   * Everything is tagged `ATOM`; telling prefixes from atoms is left to the parser.
   */
  readAtomOrPrefix() {
    const start = this.position;
    const first = this.input[this.position];
    if (!first) return;

    if (first === '[') {
      this.readBracketed();
    } else if (first === '%' || first === "'") {
      this.position++;
      // Only `%` may carry a bracketed suffix; a quote always stands alone.
      if (first === '%' && this.input[this.position] === '[') {
        this.readBracketed();
      }
    } else {
      while (
        this.position < this.input.length &&
        (this.isLetter(this.input[this.position]) || this.input[this.position] === '_')
      ) {
        this.position++;
      }
      // A bracket directly after the letters belongs to the same atom.
      if (this.input[this.position] === '[') {
        this.readBracketed();
      }
    }

    const value = this.input.slice(start, this.position);
    this.tokens.push({ type: 'ATOM', value, position: start, length: value.length });
  }

  /**
   * Emits a token located at the current position, then advances by one character.
   *
   * @param {string} type - Token type label.
   * @param {string} value - Token text.
   */
  addToken(type, value) {
    this.tokens.push({ type, value, position: this.position, length: value.length });
    this.position++;
  }
}