@atomic-ehr/ucum
Version:
TypeScript implementation of UCUM (Unified Code for Units of Measure)
143 lines (142 loc) • 5.36 kB
JavaScript
/**
 * Splits a UCUM unit expression (e.g. `kg.m/s2`, `mm[Hg]`, `10*-3`) into a
 * flat list of tokens.
 *
 * Every token is a plain object `{ type, value, position, length }` where
 * `position` is the zero-based offset of the token in the input and `length`
 * is the number of characters it covers. The list always ends with an `EOF`
 * token of length 0.
 *
 * Token types produced: `DOT`, `SLASH`, `LPAREN`, `RPAREN`, `LBRACE`,
 * `RBRACE`, `CARET`, `PLUS`, `MINUS`, `STAR`, `DIGIT`, `ATOM`, `EOF`.
 */
export class Lexer {
  /** Single-character tokens, keyed by the character that produces them. */
  static #PUNCTUATION = new Map([
    ['.', 'DOT'],
    ['/', 'SLASH'],
    ['(', 'LPAREN'],
    [')', 'RPAREN'],
    ['{', 'LBRACE'],
    ['}', 'RBRACE'],
    ['^', 'CARET'],
    ['+', 'PLUS'],
    ['-', 'MINUS'],
    ['*', 'STAR'],
  ]);

  /** The text being tokenized. */
  input;
  /** Offset of the next unread character in `input`. */
  position = 0;
  /** Tokens produced by the most recent `tokenize()` call. */
  tokens = [];

  /**
   * @param {string} input - The unit expression to tokenize.
   */
  constructor(input) {
    this.input = input;
  }

  /**
   * Tokenizes the whole input.
   *
   * Scanning always restarts from the beginning with a fresh token array, so
   * calling this method repeatedly yields equal results and never mutates an
   * array returned by an earlier call.
   *
   * @returns {Array<{type: string, value: string, position: number, length: number}>}
   *   The tokens in input order, terminated by an `EOF` token.
   * @throws {Error} If a character that cannot start any token is found.
   */
  tokenize() {
    // Reset scanner state: without this a second call would only append
    // another EOF to the previously returned array.
    this.position = 0;
    this.tokens = [];

    while (this.position < this.input.length) {
      this.skipWhitespace();
      if (this.position >= this.input.length) {
        break;
      }
      const char = this.input[this.position];
      const punctuation = Lexer.#PUNCTUATION.get(char);
      if (punctuation !== undefined) {
        this.addToken(punctuation, char);
      } else if (this.isDigit(char)) {
        this.readNumber();
      } else if (this.isLetter(char) || char === '[' || char === '%' || char === "'") {
        this.readAtomOrPrefix();
      } else {
        throw new Error(`Unexpected character '${char}' at position ${this.position}`);
      }
    }

    this.tokens.push({ type: 'EOF', value: '', position: this.position, length: 0 });
    return this.tokens;
  }

  /** Advances `position` past any run of whitespace characters. */
  skipWhitespace() {
    while (this.position < this.input.length && /\s/.test(this.input[this.position])) {
      this.position++;
    }
  }

  /**
   * @param {string | undefined} char - Character to test.
   * @returns {boolean} True when `char` contains an ASCII digit.
   */
  isDigit(char) {
    if (!char) {
      return false;
    }
    return /[0-9]/.test(char);
  }

  /**
   * @param {string | undefined} char - Character to test.
   * @returns {boolean} True when `char` contains an ASCII letter.
   */
  isLetter(char) {
    if (!char) {
      return false;
    }
    return /[a-zA-Z]/.test(char);
  }

  /** Consumes a run of digits and emits it as a single `DIGIT` token. */
  readNumber() {
    const start = this.position;
    while (this.position < this.input.length && this.isDigit(this.input[this.position])) {
      this.position++;
    }
    const value = this.input.slice(start, this.position);
    this.tokens.push({ type: 'DIGIT', value, position: start, length: value.length });
  }

  /**
   * Consumes a bracketed group starting at the current `[`, up to and
   * including the next `]`.
   *
   * An unterminated group is tolerated: everything up to the end of the input
   * is consumed and no error is raised here.
   */
  readBracketed() {
    this.position++; // opening '['
    while (this.position < this.input.length && this.input[this.position] !== ']') {
      this.position++;
    }
    if (this.position < this.input.length) {
      this.position++; // closing ']'
    }
  }

  /**
   * Consumes one atom-like lexeme and emits it as an `ATOM` token.
   *
   * Recognized shapes:
   * - a bracketed group, e.g. `[in_i]`;
   * - `%`, optionally followed directly by a bracketed group, e.g. `%[slope]`;
   * - a lone `'`;
   * - a run of letters/underscores, optionally followed directly by a
   *   bracketed group, e.g. `mm[Hg]` or `B[SPL]`.
   *
   * Does nothing when called at the end of the input.
   */
  readAtomOrPrefix() {
    const start = this.position;
    const char = this.input[start];
    if (!char) {
      return;
    }

    if (char === '[') {
      this.readBracketed();
    } else if (char === '%') {
      this.position++;
      // '%' may carry a bracketed suffix that belongs to the same atom.
      if (this.input[this.position] === '[') {
        this.readBracketed();
      }
    } else if (char === "'") {
      this.position++;
    } else {
      while (
        this.position < this.input.length &&
        (this.isLetter(this.input[this.position]) || this.input[this.position] === '_')
      ) {
        this.position++;
      }
      // A bracketed group glued to the letters is part of the same atom.
      if (this.input[this.position] === '[') {
        this.readBracketed();
      }
    }

    // For now, mark everything as ATOM - the parser will disambiguate
    const value = this.input.slice(start, this.position);
    this.tokens.push({ type: 'ATOM', value, position: start, length: value.length });
  }

  /**
   * Emits a token located at the current position and advances by one
   * character.
   *
   * @param {string} type - Token type name.
   * @param {string} value - Token text (a single character for all callers here).
   */
  addToken(type, value) {
    this.tokens.push({ type, value, position: this.position, length: value.length });
    this.position++;
  }
}