@atomic-ehr/ucum
Version:
TypeScript implementation of UCUM (Unified Code for Units of Measure)
160 lines (143 loc) • 5.03 kB
text/typescript
/**
 * Kinds of tokens the lexer can attach to a piece of a unit expression.
 *
 * Each punctuation member corresponds to exactly one input character.
 */
export type TokenType =
// Unit text: a run of letters/underscores, a `[...]` group, `%`, or `'`.
| 'ATOM'
// Declared for prefixes, but the lexer in this file never emits it: it tags
// all unit text as ATOM and leaves prefix/atom disambiguation to the parser.
| 'PREFIX'
// A run of one or more decimal digits (0-9).
| 'DIGIT'
// The '.' character.
| 'DOT'
// The '/' character.
| 'SLASH'
// The '(' character.
| 'LPAREN'
// The ')' character.
| 'RPAREN'
// The '{' character.
| 'LBRACE'
// The '}' character.
| 'RBRACE'
// The '^' character.
| 'CARET'
// The '+' character.
| 'PLUS'
// The '-' character.
| 'MINUS'
// The '*' character.
| 'STAR'
// End-of-input marker; appended once after the last real token.
| 'EOF';
/**
 * A single lexical token together with its location in the input string.
 */
export interface Token {
// Category of the token.
type: TokenType;
// Exact text of the token as sliced from the input; empty string for EOF.
value: string;
// Zero-based index into the input string at which the token starts.
// For EOF this is the index at which scanning stopped.
position: number;
// Number of characters in `value` (the lexer sets it to `value.length`; 0 for EOF).
length: number;
}
/**
 * Splits a unit expression into a flat list of {@link Token}s.
 *
 * Whitespace between tokens is skipped. Single-character punctuation maps
 * directly to its own token type, runs of decimal digits become `DIGIT`, and
 * all unit text (letters/underscores, `[...]` groups, `%`, `'`) becomes `ATOM`.
 * Distinguishing prefixes from atoms is left to the parser.
 */
export class Lexer {
  /** Characters that form a token on their own, keyed by the character. */
  private static readonly PUNCTUATION: ReadonlyMap<string, TokenType> = new Map<string, TokenType>([
    ['.', 'DOT'],
    ['/', 'SLASH'],
    ['(', 'LPAREN'],
    [')', 'RPAREN'],
    ['{', 'LBRACE'],
    ['}', 'RBRACE'],
    ['^', 'CARET'],
    ['+', 'PLUS'],
    ['-', 'MINUS'],
    ['*', 'STAR'],
  ]);

  // Hoisted so the scanning loops do not re-create regex objects per character.
  private static readonly WHITESPACE = /\s/;
  private static readonly DIGIT = /[0-9]/;
  private static readonly LETTER = /[a-zA-Z]/;

  private input: string;
  private position: number = 0;
  private tokens: Token[] = [];

  /**
   * @param input The unit expression to tokenize.
   */
  constructor(input: string) {
    this.input = input;
  }

  /**
   * Scans the whole input and returns its tokens, terminated by one `EOF`.
   *
   * Scanning state is reset on entry, so the method may be called repeatedly;
   * every call returns a fresh array and never modifies a previous result.
   *
   * @returns The tokens in input order, with a trailing `EOF` token.
   * @throws Error if a character that cannot start any token is encountered.
   */
  tokenize(): Token[] {
    // Start from scratch: without this a second call would only append
    // another EOF to the array handed out by the first call.
    this.position = 0;
    this.tokens = [];

    for (;;) {
      this.skipWhitespace();
      const char = this.input[this.position];
      if (char === undefined) break; // reached end of input

      const punctuation = Lexer.PUNCTUATION.get(char);
      if (punctuation !== undefined) {
        this.addToken(punctuation, char);
      } else if (this.isDigit(char)) {
        this.readNumber();
      } else if (this.isLetter(char) || char === '[' || char === '%' || char === "'") {
        this.readAtomOrPrefix();
      } else {
        throw new Error(`Unexpected character '${char}' at position ${this.position}`);
      }
    }

    this.tokens.push({ type: 'EOF', value: '', position: this.position, length: 0 });
    return this.tokens;
  }

  /** Advances past any whitespace at the current position. */
  private skipWhitespace(): void {
    let char = this.input[this.position];
    while (char !== undefined && Lexer.WHITESPACE.test(char)) {
      this.position++;
      char = this.input[this.position];
    }
  }

  /** Returns true for a single ASCII decimal digit; false for `undefined` or empty. */
  private isDigit(char: string | undefined): boolean {
    if (!char) return false;
    return Lexer.DIGIT.test(char);
  }

  /** Returns true for a single ASCII letter; false for `undefined` or empty. */
  private isLetter(char: string | undefined): boolean {
    if (!char) return false;
    return Lexer.LETTER.test(char);
  }

  /** Consumes a maximal run of digits and emits it as one `DIGIT` token. */
  private readNumber(): void {
    const start = this.position;
    // isDigit(undefined) is false, which also stops the loop at end of input.
    while (this.isDigit(this.input[this.position])) {
      this.position++;
    }
    const value = this.input.slice(start, this.position);
    this.tokens.push({ type: 'DIGIT', value, position: start, length: value.length });
  }

  /**
   * Consumes a `[...]` group that starts at the current position.
   *
   * The group runs up to and including the first `]`. An unterminated group
   * is tolerated and extends to the end of the input.
   */
  private readBracketGroup(): void {
    const close = this.input.indexOf(']', this.position);
    this.position = close === -1 ? this.input.length : close + 1;
  }

  /**
   * Consumes one piece of unit text and emits it as an `ATOM` token.
   *
   * Recognised shapes:
   * - a bare `[...]` group,
   * - `%`, optionally followed directly by a `[...]` group (e.g. `%[slope]`),
   * - a lone `'`,
   * - a run of letters/underscores, optionally followed directly by a
   *   `[...]` group (e.g. `B[SPL]`).
   */
  private readAtomOrPrefix(): void {
    const start = this.position;
    const first = this.input[start];
    if (first === undefined) return;

    if (first === '[') {
      this.readBracketGroup();
    } else if (first === "'") {
      this.position++;
    } else {
      if (first === '%') {
        this.position++;
      } else {
        let char = this.input[this.position];
        while (this.isLetter(char) || char === '_') {
          this.position++;
          char = this.input[this.position];
        }
      }
      // A bracket group glued to the text is part of the same atom.
      if (this.input[this.position] === '[') {
        this.readBracketGroup();
      }
    }

    const value = this.input.slice(start, this.position);
    // For now, mark everything as ATOM - the parser will disambiguate
    this.tokens.push({ type: 'ATOM', value, position: start, length: value.length });
  }

  /**
   * Emits a token whose text starts at the current position, then moves past it.
   *
   * The cursor advances by the token's own length so the recorded `length`
   * and the amount of input consumed can never disagree.
   */
  private addToken(type: TokenType, value: string): void {
    this.tokens.push({ type, value, position: this.position, length: value.length });
    this.position += value.length;
  }
}