@atomic-ehr/fhirpath
Version:
A TypeScript implementation of FHIRPath
881 lines (744 loc) • 26 kB
text/typescript
/**
* Simplified FHIRPath Lexer
*
* This lexer only recognizes:
* - Symbol operators: +, -, *, /, <, >, =, etc. (all as OPERATOR tokens)
* - Structural tokens: (, ), [, ], {, }, ., ,
* - Literals: numbers, strings, datetime, time
* - Identifiers: letters, digits and underscores, not starting with a digit (including all keywords), plus backtick-delimited identifiers
* - Special identifiers: $... (context variables like $this, $index)
* - Environment variables: %identifier, %`delimited`, %'string'
*
* The parser is responsible for determining which identifiers are keyword operators.
*/
/**
 * Categories of token used by the lexer.
 *
 * The numeric values are explicit and grouped by range (literals, operators,
 * structural, special, trivia). Keyword operators are not distinguished here:
 * they are emitted as IDENTIFIER and left to the parser.
 */
export enum TokenType {
// Special
EOF = 0, // Always appended as the last token by Lexer.tokenize(); its value is ''
// Literals
IDENTIFIER = 1, // Plain or `delimited` identifier (delimited ones keep their backticks in the value)
NUMBER = 2, // Digits with an optional fractional part, e.g. 42 or 3.14
STRING = 3, // Single- or double-quoted string; the value keeps the quotes and raw escapes
DATETIME = 4, // Literal starting with @ followed by a date, e.g. @2020-01-01T10:00:00Z
TIME = 5, // Literal starting with @T, e.g. @T14:30
QUANTITY = 6, // Quantity literals like 5 'mg'. NOTE(review): never emitted by the Lexer in this file; presumably built by the parser - confirm
// Operators (all symbol operators consolidated)
OPERATOR = 10, // +, -, *, /, <, >, <=, >=, =, !=, ~, !~, |, &
// Structural
DOT = 50, // .
COMMA = 51, // ,
LPAREN = 52, // (
RPAREN = 53, // )
LBRACKET = 54, // [
RBRACKET = 55, // ]
LBRACE = 56, // {
RBRACE = 57, // }
// Special tokens
SPECIAL_IDENTIFIER = 60, // $... (the value keeps the leading $)
ENVIRONMENT_VARIABLE = 70, // %identifier, %`delimited`, %'string' (the value keeps the leading %)
CURSOR = 71, // Virtual cursor token for LSP support. NOTE(review): never emitted by the Lexer in this file - confirm where it is injected
// Trivia tokens (only emitted when LexerOptions.preserveTrivia is enabled, on the HIDDEN channel)
WHITESPACE = 80,
LINE_COMMENT = 81,
BLOCK_COMMENT = 82,
}
/**
 * Token channel, used to separate significant tokens from trivia.
 *
 * The Lexer in this file only ever assigns HIDDEN (to whitespace and comment
 * tokens); all other tokens are created without a channel.
 */
export enum Channel {
DEFAULT = 0, // NOTE(review): presumably what consumers assume when Token.channel is undefined - confirm
HIDDEN = 1, // Whitespace and comments preserved via LexerOptions.preserveTrivia
}
import type { Position, Range } from './types';
/**
 * A single lexical token as produced by Lexer.tokenize().
 */
export interface Token {
// Category of the token.
type: TokenType;
// Raw source text of the token, including any delimiters or prefix
// (quotes, backticks, leading @, % or $). Escapes are not decoded. '' for EOF.
value: string;
// Offset into the input string where the token starts (inclusive).
start: number;
// Offset into the input string just past the token (exclusive).
end: number;
// 1-based line of the token start; 0 when position tracking is disabled.
line: number;
// 1-based column of the token start; 0 when position tracking is disabled.
column: number;
range?: Range; // LSP-compatible range (zero-based); only set when position tracking is enabled
channel?: Channel; // Set to HIDDEN on trivia tokens; left undefined on all others
}
/**
 * Optional switches accepted by the Lexer constructor.
 */
export interface LexerOptions {
// Track line/column and attach an LSP range to every token. Defaults to true.
// When false, tokens report line 0 / column 0 and carry no range.
trackPosition?: boolean;
// Emit WHITESPACE, LINE_COMMENT and BLOCK_COMMENT tokens (on the HIDDEN
// channel) instead of silently skipping them. Defaults to false.
preserveTrivia?: boolean;
}
/**
 * Single-pass scanner that turns a FHIRPath expression into a flat token list.
 *
 * Design notes:
 * - Keywords are not recognised here; they come out as IDENTIFIER tokens and
 *   the parser decides which of them act as operators.
 * - Token values are raw slices of the input: quotes, backticks and the
 *   leading `@`, `%` or `$` are kept, and escape sequences are not decoded.
 * - An instance is single-use: `tokenize()` consumes the input, so a second
 *   call returns only the EOF token.
 */
export class Lexer {
  /** Source text being scanned. */
  private input: string;
  /** Offset of the next unread character (UTF-16 code units). */
  private position: number = 0;
  /** 1-based line of `position`; only maintained when position tracking is on. */
  private line: number = 1;
  /** 1-based column of `position`; only maintained when position tracking is on. */
  private column: number = 1;
  /** Options with defaults already applied. */
  private options: LexerOptions;
  /** Offset at which each line starts; index is the zero-based line number. */
  private lineOffsets: number[] = [0];

  /**
   * @param input   Expression text to scan.
   * @param options Optional switches; `trackPosition` defaults to true and
   *                `preserveTrivia` to false.
   */
  constructor(input: string, options: LexerOptions = {}) {
    this.input = input;
    this.options = {
      trackPosition: options.trackPosition ?? true,
      preserveTrivia: options.preserveTrivia ?? false,
    };
    if (this.options.trackPosition) {
      this.buildLineOffsets();
    }
  }

  /**
   * Build the line-start table used by `offsetToPosition`.
   * `\n`, `\r\n` and a lone `\r` each count as one line ending.
   */
  private buildLineOffsets(): void {
    this.lineOffsets = [0];
    for (let i = 0; i < this.input.length; i++) {
      const char = this.input.charAt(i);
      if (char === '\n') {
        this.lineOffsets.push(i + 1);
      } else if (char === '\r') {
        // Treat \r\n as a single line ending: step over the \n as well.
        if (this.input.charAt(i + 1) === '\n') {
          i++;
        }
        this.lineOffsets.push(i + 1);
      }
    }
  }

  /**
   * Scan the remaining input.
   *
   * @returns All tokens in source order, always terminated by an EOF token.
   *          Trivia tokens are included only when `preserveTrivia` is enabled.
   * @throws Error (message prefixed with "Lexer error: ") on an unexpected
   *         character or a malformed/unterminated literal.
   */
  tokenize(): Token[] {
    const tokens: Token[] = [];
    while (this.position < this.input.length) {
      const token = this.nextToken();
      if (token) {
        tokens.push(token);
      }
    }
    // Always add EOF token
    tokens.push(this.createToken(TokenType.EOF, '', this.position, this.position));
    return tokens;
  }

  /**
   * Scan one token starting at the current position.
   *
   * @returns The token, or null when only skipped material (trailing
   *          whitespace, or a comment with `preserveTrivia` off) was consumed.
   */
  private nextToken(): Token | null {
    if (this.options.preserveTrivia) {
      // Surface a whitespace run as its own hidden token.
      if (this.isWhitespace(this.current())) {
        const wsStart = this.position;
        const wsStartLine = this.line;
        const wsStartColumn = this.column;
        this.skipWhitespace();
        const wsToken = this.createToken(
          TokenType.WHITESPACE,
          this.input.substring(wsStart, this.position),
          wsStart,
          this.position,
          wsStartLine,
          wsStartColumn
        );
        wsToken.channel = Channel.HIDDEN;
        return wsToken;
      }
    } else {
      this.skipWhitespace();
    }

    if (this.position >= this.input.length) {
      return null;
    }

    const start = this.position;
    const startLine = this.line;
    const startColumn = this.column;
    const char = this.current();
    const charCode = this.input.charCodeAt(this.position);

    switch (char) {
      // Single-character symbol operators
      case '+':
      case '-':
      case '*':
      case '=':
      case '~':
      case '|':
      case '&':
        return this.readSingle(TokenType.OPERATOR, start, startLine, startColumn);

      case '/':
        return this.readSlashOrComment(start, startLine, startColumn);

      // '<' / '>' optionally followed by '='
      case '<':
      case '>':
        this.advance();
        if (this.current() === '=') {
          this.advance();
        }
        return this.createToken(
          TokenType.OPERATOR,
          this.input.substring(start, this.position),
          start,
          this.position,
          startLine,
          startColumn
        );

      case '!': {
        this.advance();
        const follower = this.current();
        if (follower === '=' || follower === '~') {
          this.advance();
          return this.createToken(
            TokenType.OPERATOR,
            this.input.substring(start, this.position),
            start,
            this.position,
            startLine,
            startColumn
          );
        }
        // '!' alone is not a valid token in FHIRPath
        throw this.error(`Unexpected character '!' at position ${start}`);
      }

      // Structural tokens
      case '.':
        return this.readSingle(TokenType.DOT, start, startLine, startColumn);
      case ',':
        return this.readSingle(TokenType.COMMA, start, startLine, startColumn);
      case '(':
        return this.readSingle(TokenType.LPAREN, start, startLine, startColumn);
      case ')':
        return this.readSingle(TokenType.RPAREN, start, startLine, startColumn);
      case '[':
        return this.readSingle(TokenType.LBRACKET, start, startLine, startColumn);
      case ']':
        return this.readSingle(TokenType.RBRACKET, start, startLine, startColumn);
      case '{':
        return this.readSingle(TokenType.LBRACE, start, startLine, startColumn);
      case '}':
        return this.readSingle(TokenType.RBRACE, start, startLine, startColumn);

      // Prefixed / delimited tokens
      case '%':
        return this.readEnvironmentVariable();
      case "'":
        return this.readString("'");
      case '"':
        // Not in spec but often supported
        return this.readString('"');
      case '`':
        return this.readDelimitedIdentifier();
      case '@':
        return this.readDateTimeOrTime();
      case '$':
        return this.readSpecialIdentifier();
    }

    if (charCode >= 48 && charCode <= 57) { // 0-9
      return this.readNumber();
    }
    if (this.isIdentifierStart(charCode)) {
      return this.readIdentifier();
    }
    throw this.error(`Unexpected character '${char}' at position ${this.position}`);
  }

  /** Consume exactly one character and wrap it in a token of the given type. */
  private readSingle(type: TokenType, start: number, startLine: number, startColumn: number): Token {
    this.advance();
    return this.createToken(
      type,
      this.input.substring(start, this.position),
      start,
      this.position,
      startLine,
      startColumn
    );
  }

  /**
   * Handle a '/' at the current position: a line comment, a block comment or
   * the division operator.
   *
   * @returns A hidden comment token when `preserveTrivia` is on, null when a
   *          comment was skipped, otherwise the '/' OPERATOR token.
   */
  private readSlashOrComment(start: number, startLine: number, startColumn: number): Token | null {
    const next = this.peek();
    if (next === '/' || next === '*') {
      const isLineComment = next === '/';
      if (isLineComment) {
        this.skipLineComment();
      } else {
        this.skipBlockComment();
      }
      if (!this.options.preserveTrivia) {
        return null;
      }
      const token = this.createToken(
        isLineComment ? TokenType.LINE_COMMENT : TokenType.BLOCK_COMMENT,
        this.input.substring(start, this.position),
        start,
        this.position,
        startLine,
        startColumn
      );
      token.channel = Channel.HIDDEN;
      return token;
    }
    return this.readSingle(TokenType.OPERATOR, start, startLine, startColumn);
  }

  /** True for A-Z, a-z and underscore. NaN (end of input) yields false. */
  private isIdentifierStart(charCode: number): boolean {
    return (charCode >= 65 && charCode <= 90) || // A-Z
      (charCode >= 97 && charCode <= 122) || // a-z
      charCode === 95; // _
  }

  /** True for identifier-start characters and the digits 0-9. */
  private isIdentifierPart(charCode: number): boolean {
    return this.isIdentifierStart(charCode) || (charCode >= 48 && charCode <= 57);
  }

  /** Advance past a (possibly empty) run of identifier characters. */
  private skipIdentifierPart(): void {
    while (
      this.position < this.input.length &&
      this.isIdentifierPart(this.input.charCodeAt(this.position))
    ) {
      this.advance();
    }
  }

  /**
   * Advance past a delimited span whose opening delimiter is the current
   * character. A backslash escapes the character after it, so an escaped
   * delimiter does not close the span.
   *
   * @param closer              Closing delimiter character.
   * @param unterminatedMessage Error text used when the input ends first.
   * @throws Error when the closing delimiter is never found.
   */
  private scanDelimited(closer: string, unterminatedMessage: string): void {
    this.advance(); // Skip opening delimiter
    while (this.position < this.input.length) {
      const ch = this.current();
      if (ch === closer) {
        this.advance(); // Skip closing delimiter
        return;
      }
      if (ch === '\\') {
        this.advance(); // Skip escape character
        if (this.position >= this.input.length) {
          throw this.error(unterminatedMessage);
        }
      }
      this.advance();
    }
    throw this.error(unterminatedMessage);
  }

  /**
   * Require `count` consecutive digits at the current position and consume them.
   *
   * @throws Error with `message` at the first non-digit.
   */
  private expectDigits(count: number, message: string): void {
    for (let i = 0; i < count; i++) {
      if (!this.isDigit(this.current())) {
        throw this.error(message);
      }
      this.advance();
    }
  }

  /** Read a plain identifier; the first character was validated by the caller. */
  private readIdentifier(): Token {
    const start = this.position;
    const startLine = this.line;
    const startColumn = this.column;
    this.advance();
    this.skipIdentifierPart();
    const value = this.input.substring(start, this.position);
    return this.createToken(TokenType.IDENTIFIER, value, start, this.position, startLine, startColumn);
  }

  /**
   * Read a backtick-delimited identifier. The token value keeps the backticks.
   *
   * @throws Error when the closing backtick is missing.
   */
  private readDelimitedIdentifier(): Token {
    const start = this.position;
    const startLine = this.line;
    const startColumn = this.column;
    this.scanDelimited('`', 'Unterminated delimited identifier');
    const value = this.input.substring(start, this.position);
    return this.createToken(TokenType.IDENTIFIER, value, start, this.position, startLine, startColumn);
  }

  /**
   * Read `$` followed by zero or more identifier characters.
   * Every `$...` sequence (even a bare `$`) becomes a SPECIAL_IDENTIFIER.
   */
  private readSpecialIdentifier(): Token {
    const start = this.position;
    const startLine = this.line;
    const startColumn = this.column;
    this.advance(); // Skip $
    this.skipIdentifierPart();
    const value = this.input.substring(start, this.position);
    return this.createToken(TokenType.SPECIAL_IDENTIFIER, value, start, this.position, startLine, startColumn);
  }

  /**
   * Read an environment variable in one of three forms: %identifier,
   * %`delimited` or %'string' (the last kept for backwards compatibility).
   * The token value keeps the leading % and any delimiters.
   *
   * @throws Error when a delimited form is unterminated or when % is not
   *         followed by a valid name.
   */
  private readEnvironmentVariable(): Token {
    const start = this.position;
    const startLine = this.line;
    const startColumn = this.column;
    this.advance(); // Skip %

    const opener = this.current();
    if (opener === '`' || opener === "'") {
      this.scanDelimited(opener, 'Unterminated environment variable');
    } else {
      // Simple identifier: %identifier
      if (!this.isIdentifierStart(this.input.charCodeAt(this.position))) {
        throw this.error('Invalid environment variable name');
      }
      this.skipIdentifierPart();
    }

    const value = this.input.substring(start, this.position);
    return this.createToken(TokenType.ENVIRONMENT_VARIABLE, value, start, this.position, startLine, startColumn);
  }

  /**
   * Read a quoted string. The token value keeps the quotes and the raw
   * (undecoded) escape sequences.
   *
   * @param quote The quote character that opened the string.
   * @throws Error when the closing quote is missing.
   */
  private readString(quote: string): Token {
    const start = this.position;
    const startLine = this.line;
    const startColumn = this.column;
    this.scanDelimited(quote, 'Unterminated string');
    const value = this.input.substring(start, this.position);
    return this.createToken(TokenType.STRING, value, start, this.position, startLine, startColumn);
  }

  /**
   * Read an integer or decimal number. A '.' is only taken as a decimal point
   * when a digit follows, so `1.foo()` lexes as NUMBER, DOT, IDENTIFIER.
   */
  private readNumber(): Token {
    const start = this.position;
    const startLine = this.line;
    const startColumn = this.column;
    while (this.isDigit(this.current())) {
      this.advance();
    }
    if (this.current() === '.' && this.isDigit(this.peek())) {
      this.advance(); // Skip .
      while (this.isDigit(this.current())) {
        this.advance();
      }
    }
    const value = this.input.substring(start, this.position);
    return this.createToken(TokenType.NUMBER, value, start, this.position, startLine, startColumn);
  }

  /** Dispatch on what follows '@': `T` starts a TIME, anything else a DATETIME. */
  private readDateTimeOrTime(): Token {
    const start = this.position;
    const startLine = this.line;
    const startColumn = this.column;
    this.advance(); // Skip @
    if (this.current() === 'T') {
      return this.readTime(start, startLine, startColumn);
    }
    return this.readDateTime(start, startLine, startColumn);
  }

  /**
   * Read the remainder of a datetime literal after '@':
   * YYYY[-MM[-DD]][T[time]][timezone].
   *
   * @throws Error when the year, or a month/day introduced by '-', is malformed.
   */
  private readDateTime(start: number, startLine: number, startColumn: number): Token {
    this.expectDigits(4, 'Invalid datetime format'); // Year
    if (this.current() === '-') {
      this.advance();
      this.expectDigits(2, 'Invalid datetime format'); // Month
      if (this.current() === '-') {
        this.advance();
        this.expectDigits(2, 'Invalid datetime format'); // Day
      }
    }
    // Optional time part
    if (this.current() === 'T') {
      this.advance();
      this.readTimeFormat();
    }
    // Optional timezone
    this.readTimezone();
    const value = this.input.substring(start, this.position);
    return this.createToken(TokenType.DATETIME, value, start, this.position, startLine, startColumn);
  }

  /** Read the remainder of a time literal; the current character is the 'T'. */
  private readTime(start: number, startLine: number, startColumn: number): Token {
    this.advance(); // Skip T
    this.readTimeFormat();
    const value = this.input.substring(start, this.position);
    return this.createToken(TokenType.TIME, value, start, this.position, startLine, startColumn);
  }

  /**
   * Consume hh[:mm[:ss[.fff...]]].
   *
   * The hour is lenient: scanning simply stops at the first non-digit, because
   * a datetime may end right after 'T'. Once a ':' or '.' has been consumed,
   * the digits that must follow are mandatory.
   *
   * @throws Error when minutes, seconds or the fraction are malformed.
   */
  private readTimeFormat(): void {
    // Hour (up to 2 digits, optional)
    for (let i = 0; i < 2; i++) {
      if (!this.isDigit(this.current())) {
        return;
      }
      this.advance();
    }
    // Optional minutes
    if (this.current() !== ':') {
      return;
    }
    this.advance();
    this.expectDigits(2, 'Invalid time format');
    // Optional seconds
    if (this.current() !== ':') {
      return;
    }
    this.advance();
    this.expectDigits(2, 'Invalid time format');
    // Optional fractional seconds (at least one digit)
    if (this.current() !== '.') {
      return;
    }
    this.advance();
    if (!this.isDigit(this.current())) {
      throw this.error('Invalid time format');
    }
    while (this.isDigit(this.current())) {
      this.advance();
    }
  }

  /**
   * Consume an optional timezone suffix: `Z`, or a sign followed by hh[:mm].
   *
   * The sign is only consumed when two hour digits follow it. Otherwise the
   * '+' or '-' is left in place so it is lexed as an arithmetic operator
   * (e.g. `@2020-01-01+1` is DATETIME, OPERATOR, NUMBER) instead of being
   * swallowed into the datetime token.
   *
   * @throws Error when a ':' after the hours is not followed by two digits.
   */
  private readTimezone(): void {
    const sign = this.current();
    if (sign === 'Z') {
      this.advance();
      return;
    }
    if (sign !== '+' && sign !== '-') {
      return;
    }
    const hasTwoHourDigits =
      this.isDigit(this.input.charAt(this.position + 1)) &&
      this.isDigit(this.input.charAt(this.position + 2));
    if (!hasTwoHourDigits) {
      return; // Not a timezone offset; leave the sign for the next token
    }
    this.advance(); // Sign
    this.advance(); // Hour, first digit
    this.advance(); // Hour, second digit
    if (this.current() === ':') {
      this.advance();
      this.expectDigits(2, 'Invalid timezone format'); // Minutes
    }
  }

  /** Advance past spaces, tabs, carriage returns and newlines. */
  private skipWhitespace(): void {
    while (this.isWhitespace(this.current())) {
      this.advance();
    }
  }

  /**
   * Advance past a `//` comment. The terminating line ending is not consumed.
   * Both `\n` and `\r` end the comment, matching how line numbers are counted.
   */
  private skipLineComment(): void {
    // Skip //
    this.advance();
    this.advance();
    while (this.position < this.input.length) {
      const ch = this.current();
      if (ch === '\n' || ch === '\r') {
        break;
      }
      this.advance();
    }
  }

  /**
   * Advance past a block comment, including its closing marker.
   * An unterminated comment silently runs to the end of the input.
   */
  private skipBlockComment(): void {
    // Skip /*
    this.advance();
    this.advance();
    while (this.position < this.input.length) {
      if (this.current() === '*' && this.peek() === '/') {
        this.advance(); // Skip *
        this.advance(); // Skip /
        break;
      }
      this.advance();
    }
  }

  /**
   * Move one character forward (no-op at end of input), updating the 1-based
   * line/column counters when position tracking is enabled.
   */
  private advance(): void {
    if (this.position >= this.input.length) {
      return;
    }
    if (this.options.trackPosition) {
      const char = this.current();
      if (char === '\n') {
        this.line++;
        this.column = 1;
      } else if (char === '\r') {
        // For \r\n the line is bumped when the \n is consumed; only a
        // standalone \r starts a new line here.
        if (this.peek() !== '\n') {
          this.line++;
          this.column = 1;
        }
      } else {
        this.column++;
      }
    }
    this.position++;
  }

  /** Character at the current position, or '' at end of input. */
  private current(): string {
    return this.input.charAt(this.position);
  }

  /** Character after the current one, or '' when there is none. */
  private peek(): string {
    return this.input.charAt(this.position + 1);
  }

  /** True when `char` starts with an ASCII digit; '' yields false. */
  private isDigit(char: string): boolean {
    if (!char) return false;
    const code = char.charCodeAt(0);
    return code >= 48 && code <= 57; // 0-9
  }

  /** True for space, tab, newline and carriage return; '' yields false. */
  private isWhitespace(char: string): boolean {
    return char === ' ' || char === '\t' || char === '\n' || char === '\r';
  }

  /**
   * Convert an absolute offset to a zero-based LSP position.
   * Returns line 0 / character 0 when position tracking is disabled.
   */
  private offsetToPosition(offset: number): Position {
    if (!this.options.trackPosition) {
      return { line: 0, character: 0, offset };
    }
    // Binary search for the last line whose start offset is <= offset.
    let low = 0;
    let high = this.lineOffsets.length - 1;
    while (low < high) {
      const mid = Math.floor((low + high + 1) / 2);
      if (this.lineOffsets[mid]! <= offset) {
        low = mid;
      } else {
        high = mid - 1;
      }
    }
    const line = low;
    const lineStart = this.lineOffsets[line]!;
    const character = offset - lineStart;
    return { line, character, offset };
  }

  /**
   * Assemble a token.
   *
   * @param type   Token category.
   * @param value  Raw token text.
   * @param start  Start offset (inclusive).
   * @param end    End offset (exclusive).
   * @param line   1-based start line; defaults to the current line.
   * @param column 1-based start column; defaults to the current column.
   * @returns The token. With position tracking off, line and column are 0 and
   *          no range is attached.
   */
  private createToken(
    type: TokenType,
    value: string,
    start: number,
    end: number,
    line: number = this.line,
    column: number = this.column
  ): Token {
    const token: Token = {
      type,
      value,
      start,
      end,
      line: this.options.trackPosition ? line : 0,
      column: this.options.trackPosition ? column : 0,
    };
    // Add LSP-compatible range if tracking positions
    if (this.options.trackPosition) {
      token.range = {
        start: this.offsetToPosition(start),
        end: this.offsetToPosition(end),
      };
    }
    return token;
  }

  /** Build (not throw) an Error whose message is prefixed with "Lexer error: ". */
  private error(message: string): Error {
    return new Error(`Lexer error: ${message}`);
  }

  // Public methods for parser use

  /**
   * Get the text value for a token
   */
  getTokenText(token: Token): string {
    return token.value;
  }

  /**
   * Check if a token is an identifier (including keyword operators)
   */
  static isIdentifier(token: Token): boolean {
    return token.type === TokenType.IDENTIFIER;
  }

  /**
   * Check if a token could be a keyword operator (parser decides)
   */
  static couldBeKeywordOperator(token: Token): boolean {
    return token.type === TokenType.IDENTIFIER;
  }
}
/**
 * Report whether a token belongs to the given category.
 *
 * @param token Token to inspect.
 * @param type  Category to compare against.
 * @returns true when the token's type equals `type`.
 */
export function isTokenType(token: Token, type: TokenType): boolean {
  const actual = token.type;
  return actual === type;
}
/**
 * Report whether a token is a symbol operator.
 *
 * @param token Token to inspect.
 * @returns true when the token's type is TokenType.OPERATOR.
 */
export function isOperator(token: Token): boolean {
  const { type } = token;
  return type === TokenType.OPERATOR;
}
/**
 * Report whether a token is one particular symbol operator.
 *
 * @param token Token to inspect.
 * @param value Operator text to match exactly, e.g. '<='.
 * @returns true when the token is an OPERATOR whose value equals `value`.
 */
export function isOperatorValue(token: Token, value: string): boolean {
  if (token.type !== TokenType.OPERATOR) {
    return false;
  }
  return token.value === value;
}
/**
 * Report whether a token is an environment variable reference.
 *
 * @param token Token to inspect.
 * @returns true when the token's type is TokenType.ENVIRONMENT_VARIABLE.
 */
export function isEnvironmentVariable(token: Token): boolean {
  const expected = TokenType.ENVIRONMENT_VARIABLE;
  return token.type === expected;
}