@hi18n/core
Version:
Message internationalization meets immutability and type-safety - core runtime
489 lines (464 loc) • 15.1 kB
text/typescript
import { ParseError } from "./errors.ts";
import {
DateTimeArg,
NumberArg,
StringArg,
type CompiledMessage,
type ElementArg,
type PluralArg,
type PluralBranch,
type VarArg,
} from "./msgfmt.ts";
// Matches messages that contain none of the characters that need real parsing
// (quote, braces, or the start of a tag).
const SIMPLE_MESSAGE = /^[^'{}<]*$/;

// Compiles a message source string.
// Messages without any special character are returned unchanged; everything
// else goes through the full parser, which must consume the whole input.
export function parseMessage(msg: string): CompiledMessage {
  if (!SIMPLE_MESSAGE.test(msg)) {
    const parser = createParser(msg);
    return parseMessageEOF.call(parser);
  }
  return msg;
}
// The argType keywords that parseArgument passes on to fromArgTypeAndStyle.
type ArgType = "number" | "date" | "time";
// Runtime list of the same keywords; parseArgument looks an identifier up here
// before casting it to ArgType.
const ARG_TYPES = ["number", "date", "time"];
// References for ICU MessageFormat syntax:
// https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/text/MessageFormat.html
// https://unicode-org.github.io/icu/userguide/format_parse/messages/
// Mutable parser state. The parse* functions in this file receive it as `this`.
interface Parser {
// The message source being parsed.
src: string;
// Index into `src` of the next character to be read.
pos: number;
// Regex used by parseRawMessageText to consume unquoted text.
reText: RegExp;
// Regex used by parseRawMessageText to consume text inside a quoted section.
reQuotedText: RegExp;
}
// Builds a fresh parser state positioned at the start of `src`.
// Each parser gets its own sticky regex instances because their `lastIndex`
// is mutated while scanning.
function createParser(src: string): Parser {
  const parser: Parser = {
    src: src,
    pos: 0,
    // Run of characters that are neither a quote nor a syntax character.
    reText: /[^'{}#<]*/y,
    // Run of characters up to (not including) the next quote.
    reQuotedText: /[^']*/y,
  };
  return parser;
}
// Parses a complete message and insists that the whole input was consumed.
// Throws ParseError naming the first leftover character otherwise.
function parseMessageEOF(this: Parser): CompiledMessage {
  const compiled = parseMessage_.call(this);
  const consumedAll = this.pos >= this.src.length;
  if (consumedAll) {
    return compiled;
  }
  throw new ParseError(`Found an unmatching ${this.src[this.pos]!}`);
}
// message = messageText (argument messageText)*
// The grammar doesn't mention it but it should also have '#' as a special interpolation.
//
// Parses text interleaved with arguments, '#' substitutions and elements.
// Stops (without consuming) at EOF, at '}' or at the start of a closing tag.
// `hashSubst` is what '#' expands to; when it is absent, '#' is plain text.
function parseMessage_(this: Parser, hashSubst?: VarArg): CompiledMessage {
  // '#' is literal text exactly when there is nothing to substitute for it.
  const hashIsText = hashSubst == null;
  const parts: CompiledMessage[] = [];
  pushString(parts, parseMessageText.call(this, hashIsText));

  while (this.pos < this.src.length) {
    const ch = this.src[this.pos];
    if (ch === "}") {
      break;
    }

    if (ch === "{") {
      parts.push(parseArgument.call(this));
    } else if (ch === "#") {
      if (!hashSubst) {
        throw new Error("Bug: # found outside plural argument");
      }
      parts.push(hashSubst);
      this.pos++;
    } else if (ch === "<") {
      const isClosingTag =
        this.pos + 1 < this.src.length && this.src[this.pos + 1] === "/";
      if (isClosingTag) {
        // </tag> -- left for the enclosing parseElement to consume
        break;
      }
      // <tag> or <tag/>
      parts.push(parseElement.call(this, hashSubst));
    } else {
      throw new Error(
        `Bug: invalid syntax character: ${this.src[this.pos]!}`,
      );
    }

    pushString(parts, parseMessageText.call(this, hashIsText));
  }

  return reduceMessage(parts);
}
// messageText consists of three parts:
//
// - plain message text
// - quoted message text
// - escaped quotes
//
// Returns the unescaped text and leaves the parser at the next syntax
// character or at EOF. When `allowHash` is true a '#' is ordinary text.
// Throws ParseError if a quoted section is still open at EOF.
function parseMessageText(this: Parser, allowHash: boolean): string {
  const pieces: string[] = [];
  let quoted = false;
  pieces.push(parseRawMessageText.call(this, quoted));

  while (this.pos < this.src.length) {
    const ch = this.src[this.pos];
    // Lookahead character; undefined when `ch` is the last one.
    const next = this.src[this.pos + 1];

    if (ch === "'") {
      if (next === "'") {
        // Self-escaped quotation
        pieces.push("'");
        this.pos += 2;
      } else if (quoted) {
        // End of quoted text
        quoted = false;
        this.pos += 1;
      } else if (next !== undefined && /[{}#|<]/.test(next)) {
        // Beginning of quoted text
        quoted = true;
        this.pos += 1;
      } else {
        // Literal quote
        pieces.push("'");
        this.pos += 1;
      }
    } else if (allowHash && ch === "#") {
      // A plain '#' character. It is special only within pluralStyle.
      pieces.push("#");
      this.pos += 1;
    } else {
      // Syntax character ({, }, #, <)
      break;
    }

    pieces.push(parseRawMessageText.call(this, quoted));
  }

  if (quoted) {
    throw new ParseError("Unclosed quoted string");
  }
  return pieces.join("");
}
// Eats up the text until it encounters a syntax character ('{', '}', '#', '<'), a quote ("'"), or EOF.
// In quoted mode, the four syntax characters ('{', '}', '#', '<') are considered part of the text.
// Returns the consumed text (possibly empty) and advances `pos` past it.
function parseRawMessageText(this: Parser, inQuote: boolean): string {
  const pattern = inQuote ? this.reQuotedText : this.reText;
  // The patterns are sticky, so matching starts exactly at the current position.
  pattern.lastIndex = this.pos;
  // Both patterns can match the empty string, so exec() succeeds here.
  const [consumed] = pattern.exec(this.src)!;
  this.pos += consumed.length;
  return consumed;
}
// Something enclosed within {}.
// argument = noneArg | simpleArg | complexArg
// complexArg = choiceArg | pluralArg | selectArg | selectordinalArg
//
// Called with the parser positioned on the opening brace; consumes through
// the matching closing brace.
function parseArgument(this: Parser): CompiledMessage {
  this.pos++; // Eat the open brace
  const name = parseArgNameOrNumber.call(this);

  // {name}
  const afterName = nextToken.call<
    Parser,
    [readonly ["}", ","]],
    ["}" | ",", string]
  >(this, ["}", ","] as const)[0];
  if (afterName === "}") {
    return StringArg(name);
  }

  // {name, argType ...}
  const argTypeName = nextToken.call(this, ["identifier"] as const)[1];
  if (argTypeName === "choice") {
    throw new ParseError("choice is not supported");
  }
  if (argTypeName === "plural") {
    return parsePluralArgument.call(this, name);
  }
  if (argTypeName === "select" || argTypeName === "selectordinal") {
    throw new Error("Unimplemented: selectArg");
  }
  if (ARG_TYPES.indexOf(argTypeName) === -1) {
    throw new ParseError(`Invalid argType: ${argTypeName}`);
  }
  const argType = argTypeName as ArgType;

  // {name, argType}
  const afterType = nextToken.call<
    Parser,
    [readonly ["}", ","]],
    ["}" | ",", string]
  >(this, ["}", ","] as const)[0];
  if (afterType === "}") {
    return fromArgTypeAndStyle(name, argType, undefined);
  }

  // {name, argType, argStyle} or {name, date, ::skeleton}
  const [styleKind, styleText] = nextToken.call<
    Parser,
    [readonly ["identifier", "::"]],
    ["identifier" | "::", string]
  >(this, ["identifier", "::"] as const);
  if (styleKind === "identifier") {
    // The closing brace is consumed before the style keyword is validated.
    nextToken.call(this, ["}"] as const);
    return fromArgTypeAndStyle(name, argType, styleText);
  }

  // "::" introduces a skeleton, which is accepted for date arguments only.
  if (argType !== "date") {
    throw new ParseError(`Invalid argStyle for ${argType}: ::`);
  }
  const skeletonText = nextToken.call(this, ["identifier"] as const)[1];
  const dateTimeFormat = parseDateSkeleton(skeletonText);
  nextToken.call(this, ["}"] as const);
  return DateTimeArg(name, dateTimeFormat);
}
// Builds the argument node for a simple argument from its type keyword and
// optional style keyword.
// Throws ParseError when the style keyword is not accepted for the given type,
// and TypeError when the type itself is none of number/date/time.
function fromArgTypeAndStyle(
  name: string | number,
  argType: ArgType,
  argStyle: string | undefined,
): VarArg {
  if (argType === "number") {
    if (argStyle === undefined) {
      return NumberArg(name, {});
    }
    if (argStyle === "integer") {
      return NumberArg(name, { maximumFractionDigits: 0 });
    }
    if (argStyle === "percent") {
      return NumberArg(name, { style: "percent" });
    }
    throw new ParseError(`Invalid argStyle for number: ${argStyle}`);
  }

  if (argType === "date") {
    if (
      argStyle === undefined ||
      argStyle === "short" ||
      argStyle === "medium" ||
      argStyle === "long" ||
      argStyle === "full"
    ) {
      // A missing style falls back to "medium".
      return DateTimeArg(name, { dateStyle: argStyle ?? "medium" });
    }
    throw new ParseError(`Invalid argStyle for date: ${argStyle}`);
  }

  if (argType === "time") {
    if (
      argStyle === undefined ||
      argStyle === "short" ||
      argStyle === "medium" ||
      argStyle === "long" ||
      argStyle === "full"
    ) {
      // A missing style falls back to "medium".
      return DateTimeArg(name, { timeStyle: argStyle ?? "medium" });
    }
    throw new ParseError(`Invalid argStyle for time: ${argStyle}`);
  }

  throw new TypeError(`Unknown argType: ${argType as string}`);
}
// pluralStyle = [offsetValue] (selector '{' message '}')+
// offsetValue = "offset:" number
// selector = explicitValue | keyword
// explicitValue = '=' number // adjacent, no white space in between
// keyword = [^[[:Pattern_Syntax:][:Pattern_White_Space:]]]+
//
// Called right after the "plural" keyword; consumes the comma, the optional
// offset, every branch and the closing brace of the argument.
// The last branch must use the selector "other" and becomes the fallback.
function parsePluralArgument(this: Parser, name: string | number): PluralArg {
  nextToken.call(this, [","]);
  let token = nextToken.call(this, [
    "offset:",
    "identifier",
    "=",
    "}",
  ] as const);

  let offset: number | undefined = undefined;
  if (token[0] === "offset:") {
    const offsetText = nextToken.call(this, ["number"] as const)[1];
    offset = parseNumber(offsetText);
    token = nextToken.call(this, ["identifier", "=", "}"] as const);
  }
  const subtract = offset ?? 0;

  const branches: PluralBranch[] = [];
  for (; token[0] !== "}"; token = nextToken.call(this, ["identifier", "=", "}"] as const)) {
    // "=N" selects an exact value (no whitespace allowed before N);
    // anything else is a keyword selector.
    const selector: string | number =
      token[0] === "="
        ? parseNumber(nextToken.call(this, ["number"], ["number"])[1])
        : token[1];

    nextToken.call(this, ["{"]);
    // Inside a branch, '#' stands for the argument value minus the offset.
    const hashSubst = NumberArg(name, {}, { subtract });
    const message = parseMessage_.call(this, hashSubst);
    nextToken.call(this, ["}"]);
    branches.push({ selector, message });
  }

  const last = branches.pop();
  if (last === undefined) {
    throw new ParseError("No branch found");
  }
  if (last.selector !== "other") {
    throw new ParseError("Last selector should be other");
  }
  return { type: "Plural", name, subtract, branches, fallback: last.message };
}
// <tag>message</tag> or <tag/>
//
// Called with the parser positioned on '<'. No whitespace is allowed between
// '<' and the tag name, between '</' characters, or inside '/>'.
// Throws ParseError when the closing tag name differs from the opening one.
function parseElement(this: Parser, hashSubst?: VarArg): ElementArg {
  this.pos++; // Eat <
  const name = parseArgNameOrNumber.call(this, true);
  const afterName = nextToken.call(this, ["/", ">"] as const)[0];

  if (afterName !== "/") {
    // <tag>message</tag>
    const message = parseMessage_.call(this, hashSubst);
    nextToken.call(this, ["<"]);
    nextToken.call(this, ["/"], ["/"]);
    const closingName = parseArgNameOrNumber.call(this, true);
    nextToken.call(this, [">"]);
    if (name !== closingName) {
      throw new ParseError(
        `Tag ${name} closed with a different name: ${closingName}`,
      );
    }
    return { type: "Element", name, message };
  }

  // <tag/>
  nextToken.call(this, [">"], [">"]);
  return { type: "Element", name, message: undefined };
}
// argNameOrNumber = argName | argNumber
// argName = [^[[:Pattern_Syntax:][:Pattern_White_Space:]]]+
// argNumber = '0' | ('1'..'9' ('0'..'9')*)
//
// Reads the next token as an argument name (string) or argument index
// (number). With `noSpace`, whitespace before the token is an error.
function parseArgNameOrNumber(this: Parser, noSpace = false): number | string {
  const spaceSensitiveKinds = noSpace ? ["number", "identifier"] : undefined;
  const [kind, token] = nextToken.call(
    this,
    ["number", "identifier"] as const,
    spaceSensitiveKinds,
  );
  return kind === "number" ? parseNumber(token) : token;
}
// Reads the next token and checks it against the caller's expectations.
//
// expected     - token kinds that are acceptable here; anything else throws ParseError.
// noWhitespace - token kinds that must not be preceded by whitespace.
// Returns the [kind, text] pair of the token.
function nextToken<E extends readonly string[]>(
  this: Parser,
  expected: E,
  noWhitespace?: string[],
): [E[number], string] {
  const [kind, token, foundWhitespace] = nextTokenImpl.call(this);

  const isExpected = expected.indexOf(kind) !== -1;
  if (!isExpected) {
    throw new ParseError(
      `Unexpected token ${kind} (expected ${expected.join(", ")})`,
    );
  }

  if (foundWhitespace && noWhitespace && noWhitespace.indexOf(kind) !== -1) {
    throw new ParseError("No space allowed here");
  }

  return [kind, token];
}
// Skips leading whitespace and scans one token.
// Returns [kind, text, foundWhitespace], where kind is "EOF", "offset:",
// "number", "identifier", "::" or, for anything else, the single character itself.
function nextTokenImpl(this: Parser): [string, string, boolean] {
  const foundWhitespace = skipWhitespace.call(this);
  const begin = this.pos;
  if (begin >= this.src.length) {
    return ["EOF", "", foundWhitespace];
  }

  const first = this.src[begin]!;
  // It should be /[\p{Pattern_Syntax}\p{Pattern_White_Space}]/u
  // but for compatibility reasons I'm not yet sure we can use it now.
  const isWordChar = (c: string): boolean => /[0-9A-Z_a-z]/.test(c);

  let kind: string;
  let end: number;
  if (this.src.startsWith("offset:", begin)) {
    // Checked before the identifier rule so that "offset:" is one token.
    kind = "offset:";
    end = begin + "offset:".length;
  } else if (isWordChar(first)) {
    // The first character alone decides between number and identifier.
    kind = /[0-9]/.test(first) ? "number" : "identifier";
    end = begin;
    while (end < this.src.length && isWordChar(this.src[end]!)) {
      end++;
    }
  } else if (this.src.startsWith("::", begin)) {
    kind = "::";
    end = begin + "::".length;
  } else {
    kind = first;
    end = begin + 1;
  }

  this.pos = end;
  return [kind, this.src.substring(begin, end), foundWhitespace];
}
// Advances the parser past any run of whitespace (as matched by /\s/).
// Returns true when at least one character was skipped.
function skipWhitespace(this: Parser): boolean {
  const start = this.pos;
  let cursor = start;
  while (cursor < this.src.length && /\s/.test(this.src[cursor]!)) {
    cursor += 1;
  }
  this.pos = cursor;
  return cursor > start;
}
// Converts a number token to its numeric value.
// Only "0" or a digit string without a leading zero is accepted; anything
// else throws ParseError.
function parseNumber(token: string): number {
  const wellFormed = /^(?:0|[1-9][0-9]*)$/.test(token);
  if (wellFormed) {
    return parseInt(token);
  }
  throw new ParseError(`Invalid number: ${token}`);
}
// Collapses a list of message parts into the simplest equivalent form:
// no parts -> the empty string, one part -> that part itself,
// otherwise the list is returned as-is.
function reduceMessage(msg: CompiledMessage[]): CompiledMessage {
  switch (msg.length) {
    case 0:
      return "";
    case 1:
      return msg[0]!;
    default:
      return msg;
  }
}
// Appends `msg` to `buf`, skipping empty strings so that no empty text parts
// end up in the buffer.
function pushString(buf: CompiledMessage[], msg: string) {
  if (msg === "") {
    return;
  }
  buf.push(msg);
}
// Translates a date skeleton (e.g. "yMMMd") into Intl.DateTimeFormat options.
//
// The skeleton is split into runs of one repeated letter. Each run is looked up
// in dateTokenMap: the letter selects the option name and the run length selects
// its value. The hour letters h/H/k/K additionally set `hourCycle`.
//
// Throws ParseError when
// - a letter is not in dateTokenMap,
// - the run length has no value for that letter (e.g. "yyy" or "GG"), or
// - none of the fields listed in requiredDateFields was specified.
function parseDateSkeleton(skeleton: string): Intl.DateTimeFormatOptions {
  const options: Record<string, string | number | undefined> = {};
  // skeletonTokens() stands in for skeleton.matchAll(/(.)\1*/g).
  for (const [run, letter] of skeletonTokens(skeleton)) {
    // hasOwnProperty guards against inherited keys such as "constructor".
    const spec = Object.prototype.hasOwnProperty.call(dateTokenMap, letter)
      ? dateTokenMap[letter]
      : undefined;
    // Index 0 holds the option name; index N holds the value for a run of N letters.
    const value = spec?.[run.length];
    // Compare against the value `undefined` (not the string "undefined") so that
    // unsupported run lengths are rejected instead of being stored as undefined.
    if (spec === undefined || value === undefined) {
      throw new ParseError(`Invalid date skeleton: ${run}`);
    }
    options[spec[0]] = value;
    if (/[hHkK]/.test(letter)) {
      options["hourCycle"] = hourCycleMap[letter as keyof typeof hourCycleMap];
    }
  }
  if (requiredDateFields.every((f) => options[f] === undefined)) {
    throw new ParseError(
      `Insufficient fields in the date skeleton: ${skeleton}`,
    );
  }
  return options as Intl.DateTimeFormatOptions;
}

// Splits a skeleton into runs of one repeated character.
// Each entry is [run, character], e.g. "yyMd" -> ["yy","y"], ["M","M"], ["d","d"].
function skeletonTokens(skeleton: string): [string, string][] {
  const tokens: [string, string][] = [];
  let i = 0;
  while (i < skeleton.length) {
    const start = i;
    const ch = skeleton[i]!;
    while (i < skeleton.length && skeleton[i] === ch) {
      i++;
    }
    tokens.push([skeleton.substring(start, i), ch]);
  }
  return tokens;
}

// A skeleton must set at least one of these options to be accepted.
const requiredDateFields = [
  "weekday",
  "year",
  "month",
  "day",
  "dayPeriod",
  "hour",
  "minute",
  "second",
  "fractionalSecondDigits",
];

// Skeleton letter -> [option name, value for 1 letter, value for 2 letters, ...].
// An `undefined` slot (or a length past the end) means that run length is not supported.
const dateTokenMap: Record<
  string,
  [string, ...(string | number | undefined)[]]
> = {
  G: ["era", "short", undefined, undefined, "long", "narrow"],
  y: ["year", "numeric", "2-digit"],
  M: ["month", "numeric", "2-digit", "short", "long", "narrow"],
  d: ["day", "numeric", "2-digit"],
  E: ["weekday", "short", undefined, undefined, "long", "narrow"],
  a: ["dayPeriod", "short", undefined, undefined, "long", "narrow"],
  h: ["hour", "numeric", "2-digit"],
  H: ["hour", "numeric", "2-digit"],
  k: ["hour", "numeric", "2-digit"],
  K: ["hour", "numeric", "2-digit"],
  j: ["hour", "numeric", "2-digit"],
  m: ["minute", "numeric", "2-digit"],
  s: ["second", "numeric", "2-digit"],
  S: ["fractionalSecondDigits", 1, 2, 3],
  z: ["timeZoneName", "short", undefined, undefined, "long"],
  O: ["timeZoneName", "shortOffset", undefined, undefined, "longOffset"],
  v: ["timeZoneName", "shortGeneric", undefined, undefined, "longGeneric"],
};

// Hour letter -> value stored in the `hourCycle` option.
const hourCycleMap = {
  h: "h12",
  H: "h23",
  k: "h24",
  K: "h11",
};