@fluent/bundle
Version:
Localization library for expressive translations.
437 lines (436 loc) • 17.9 kB
JavaScript
// Regular expressions shared by the FluentResource parser below. Patterns with
// the sticky /y flag only match exactly at their `lastIndex`; the parser's
// test() and match() helpers assign the current cursor to `lastIndex` before
// each use, which lets them match at any offset without slicing the source.
//
// This regex is used to iterate through the beginnings of messages and terms.
// With the /m flag, the ^ matches at the beginning of every line. Group 1
// captures the identifier, including the leading "-" of a term. Because of the
// /g flag its `lastIndex` is stateful across exec() calls.
const RE_MESSAGE_START = /^(-?[a-zA-Z][\w-]*) *= */gm;
// Both Attributes and Variants are parsed in while loops. These regexes are
// used to break out of them.
// Group 1 captures the attribute name that follows the leading dot.
const RE_ATTRIBUTE_START = /\.([a-zA-Z][\w-]*) *= */y;
// An optional "*" (the default-variant marker) followed by "[".
const RE_VARIANT_START = /\*?\[/y;
// Group 1 captures the whole number; group 2 captures the digits after the
// decimal point, if any.
const RE_NUMBER_LITERAL = /(-?[0-9]+(?:\.([0-9]+))?)/y;
// Group 1 captures a bare identifier.
const RE_IDENTIFIER = /([a-zA-Z][\w-]*)/y;
// Group 1 captures the optional sigil ("$" or "-"), group 2 the name and
// group 3 the optional attribute name after a dot.
const RE_REFERENCE = /([$-])?([a-zA-Z][\w-]*)(?:\.([a-zA-Z][\w-]*))?/y;
// Not sticky: tested against an already extracted name, which must consist
// entirely of upper-case ASCII letters, digits, "_" and "-", and start with a
// letter.
const RE_FUNCTION_NAME = /^[A-Z][A-Z0-9_-]*$/;
// A "run" is a sequence of text or string literal characters which don't
// require any special handling. For TextElements such special characters are: {
// (starts a placeable), and line breaks which require additional logic to check
// if the next line is indented. For StringLiterals they are: \ (starts an
// escape sequence), " (ends the literal), and line breaks which are not allowed
// in StringLiterals. Note that string runs may be empty; text runs may not.
// A text run also stops at "}" so that the parser can inspect a closing brace
// separately from the surrounding text.
const RE_TEXT_RUN = /([^{}\n\r]+)/y;
const RE_STRING_RUN = /([^\\"\n\r]*)/y;
// Escape sequences.
// Group 1 captures the escaped character: a backslash or a double quote.
const RE_STRING_ESCAPE = /\\([\\"])/y;
// Group 1 captures the 4 hex digits of a \uXXXX escape; group 2 captures the
// 6 hex digits of a \UXXXXXX escape. Only one of the two groups is defined for
// any given match.
const RE_UNICODE_ESCAPE = /\\u([a-fA-F0-9]{4})|\\U([a-fA-F0-9]{6})/y;
// Used for trimming TextElements and indents.
const RE_LEADING_NEWLINES = /^\n+/;
const RE_TRAILING_SPACES = / +$/;
// Used in makeIndent to strip spaces from blank lines and normalize CRLF to LF.
const RE_BLANK_LINES = / *\r?\n/g;
// Used in makeIndent to measure the indentation. Group 1 captures the run of
// spaces at the very end of the blank block.
const RE_INDENT = /( *)$/;
// Common tokens. Each one also swallows the whitespace written into its
// pattern, so callers don't skip blanks separately.
const TOKEN_BRACE_OPEN = /{\s*/y;
const TOKEN_BRACE_CLOSE = /\s*}/y;
const TOKEN_BRACKET_OPEN = /\[\s*/y;
const TOKEN_BRACKET_CLOSE = /\s*] */y;
const TOKEN_PAREN_OPEN = /\s*\(\s*/y;
const TOKEN_ARROW = /\s*->\s*/y;
const TOKEN_COLON = /\s*:\s*/y;
// Note the optional comma. As a deviation from the Fluent EBNF, the parser
// doesn't enforce commas between call arguments. Every part of this pattern is
// optional, so it can match the empty string.
const TOKEN_COMMA = /\s*,?\s*/y;
// One or more whitespace characters, line breaks included.
const TOKEN_BLANK = /\s+/y;
/**
 * Fluent Resource is a structure storing parsed localization entries.
 *
 * The source is parsed eagerly by the constructor. Every message or term which
 * parses successfully is appended to `body` as `{ id, value, attributes }`:
 *
 *   - `id` is the identifier (terms keep their leading "-"),
 *   - `value` is a string for simple inline text, an array of pattern elements
 *     for complex patterns, or `null` when the entry only has attributes,
 *   - `attributes` is a prototype-less object mapping attribute names to
 *     patterns.
 *
 * Entries which fail with a SyntaxError are skipped silently.
 */
export class FluentResource {
  /**
   * @param {string} source - Fluent source text to parse.
   * @throws Any error other than SyntaxError raised during parsing.
   */
  constructor(source) {
    this.body = [];
    // RE_MESSAGE_START is a shared, stateful /g regex. Rewind it so that the
    // scan starts at the beginning of this source.
    RE_MESSAGE_START.lastIndex = 0;
    let cursor = 0;

    // Iterate over the beginnings of messages and terms to efficiently skip
    // comments and recover from errors.
    while (true) {
      let next = RE_MESSAGE_START.exec(source);
      if (next === null) {
        break;
      }

      cursor = RE_MESSAGE_START.lastIndex;
      try {
        this.body.push(parseMessage(next[1]));
      } catch (err) {
        if (err instanceof SyntaxError) {
          // Don't report any Fluent syntax errors. Skip directly to the
          // beginning of the next message or term.
          continue;
        }
        throw err;
      }
    }

    // The parser implementation is inlined below for performance reasons,
    // as well as for convenience of accessing `source` and `cursor`.

    // The parser focuses on minimizing the number of false negatives at the
    // expense of increasing the risk of false positives. In other words, it
    // aims at parsing valid Fluent messages with a success rate of 100%, but it
    // may also parse a few invalid messages which the reference parser would
    // reject. The parser doesn't perform any validation and may produce entries
    // which wouldn't make sense in the real world. For best results users are
    // advised to validate translations with the fluent-syntax parser
    // pre-runtime.

    // The parser makes an extensive use of sticky regexes which can be anchored
    // to any offset of the source string without slicing it. Errors are thrown
    // to bail out of parsing of ill-formed messages.

    // Check whether the regex matches at the cursor, without moving the cursor.
    function test(re) {
      re.lastIndex = cursor;
      return re.test(source);
    }

    // Advance the cursor by the char if it matches. May be used as a predicate
    // (was the match found?) or, if errorClass is passed, as an assertion.
    function consumeChar(char, errorClass) {
      if (source[cursor] === char) {
        cursor++;
        return true;
      }
      if (errorClass) {
        throw new errorClass(`Expected ${char}`);
      }
      return false;
    }

    // Advance the cursor by the token if it matches. May be used as a predicate
    // (was the match found?) or, if errorClass is passed, as an assertion.
    function consumeToken(re, errorClass) {
      if (test(re)) {
        cursor = re.lastIndex;
        return true;
      }
      if (errorClass) {
        throw new errorClass(`Expected ${re.toString()}`);
      }
      return false;
    }

    // Execute a regex, advance the cursor, and return all capture groups.
    // Throws a SyntaxError if the regex doesn't match at the cursor.
    function match(re) {
      re.lastIndex = cursor;
      let result = re.exec(source);
      if (result === null) {
        throw new SyntaxError(`Expected ${re.toString()}`);
      }
      cursor = re.lastIndex;
      return result;
    }

    // Execute a regex, advance the cursor, and return the capture group.
    function match1(re) {
      return match(re)[1];
    }

    // Parse the entry whose identifier has just been consumed. An entry must
    // have a value, at least one attribute, or both.
    function parseMessage(id) {
      let value = parsePattern();
      let attributes = parseAttributes();

      if (value === null && Object.keys(attributes).length === 0) {
        throw new SyntaxError("Expected message value or attributes");
      }

      return { id, value, attributes };
    }

    // Parse zero or more attributes into a prototype-less dictionary, so that
    // attribute names can never collide with Object.prototype members.
    function parseAttributes() {
      let attrs = Object.create(null);

      while (test(RE_ATTRIBUTE_START)) {
        let name = match1(RE_ATTRIBUTE_START);
        let value = parsePattern();
        if (value === null) {
          throw new SyntaxError("Expected attribute value");
        }
        attrs[name] = value;
      }

      return attrs;
    }

    // Parse a pattern. Returns a string for simple inline text, an array of
    // elements for complex patterns, or null if there is no pattern at all.
    function parsePattern() {
      let first;
      // First try to parse any simple text on the same line as the id.
      if (test(RE_TEXT_RUN)) {
        first = match1(RE_TEXT_RUN);
      }

      // If there's a placeable on the first line, parse a complex pattern.
      if (source[cursor] === "{" || source[cursor] === "}") {
        // Re-use the text parsed above, if possible.
        return parsePatternElements(first ? [first] : [], Infinity);
      }

      // RE_TEXT_RUN stops at newlines. Only continue parsing the pattern if
      // what comes after the newline is indented.
      let indent = parseIndent();
      if (indent) {
        if (first) {
          // If there's text on the first line, the blank block is part of the
          // translation content in its entirety.
          return parsePatternElements([first, indent], indent.length);
        }
        // Otherwise, we're dealing with a block pattern, i.e. a pattern which
        // starts on a new line. Discard the leading newlines but keep the
        // inline indent; it will be used by the dedentation logic.
        indent.value = trim(indent.value, RE_LEADING_NEWLINES);
        return parsePatternElements([indent], indent.length);
      }

      if (first) {
        // It was just a simple inline text after all.
        return trim(first, RE_TRAILING_SPACES);
      }

      return null;
    }

    // Parse a complex pattern as an array of elements. `elements` holds what
    // has already been parsed; `commonIndent` is the smallest indent seen so
    // far (Infinity if none has been seen yet).
    function parsePatternElements(elements = [], commonIndent) {
      while (true) {
        if (test(RE_TEXT_RUN)) {
          elements.push(match1(RE_TEXT_RUN));
          continue;
        }

        if (source[cursor] === "{") {
          elements.push(parsePlaceable());
          continue;
        }

        if (source[cursor] === "}") {
          throw new SyntaxError("Unbalanced closing brace");
        }

        let indent = parseIndent();
        if (indent) {
          elements.push(indent);
          commonIndent = Math.min(commonIndent, indent.length);
          continue;
        }

        break;
      }

      let lastIndex = elements.length - 1;
      let lastElement = elements[lastIndex];
      // Trim the trailing spaces in the last element if it's a TextElement.
      if (typeof lastElement === "string") {
        elements[lastIndex] = trim(lastElement, RE_TRAILING_SPACES);
      }

      let baked = [];
      for (let element of elements) {
        if (element instanceof Indent) {
          // Dedent indented lines by the maximum common indent.
          element = element.value.slice(0, element.value.length - commonIndent);
        }
        // Drop elements which ended up empty after trimming or dedenting.
        if (element) {
          baked.push(element);
        }
      }
      return baked;
    }

    // Parse a placeable: either an inline expression or a select expression.
    function parsePlaceable() {
      consumeToken(TOKEN_BRACE_OPEN, SyntaxError);

      let selector = parseInlineExpression();
      if (consumeToken(TOKEN_BRACE_CLOSE)) {
        return selector;
      }

      if (consumeToken(TOKEN_ARROW)) {
        let variants = parseVariants();
        consumeToken(TOKEN_BRACE_CLOSE, SyntaxError);
        return {
          type: "select",
          selector,
          // parseVariants returns null when there are no variants; spreading
          // null adds no properties.
          ...variants,
        };
      }

      throw new SyntaxError("Unclosed placeable");
    }

    // Parse a nested placeable, a reference (variable, term, message or
    // function call) or a literal.
    function parseInlineExpression() {
      if (source[cursor] === "{") {
        // It's a nested placeable.
        return parsePlaceable();
      }

      if (test(RE_REFERENCE)) {
        let [, sigil, name, attr = null] = match(RE_REFERENCE);

        if (sigil === "$") {
          return { type: "var", name };
        }

        if (consumeToken(TOKEN_PAREN_OPEN)) {
          let args = parseArguments();

          if (sigil === "-") {
            // A parameterized term: -term(...).
            return { type: "term", name, attr, args };
          }

          if (RE_FUNCTION_NAME.test(name)) {
            return { type: "func", name, args };
          }

          throw new SyntaxError("Function names must be all upper-case");
        }

        if (sigil === "-") {
          // A non-parameterized term: -term.
          return {
            type: "term",
            name,
            attr,
            args: [],
          };
        }

        return { type: "mesg", name, attr };
      }

      return parseLiteral();
    }

    // Parse call arguments up to and including the closing parenthesis.
    function parseArguments() {
      let args = [];
      while (true) {
        switch (source[cursor]) {
          case ")": // End of the argument list.
            cursor++;
            return args;
          case undefined: // EOF
            throw new SyntaxError("Unclosed argument list");
        }

        args.push(parseArgument());
        // Commas between arguments are treated as whitespace.
        consumeToken(TOKEN_COMMA);
      }
    }

    // Parse a single positional or named argument.
    function parseArgument() {
      let expr = parseInlineExpression();
      if (expr.type !== "mesg") {
        return expr;
      }

      if (consumeToken(TOKEN_COLON)) {
        // The reference is the beginning of a named argument.
        return {
          type: "narg",
          name: expr.name,
          value: parseLiteral(),
        };
      }

      // It's a regular message reference.
      return expr;
    }

    // Parse the variants of a select expression. Returns null when there are
    // none, otherwise the variants and the index of the default one.
    function parseVariants() {
      let variants = [];
      let count = 0;
      let star;

      while (test(RE_VARIANT_START)) {
        if (consumeChar("*")) {
          star = count;
        }

        let key = parseVariantKey();
        let value = parsePattern();
        if (value === null) {
          throw new SyntaxError("Expected variant value");
        }
        variants[count++] = { key, value };
      }

      if (count === 0) {
        return null;
      }

      if (star === undefined) {
        throw new SyntaxError("Expected default variant");
      }

      return { variants, star };
    }

    // Parse a variant key: a number literal or an identifier in brackets.
    function parseVariantKey() {
      consumeToken(TOKEN_BRACKET_OPEN, SyntaxError);
      let key;
      if (test(RE_NUMBER_LITERAL)) {
        key = parseNumberLiteral();
      } else {
        key = {
          type: "str",
          value: match1(RE_IDENTIFIER),
        };
      }
      consumeToken(TOKEN_BRACKET_CLOSE, SyntaxError);
      return key;
    }

    // Parse a number or a string literal.
    function parseLiteral() {
      if (test(RE_NUMBER_LITERAL)) {
        return parseNumberLiteral();
      }

      if (source[cursor] === '"') {
        return parseStringLiteral();
      }

      throw new SyntaxError("Invalid expression");
    }

    // Parse a number literal. `precision` is the number of fraction digits as
    // written in the source, so trailing zeros are accounted for.
    function parseNumberLiteral() {
      let [, value, fraction = ""] = match(RE_NUMBER_LITERAL);
      let precision = fraction.length;
      return {
        type: "num",
        value: parseFloat(value),
        precision,
      };
    }

    // Parse a double-quoted string literal, unescaping escape sequences.
    function parseStringLiteral() {
      consumeChar('"', SyntaxError);
      let value = "";
      while (true) {
        value += match1(RE_STRING_RUN);

        if (source[cursor] === "\\") {
          value += parseEscapeSequence();
          continue;
        }

        if (consumeChar('"')) {
          return { type: "str", value };
        }

        // We've reached an EOL or EOF.
        throw new SyntaxError("Unclosed string literal");
      }
    }

    // Unescape known escape sequences.
    function parseEscapeSequence() {
      if (test(RE_STRING_ESCAPE)) {
        return match1(RE_STRING_ESCAPE);
      }

      if (test(RE_UNICODE_ESCAPE)) {
        let [, codepoint4, codepoint6] = match(RE_UNICODE_ESCAPE);
        let codepoint = parseInt(codepoint4 || codepoint6, 16);
        // A Unicode scalar value is any code point up to U+10FFFF which is not
        // a surrogate. The upper bound matters: six hex digits can encode
        // values up to 0xFFFFFF, and String.fromCodePoint throws a RangeError
        // for anything above 0x10FFFF. That error is not a SyntaxError, so it
        // would abort the parsing of the entire resource.
        let isScalarValue =
          codepoint <= 0xd7ff || (0xe000 <= codepoint && codepoint <= 0x10ffff);
        return isScalarValue
          ? String.fromCodePoint(codepoint)
          : // Lonely surrogates can cause trouble when the parsing result is
            // saved using UTF-8, and out-of-range values are not characters
            // at all. Use U+FFFD REPLACEMENT CHARACTER instead.
            "\uFFFD";
      }

      throw new SyntaxError("Unknown escape sequence");
    }

    // Parse blank space. Return it if it looks like indent before a pattern
    // line. Skip it otherwise.
    function parseIndent() {
      let start = cursor;
      consumeToken(TOKEN_BLANK);

      // Check the first non-blank character after the indent.
      switch (source[cursor]) {
        case ".":
        case "[":
        case "*":
        case "}":
        case undefined: // EOF
          // A special character. End the Pattern.
          return false;
        case "{":
          // Placeables don't require indentation (in EBNF: block-placeable).
          // Continue the Pattern.
          return makeIndent(source.slice(start, cursor));
      }

      // If the first character on the line is not one of the special characters
      // listed above, it's a regular text character. Check if there's at least
      // one space of indent before it.
      if (source[cursor - 1] === " ") {
        // It's an indented text character (in EBNF: indented-char). Continue
        // the Pattern.
        return makeIndent(source.slice(start, cursor));
      }

      // A not-indented text character is likely the identifier of the next
      // message. End the Pattern.
      return false;
    }

    // Trim blanks in text according to the given regex.
    function trim(text, re) {
      return text.replace(re, "");
    }

    // Normalize a blank block and extract the indent details.
    function makeIndent(blank) {
      let value = blank.replace(RE_BLANK_LINES, "\n");
      let length = RE_INDENT.exec(blank)[1].length;
      return new Indent(value, length);
    }
  }
}
/**
 * A block of blank space found between two lines of a pattern.
 *
 * Instances are told apart from plain text elements with `instanceof`.
 */
class Indent {
  /**
   * @param value - The normalized text of the blank block.
   * @param length - The measured width of the indent.
   */
  constructor(value, length) {
    // Both fields are plain, writable own properties, created in this order.
    Object.assign(this, { value, length });
  }
}