UNPKG

gettext-parser

Version:

Parse and compile gettext po and mo files to/from json, nothing more, nothing less

github.com/smhg/gettext-parser

smhg/gettext-parser

637 lines (551 loc) • 16.7 kB

JavaScript

import encoding from 'encoding';
import { formatCharset, parseNPluralFromHeadersSafely, parseHeader } from './shared.js';
import { Transform } from 'readable-stream';
import util from 'util';

/**
 * Options accepted by the PO parser.
 *
 * @typedef {{ defaultCharset?: string, validation?: boolean }} Options
 */

/**
 * Tells whether `key` is an own (non-inherited) property of `table`.
 * Used instead of truthiness / `in` checks so that keys coming from PO data
 * such as "constructor", "toString" or "__proto__" are never mistaken for
 * entries that already exist.
 *
 * @param {Object} table Object used as a dictionary
 * @param {string} key Key to look up
 * @return {boolean} True if `table` has its own property named `key`
 */
function hasOwnEntry (table, key) {
  return Object.prototype.hasOwnProperty.call(table, key);
}

/**
 * Stores `value` under `key` as a regular own data property.
 * For ordinary keys this is the same as `table[key] = value`; for "__proto__"
 * it creates a real entry instead of going through the inherited setter.
 *
 * @param {Object} table Object used as a dictionary
 * @param {string} key Key to write
 * @param {*} value Value to store
 */
function setOwnEntry (table, key, value) {
  Object.defineProperty(table, key, {
    value,
    enumerable: true,
    writable: true,
    configurable: true
  });
}

/**
 * Parses a PO object into translation table
 *
 * @param {string | Buffer} input PO object
 * @param {Options} [options] Optional options with defaultCharset and validation
 * @return {Object} Translation table
 */
export function parse (input, options = {}) {
  const parser = new Parser(input, options);

  return parser.parse();
}

/**
 * Parses a PO stream, emits translation table in object mode
 *
 * @param {Options} [options] Optional options with defaultCharset and validation
 * @param {import('readable-stream').TransformOptions} [transformOptions] Optional stream options
 * @return {PoParserTransform} Transform stream that takes PO bytes and emits one translation table
 */
export function stream (options = {}, transformOptions = {}) {
  return new PoParserTransform(options, transformOptions);
}

/**
 * Creates a PO parser object. If PO object is a string,
 * UTF-8 will be used as the charset
 *
 * @constructor
 * @param {string | Buffer} fileContents PO object
 * @param {Options} [options] Options with defaultCharset and validation
 */
function Parser (fileContents, { defaultCharset = 'iso-8859-1', validation = false } = {}) {
  this._validation = validation;
  this._charset = defaultCharset;

  // lexer state: emitted tokens, pending backslash flag, token being built, FSM state
  this._lex = [];
  this._escaped = false;
  this._node = {};
  this._state = this.states.none;
  this._lineNumber = 1;

  if (typeof fileContents === 'string') {
    this._charset = 'utf-8';
    this._fileContents = fileContents;
  } else {
    this._fileContents = this._handleCharset(fileContents);
  }
}

/**
 * Parses the PO object and returns translation table
 *
 * @return {Object} Translation table
 */
Parser.prototype.parse = function () {
  this._lexer(this._fileContents);

  return this._finalize(this._lex);
};

/**
 * Detects charset for PO strings from the header and decodes the input with it.
 * Updates `this._charset` when the header declares a charset.
 *
 * @param {Buffer} buf Raw PO contents
 * @return {string} PO contents as a string
 */
Parser.prototype._handleCharset = function (buf = '') {
  const str = buf.toString();
  const firstMsgid = str.search(/^\s*msgid/im);
  let headers = '';

  if (firstMsgid >= 0) {
    // The header entry runs from the start of the file up to the next msgid/msgctxt.
    const afterFirst = firstMsgid + 5;
    const nextEntry = str.slice(afterFirst).search(/^\s*(msgid|msgctxt)/im);

    // When there is no further entry the whole input is the header block. The
    // "not found" result must be checked before it is added to the offset,
    // otherwise the header gets cut off right before the charset declaration.
    headers = nextEntry >= 0 ? str.slice(0, afterFirst + nextEntry) : str;
  }

  const match = headers.match(/[; ]charset\s*=\s*([\w-]+)(?:[\s;]|\\n)*"\s*$/mi);

  if (match) {
    this._charset = formatCharset(match[1], this._charset);
  }

  if (this._charset === 'utf-8') {
    return str;
  }

  return this._toString(buf);
};

/**
 * Decodes a buffer using the parser's current charset
 *
 * @param {Buffer} buf Bytes in `this._charset`
 * @return {string} Decoded string
 */
Parser.prototype._toString = function (buf) {
  return encoding.convert(buf, 'utf-8', this._charset).toString('utf-8');
};

/**
 * State constants for parsing FSM
 */
Parser.prototype.states = {
  none: 0x01,
  comments: 0x02,
  key: 0x03,
  string: 0x04,
  obsolete: 0x05
};

/**
 * Value types for lexer
 */
Parser.prototype.types = {
  comments: 0x01,
  key: 0x02,
  string: 0x03,
  obsolete: 0x04
};

/**
 * String matches for lexer
 */
Parser.prototype.symbols = {
  quotes: /["']/,
  comments: /#/,
  whitespace: /\s/,
  key: /[\w\-[\]]/,
  keyNames: /^(?:msgctxt|msgid(?:_plural)?|msgstr(?:\[\d+])?)$/
};

/**
 * Token parser. Parsed state can be found from this._lex.
 * The lexer keeps its state on the instance, so it can be fed consecutive chunks.
 *
 * @param {String} chunk String
 * @throws {SyntaxError} If a key token is not a valid gettext keyword
 */
Parser.prototype._lexer = function (chunk) {
  let chr;

  for (let i = 0, len = chunk.length; i < len; i++) {
    chr = chunk.charAt(i);

    if (chr === '\n') {
      this._lineNumber += 1;
    }

    switch (this._state) {
      case this.states.none:
      case this.states.obsolete:
        if (chr.match(this.symbols.quotes)) {
          this._node = {
            type: this.types.string,
            value: '',
            quote: chr
          };
          this._lex.push(this._node);
          this._state = this.states.string;
        } else if (chr.match(this.symbols.comments)) {
          this._node = {
            type: this.types.comments,
            value: ''
          };
          this._lex.push(this._node);
          this._state = this.states.comments;
        } else if (!chr.match(this.symbols.whitespace)) {
          this._node = {
            type: this.types.key,
            value: chr
          };
          if (this._state === this.states.obsolete) {
            this._node.obsolete = true;
          }
          this._lex.push(this._node);
          this._state = this.states.key;
        }
        break;

      case this.states.comments:
        if (chr === '\n') {
          this._state = this.states.none;
        } else if (chr === '~' && this._node.value === '') {
          // "#~" marks an obsolete entry: what follows is lexed as regular tokens
          this._node.value += chr;
          this._state = this.states.obsolete;
        } else if (chr !== '\r') {
          this._node.value += chr;
        }
        break;

      case this.states.string:
        if (this._escaped) {
          switch (chr) {
            case 't':
              this._node.value += '\t';
              break;
            case 'n':
              this._node.value += '\n';
              break;
            case 'r':
              this._node.value += '\r';
              break;
            default:
              this._node.value += chr;
          }
          this._escaped = false;
        } else {
          if (chr === this._node.quote) {
            this._state = this.states.none;
          } else if (chr === '\\') {
            this._escaped = true;
            break;
          } else {
            this._node.value += chr;
          }
          this._escaped = false;
        }
        break;

      case this.states.key:
        if (!chr.match(this.symbols.key)) {
          if (!this._node.value.match(this.symbols.keyNames)) {
            const err = new SyntaxError(`Error parsing PO data: Invalid key name "${this._node.value}" at line ${this._lineNumber}. This can be caused by an unescaped quote character in a msgid or msgstr value.`);

            err.lineNumber = this._lineNumber;

            throw err;
          }
          this._state = this.states.none;
          // The terminating character is handed back to the `none` state and
          // will be read again; undo its newline count so it is not counted twice.
          if (chr === '\n') {
            this._lineNumber -= 1;
          }
          i--;
        } else {
          this._node.value += chr;
        }
        break;
    }
  }
};

/**
 * Join multi line strings
 *
 * @param {Object} tokens Parsed tokens
 * @return {Object} Parsed tokens, with multi line strings joined into one
 */
Parser.prototype._joinStringValues = function (tokens) {
  const response = [];
  let lastNode;

  for (let i = 0, len = tokens.length; i < len; i++) {
    if (lastNode && tokens[i].type === this.types.string && lastNode.type === this.types.string) {
      lastNode.value += tokens[i].value;
    } else if (lastNode && tokens[i].type === this.types.comments && lastNode.type === this.types.comments) {
      lastNode.value += '\n' + tokens[i].value;
    } else {
      response.push(tokens[i]);
      lastNode = tokens[i];
    }
  }

  return response;
};

/**
 * Parse comments into separate comment blocks.
 * Replaces the string value of every comment token with an object holding the
 * non-empty groups among translator, extracted, reference, flag and previous.
 *
 * @param {Object} tokens Parsed tokens
 */
Parser.prototype._parseComments = function (tokens) {
  tokens.forEach(node => {
    if (!node || node.type !== this.types.comments) {
      return;
    }

    const comment = {
      translator: [],
      extracted: [],
      reference: [],
      flag: [],
      previous: []
    };

    const lines = (node.value || '').split(/\n/);

    // the first character after "#" selects the comment group
    lines.forEach(line => {
      switch (line.charAt(0) || '') {
        case ':':
          comment.reference.push(line.slice(1).trim());
          break;
        case '.':
          comment.extracted.push(line.slice(1).replace(/^\s+/, ''));
          break;
        case ',':
          comment.flag.push(line.slice(1).replace(/^\s+/, ''));
          break;
        case '|':
          comment.previous.push(line.slice(1).replace(/^\s+/, ''));
          break;
        case '~':
          break;
        default:
          comment.translator.push(line.replace(/^\s+/, ''));
      }
    });

    node.value = {};

    Object.keys(comment).forEach(key => {
      if (comment[key] && comment[key].length) {
        node.value[key] = comment[key].join('\n');
      }
    });
  });
};

/**
 * Join gettext keys with values
 *
 * @param {Object} tokens Parsed tokens
 * @return {Object} Tokens
 */
Parser.prototype._handleKeys = function (tokens) {
  const response = [];
  let lastNode;

  for (let i = 0, len = tokens.length; i < len; i++) {
    if (tokens[i].type === this.types.key) {
      lastNode = {
        key: tokens[i].value
      };
      if (tokens[i].obsolete) {
        lastNode.obsolete = true;
      }
      // a comment block directly preceding the key belongs to it
      if (i && tokens[i - 1].type === this.types.comments) {
        lastNode.comments = tokens[i - 1].value;
      }
      lastNode.value = '';
      response.push(lastNode);
    } else if (tokens[i].type === this.types.string && lastNode) {
      lastNode.value += tokens[i].value;
    }
  }

  return response;
};

/**
 * Separate different values into individual translation objects.
 * A `msgid_plural` or `msgstr` that is not preceded by any `msgid` is ignored.
 *
 * @param {Object} tokens Parsed tokens
 * @return {Object} Tokens
 * @throws {SyntaxError} With validation enabled, if an entry declares msgid_plural more than once
 */
Parser.prototype._handleValues = function (tokens) {
  const response = [];
  let lastNode;
  let curContext;
  let curComments;

  for (const token of tokens) {
    const key = token.key.toLowerCase();

    if (key === 'msgctxt') {
      // remembered until the msgid that follows
      curContext = token.value;
      curComments = token.comments;
    } else if (key === 'msgid') {
      lastNode = {
        msgid: token.value
      };
      if (token.obsolete) {
        lastNode.obsolete = true;
      }
      if (curContext) {
        lastNode.msgctxt = curContext;
      }
      if (curComments) {
        lastNode.comments = curComments;
      }
      if (token.comments && !lastNode.comments) {
        lastNode.comments = token.comments;
      }
      curContext = false;
      curComments = false;
      response.push(lastNode);
    } else if (key === 'msgid_plural') {
      if (lastNode) {
        if (this._validation && 'msgid_plural' in lastNode) {
          throw new SyntaxError(`Multiple msgid_plural error: entry "${lastNode.msgid}" in "${lastNode.msgctxt || ''}" context has multiple msgid_plural declarations.`);
        }

        lastNode.msgid_plural = token.value;

        if (token.comments && !lastNode.comments) {
          lastNode.comments = token.comments;
        }
      }
      curContext = false;
      curComments = false;
    } else if (token.key.slice(0, 6).toLowerCase() === 'msgstr') {
      if (lastNode) {
        lastNode.msgstr = (lastNode.msgstr || []).concat(token.value);

        if (token.comments && !lastNode.comments) {
          lastNode.comments = token.comments;
        }
      }
      curContext = false;
      curComments = false;
    }
  }

  return response;
};

/**
 * Validate token. Does nothing unless validation is enabled.
 *
 * @param {Object} token Parsed token
 * @param {Object} translations Translation table
 * @param {string} msgctxt Message entry context
 * @param {number} nplurals Number of expected plural forms
 * @throws Will throw an error if token validation fails
 */
Parser.prototype._validateToken = function (
  {
    msgid = '',
    msgid_plural = '', // eslint-disable-line camelcase
    msgstr = []
  },
  translations,
  msgctxt,
  nplurals
) {
  if (!this._validation) {
    return;
  }

  // own-property check: a msgid like "toString" is not a duplicate of an inherited member
  if (hasOwnEntry(translations[msgctxt], msgid)) {
    throw new SyntaxError(`Duplicate msgid error: entry "${msgid}" in "${msgctxt}" context has already been declared.`);
  // eslint-disable-next-line camelcase
  } else if (msgid_plural && msgstr.length !== nplurals) {
    // eslint-disable-next-line camelcase
    throw new RangeError(`Plural forms range error: Expected to find ${nplurals} forms but got ${msgstr.length} for entry "${msgid_plural}" in "${msgctxt}" context.`);
  // eslint-disable-next-line camelcase
  } else if (!msgid_plural && msgstr.length !== 1) {
    throw new RangeError(`Translation string range error: Extected 1 msgstr definitions associated with "${msgid}" in "${msgctxt}" context, found ${msgstr.length}.`);
  }
};

/**
 * Compose a translation table from tokens object
 *
 * @param {Object} tokens Parsed tokens
 * @return {Object} Translation table
 */
Parser.prototype._normalize = function (tokens) {
  const table = {
    charset: this._charset,
    headers: undefined,
    translations: {}
  };
  let nplurals = 1;

  for (const token of tokens) {
    const msgctxt = token.msgctxt || '';

    if (token.obsolete) {
      if (!table.obsolete) {
        table.obsolete = {};
      }

      if (!hasOwnEntry(table.obsolete, msgctxt)) {
        setOwnEntry(table.obsolete, msgctxt, {});
      }

      delete token.obsolete;

      setOwnEntry(table.obsolete[msgctxt], token.msgid, token);

      continue;
    }

    // Contexts and msgids come from the PO data. Checking and writing them as
    // own properties keeps e.g. msgctxt "__proto__" from landing on Object.prototype.
    if (!hasOwnEntry(table.translations, msgctxt)) {
      setOwnEntry(table.translations, msgctxt, {});
    }

    // the first entry with empty context and empty msgid carries the headers
    if (!table.headers && !msgctxt && !token.msgid) {
      table.headers = parseHeader(token.msgstr[0]);
      nplurals = parseNPluralFromHeadersSafely(table.headers, nplurals);
    }

    this._validateToken(token, table.translations, msgctxt, nplurals);

    setOwnEntry(table.translations[msgctxt], token.msgid, token);
  }

  return table;
};

/**
 * Converts parsed tokens to a translation table
 *
 * @param {Object} tokens Parsed tokens
 * @returns {Object} Translation table
 */
Parser.prototype._finalize = function (tokens) {
  let data = this._joinStringValues(tokens);

  this._parseComments(data);

  data = this._handleKeys(data);
  data = this._handleValues(data);

  return this._normalize(data);
};

/**
 * Creates a transform stream for parsing PO input.
 * The writable side takes bytes, the readable side is in object mode.
 *
 * @constructor
 * @param {Options} [options] Optional options with defaultCharset and validation
 * @param {import('readable-stream').TransformOptions} [transformOptions] Optional stream options;
 *   `initialTreshold` sets how many bytes are buffered before the parser is created (default 2048)
 */
function PoParserTransform (options = {}, transformOptions = {}) {
  this.options = options;
  this._parser = false;
  this._tokens = {};

  this._cache = [];
  this._cacheSize = 0;

  this.initialTreshold = transformOptions.initialTreshold || 2 * 1024;

  Transform.call(this, transformOptions);
  this._writableState.objectMode = false;
  this._readableState.objectMode = true;
}
util.inherits(PoParserTransform, Transform);

/**
 * Processes a chunk of the input stream.
 *
 * Input is buffered until `initialTreshold` bytes have arrived, then a Parser is
 * created from the buffered data (which is when the charset is detected from the
 * headers). Every chunk is run through the parser's lexer; lexer errors are
 * reported through `done`.
 *
 * @param {Buffer} chunk Input bytes
 * @param {string} encoding Chunk encoding supplied by the stream (not used)
 * @param {Function} done Callback to signal that the chunk has been consumed
 */
PoParserTransform.prototype._transform = function (chunk, encoding, done) {
  if (!chunk || !chunk.length) {
    return done();
  }

  // Joins the cached buffers into one and empties the cache
  const drainCache = () => {
    const joined = Buffer.concat(this._cache, this._cacheSize);

    this._cacheSize = 0;
    this._cache = [];

    return joined;
  };

  if (!this._parser) {
    this._cache.push(chunk);
    this._cacheSize += chunk.length;

    // wait until `initialTreshold` bytes are available before parsing headers for charset
    if (this._cacheSize < this.initialTreshold) {
      return setImmediate(done);
    }

    if (this._cacheSize) {
      chunk = drainCache();
    }

    this._parser = new Parser(chunk, this.options);
  } else if (this._cacheSize) {
    // this only happens if we had an uncompleted 8bit sequence from the last iteration
    this._cache.push(chunk);
    this._cacheSize += chunk.length;
    chunk = drainCache();
  }

  // find where the trailing run of 8bit bytes starts
  // helps if the chunk ends in the middle of an utf-8 sequence
  let tailStart = chunk.length;

  while (tailStart > 0 && chunk[tailStart - 1] >= 0x80) {
    tailStart--;
  }

  // it seems we found some 8bit bytes from the end of the chunk, so let's cache these
  if (tailStart < chunk.length) {
    this._cache = [chunk.slice(tailStart)];
    this._cacheSize = this._cache[0].length;
    chunk = chunk.slice(0, tailStart);
  }

  // chunk might be empty if it only consisted of 8bit bytes and these were all cached
  if (chunk.length) {
    try {
      this._parser._lexer(this._parser._toString(chunk));
    } catch (error) {
      setImmediate(() => {
        done(error);
      });

      return;
    }
  }

  setImmediate(done);
};

/**
 * Once all input has been processed emit the parsed translation table as an object.
 * Errors raised while lexing the remaining bytes or while building the table
 * (which is where validation errors are thrown) are reported through `done`.
 *
 * @param {Function} done Callback to signal that flushing has finished
 */
PoParserTransform.prototype._flush = function (done) {
  let chunk;
  let table;

  if (this._cacheSize) {
    chunk = Buffer.concat(this._cache, this._cacheSize);
  }

  // input never reached the initial threshold, so the parser does not exist yet
  if (!this._parser && chunk) {
    this._parser = new Parser(chunk, this.options);
  }

  try {
    if (chunk) {
      this._parser._lexer(this._parser._toString(chunk));
    }

    if (this._parser) {
      table = this._parser._finalize(this._parser._lex);
    }
  } catch (error) {
    setImmediate(() => {
      done(error);
    });

    return;
  }

  if (this._parser) {
    this.push(table);
  }

  setImmediate(done);
};