UNPKG

rtf-stream-parser

Version:

Stream Transform class to tokenize RTF, and another to de-encapsulate text or HTML

214 lines (213 loc) 7.72 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.ProcessTokens = exports.procTokensDefaultOptions = void 0; const stream_1 = require("stream"); const decode_1 = require("./decode"); const utils_1 = require("./utils"); const defaultStringDecoder = (buf, enc) => buf.toString(enc); const defaultStringEncoder = (str, enc) => Buffer.from(str, enc); exports.procTokensDefaultOptions = { decode: defaultStringDecoder, encode: defaultStringEncoder, outputMode: 'string', replaceSymbolFontChars: false, warn: console.warn }; const knownSymbolFontNames = { Wingdings: true, 'Wingdings 2': true, 'Wingdings 3': true, Webdings: true, Symbol: true, }; function isKnownSymbolFont(thisFont) { return !!thisFont && (thisFont.fcharsetCpg === 42 || thisFont.cpg === 42 || knownSymbolFontNames[thisFont.fontName || ''] === true); } class ProcessTokens extends stream_1.Transform { constructor(options) { super({ writableObjectMode: true, readableObjectMode: true }); this._rootState = { uc: 1, groupDepth: 0, destDepth: 0, destGroupDepth: 0 }; this._state = this._rootState; this._cpg = 1252; this._count = 0; this._lastLastToken = null; this._lastToken = null; this._currToken = null; this._done = false; this._ansicpg = false; this._skip = 0; this._options = Object.assign(Object.assign({}, exports.procTokensDefaultOptions), options); this._pushOutput = this._pushOutput.bind(this); } get defaultCodepage() { return this._cpg; } _getOutputAsString(data, font) { let outStr; let areSymbolFontCodepoints = false; if (font && isKnownSymbolFont(font)) { const chunks = []; if (utils_1.isStr(data)) { for (const c of data) { const codepoint = c.codePointAt(0); if ((codepoint >= 0 && codepoint <= 0xFF) || (codepoint >= 0xF000 && codepoint <= 0xF0FF)) { chunks.push(String.fromCodePoint(codepoint % 0xF000)); } else { chunks.push(String.fromCodePoint(codepoint)); } } } else { chunks.push(data.toString('latin1')); } const str1 = chunks.join(''); const fontname = font.fontName; if (fontname && (this._options.replaceSymbolFontChars === true || (this._options.replaceSymbolFontChars && this._options.replaceSymbolFontChars[fontname]))) { const str2 = decode_1.recodeSymbolFontText(str1, fontname, 'keep'); outStr = str2 || ''; } else { outStr = str1; areSymbolFontCodepoints = true; } } else if (utils_1.isStr(data)) { outStr = data; } else { const cpg = font ? font.cpg || font.fcharsetCpg || this._cpg : this._cpg; if (cpg === 20127 || cpg === 65001) { outStr = data.toString('utf8'); } else if (cpg === 1200) { outStr = data.toString('utf16le'); } else if (cpg || this._options.allowCp0) { outStr = this._options.decode(data, 'cp' + cpg); } else { throw new Error('text with no codepage'); } } return [outStr, areSymbolFontCodepoints]; } _pushOutputData(outStr, areSymbolFontCodepoints) { if (this._options.outputMode === 'buffer-utf8') { this.push(Buffer.from(outStr, 'utf8')); } else if (this._options.outputMode === 'buffer-default-cpg' && this._options.encode) { if (this._cpg === 20127 || this._cpg === 65001) { this.push(Buffer.from(outStr, 'utf8')); } else if (this._cpg === 1200) { this.push(Buffer.from(outStr, 'utf16le')); } else if (areSymbolFontCodepoints) { const bytes = []; for (const c of outStr) { const codepoint = c.charCodeAt(0); if (codepoint > 0xFF) { bytes.push(0x20); } else { bytes.push(codepoint); } } this.push(Buffer.from(bytes)); } else { try { const buf = this._options.encode(outStr, 'cp' + this._cpg); this.push(buf); } catch (err) { this._options.warn('Unable to encode to cp' + this._cpg); } } } else { this.push(outStr); } } _getCurrentFont() { const state = this._state; const f = state.font || this._deff || ''; const finfo = this._fonttbl && this._fonttbl[f]; return finfo; } _pushOutput(data) { for (const feature of this._featureHandlers) { if (feature.outputDataFilter) { const handled = feature.outputDataFilter(this, data); if (handled) { return; } } } const font = this._getCurrentFont(); const [outStr, areSymbolFontCodepoints] = this._getOutputAsString(data, font); this._pushOutputData(outStr, areSymbolFontCodepoints); } _handleToken(token) { try { for (const feature of this._featureHandlers) { if (feature.allTokenHandler) { const result = feature.allTokenHandler(this, token); if (result) { return; } } } for (const feature of this._featureHandlers) { if (feature.tokenHandlers) { const tokenHandler = feature.tokenHandlers[token.type]; if (tokenHandler) { const result = tokenHandler(this, token); if (result) { return; } } } } if (token.type === 2) { for (const feature of this._featureHandlers) { if (feature.controlHandlers && feature.controlHandlers[token.word]) { const result = feature.controlHandlers[token.word](this, token); if (result) { return; } } } } } catch (err) { return err; } } _transform(token, encoding, cb) { const error = this._handleToken(token); cb(error); } _flush(cb) { let error; try { for (const feature of this._featureHandlers) { if (feature.preStreamFlushHandler) { feature.preStreamFlushHandler(this); } } } catch (err) { error = err; } cb(error); } } exports.ProcessTokens = ProcessTokens;