rtf-stream-parser
Version:
Stream Transform class to tokenize RTF, and another to de-encapsulate text or HTML
214 lines (213 loc) • 7.72 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.ProcessTokens = exports.procTokensDefaultOptions = void 0;
const stream_1 = require("stream");
const decode_1 = require("./decode");
const utils_1 = require("./utils");
/**
 * Fallback `decode` option: converts bytes to text with the value's own
 * `toString(enc)` method.
 *
 * @param {Buffer} buf - Bytes to decode (presumably a Buffer; only `toString` is used).
 * @param {string} enc - Encoding name forwarded unchanged to `toString`.
 * @returns {string} The decoded text.
 */
function defaultStringDecoder(buf, enc) {
    return buf.toString(enc);
}
/**
 * Fallback `encode` option: converts text to bytes with `Buffer.from`.
 *
 * @param {string} str - Text to encode.
 * @param {string} enc - Encoding name forwarded unchanged to `Buffer.from`.
 * @returns {Buffer} The encoded bytes.
 */
function defaultStringEncoder(str, enc) {
    return Buffer.from(str, enc);
}
exports.procTokensDefaultOptions = {
decode: defaultStringDecoder,
encode: defaultStringEncoder,
outputMode: 'string',
replaceSymbolFontChars: false,
warn: console.warn
};
// Font names that isKnownSymbolFont() accepts as symbol fonts even when the
// font entry carries no code page 42 marker. Lookups compare against `true`.
const knownSymbolFontNames = {
    'Wingdings': true,
    'Wingdings 2': true,
    'Wingdings 3': true,
    'Webdings': true,
    'Symbol': true,
};
/**
 * Reports whether a font-table entry describes a symbol font.
 *
 * An entry qualifies when its `fcharsetCpg` or `cpg` equals 42, or when its
 * `fontName` is one of the keys of `knownSymbolFontNames`.
 *
 * @param {object} [thisFont] - Font entry; may be missing.
 * @returns {boolean} `true` for a symbol font, `false` otherwise (including no entry).
 */
function isKnownSymbolFont(thisFont) {
    if (!thisFont) {
        return false;
    }
    if (thisFont.fcharsetCpg === 42 || thisFont.cpg === 42) {
        return true;
    }
    const name = thisFont.fontName || '';
    // Strict comparison keeps inherited members such as `constructor` from matching.
    return knownSymbolFontNames[name] === true;
}
/**
 * Object-mode Transform that consumes tokens and pushes decoded text.
 *
 * Token handling is delegated to the entries of `this._featureHandlers`.
 * That list, together with `this._deff` and `this._fonttbl`, is never assigned
 * inside this class, so it must be supplied elsewhere (presumably by a
 * subclass or by the feature handlers themselves) before tokens arrive.
 */
class ProcessTokens extends stream_1.Transform {
    /**
     * @param {object} [options] - Overrides layered on top of
     *   `exports.procTokensDefaultOptions`.
     */
    constructor(options) {
        super({ writableObjectMode: true, readableObjectMode: true });
        // Initial state. Apart from `_state`, `_cpg` and `_options`, these fields
        // are not read in this class; they appear to exist for the feature handlers.
        this._rootState = { uc: 1, groupDepth: 0, destDepth: 0, destGroupDepth: 0 };
        this._state = this._rootState;
        this._cpg = 1252;
        this._count = 0;
        this._lastLastToken = null;
        this._lastToken = null;
        this._currToken = null;
        this._done = false;
        this._ansicpg = false;
        this._skip = 0;
        this._options = Object.assign(Object.assign({}, exports.procTokensDefaultOptions), options);
        // Bound so the method keeps its `this` when handed around as a bare function.
        this._pushOutput = this._pushOutput.bind(this);
    }
    /** The document-level code page (`1252` until something changes `_cpg`). */
    get defaultCodepage() {
        return this._cpg;
    }
    /**
     * Converts one piece of output data to a string.
     *
     * @param {string|Buffer} data - Text, or raw bytes still to be decoded.
     * @param {object} [font] - Font entry in effect for `data`, if any.
     * @returns {[string, boolean]} The text, and a flag that is `true` when the
     *   text still consists of raw symbol-font code points (no recoding done).
     * @throws {Error} 'text with no codepage' when bytes have a falsy code page
     *   and `options.allowCp0` is not set.
     */
    _getOutputAsString(data, font) {
        let outStr;
        let areSymbolFontCodepoints = false;
        if (font && isKnownSymbolFont(font)) {
            const chunks = [];
            if (utils_1.isStr(data)) {
                for (const c of data) {
                    const codepoint = c.codePointAt(0);
                    if ((codepoint >= 0 && codepoint <= 0xFF) || (codepoint >= 0xF000 && codepoint <= 0xF0FF)) {
                        // Fold U+F000..U+F0FF onto 0x00..0xFF; 0x00..0xFF is unchanged.
                        chunks.push(String.fromCodePoint(codepoint % 0xF000));
                    }
                    else {
                        chunks.push(String.fromCodePoint(codepoint));
                    }
                }
            }
            else {
                // One byte per character, so each byte value becomes a code point.
                chunks.push(data.toString('latin1'));
            }
            const str1 = chunks.join('');
            const fontname = font.fontName;
            const replaceOpt = this._options.replaceSymbolFontChars;
            // The font name comes from the document, so only honour keys the caller
            // actually put on the option object. A bare `replaceOpt[fontname]` would
            // also match inherited members such as `toString` or `constructor`.
            const replaceThisFont = replaceOpt === true
                || (replaceOpt
                    && Object.prototype.hasOwnProperty.call(replaceOpt, fontname)
                    && replaceOpt[fontname]);
            if (fontname && replaceThisFont) {
                const str2 = decode_1.recodeSymbolFontText(str1, fontname, 'keep');
                outStr = str2 || '';
            }
            else {
                outStr = str1;
                areSymbolFontCodepoints = true;
            }
        }
        else if (utils_1.isStr(data)) {
            outStr = data;
        }
        else {
            // Code page precedence: font `cpg`, then font `fcharsetCpg`, then the default.
            const cpg = font
                ? font.cpg || font.fcharsetCpg || this._cpg
                : this._cpg;
            if (cpg === 20127 || cpg === 65001) {
                outStr = data.toString('utf8');
            }
            else if (cpg === 1200) {
                outStr = data.toString('utf16le');
            }
            else if (cpg || this._options.allowCp0) {
                outStr = this._options.decode(data, 'cp' + cpg);
            }
            else {
                throw new Error('text with no codepage');
            }
        }
        return [outStr, areSymbolFontCodepoints];
    }
    /**
     * Pushes text downstream in the form selected by `options.outputMode`.
     *
     * - 'buffer-utf8': a UTF-8 Buffer.
     * - 'buffer-default-cpg' (with an `encode` option present): a Buffer in the
     *   default code page. If `encode` throws, a warning is issued and the text
     *   is dropped.
     * - anything else: the string itself.
     *
     * @param {string} outStr - Text to emit.
     * @param {boolean} areSymbolFontCodepoints - Whether `outStr` holds raw
     *   symbol-font code points; used only in 'buffer-default-cpg' mode.
     */
    _pushOutputData(outStr, areSymbolFontCodepoints) {
        if (this._options.outputMode === 'buffer-utf8') {
            this.push(Buffer.from(outStr, 'utf8'));
        }
        else if (this._options.outputMode === 'buffer-default-cpg' && this._options.encode) {
            if (this._cpg === 20127 || this._cpg === 65001) {
                this.push(Buffer.from(outStr, 'utf8'));
            }
            else if (this._cpg === 1200) {
                this.push(Buffer.from(outStr, 'utf16le'));
            }
            else if (areSymbolFontCodepoints) {
                // Emit the code points as bytes; anything above 0xFF becomes a space.
                const bytes = [];
                for (const c of outStr) {
                    const codepoint = c.charCodeAt(0);
                    if (codepoint > 0xFF) {
                        bytes.push(0x20);
                    }
                    else {
                        bytes.push(codepoint);
                    }
                }
                this.push(Buffer.from(bytes));
            }
            else {
                try {
                    const buf = this._options.encode(outStr, 'cp' + this._cpg);
                    this.push(buf);
                }
                catch (err) {
                    // Best effort: report the failure and skip this piece of text.
                    this._options.warn('Unable to encode to cp' + this._cpg);
                }
            }
        }
        else {
            this.push(outStr);
        }
    }
    /**
     * Looks up the font entry for the current state.
     *
     * @returns {object|undefined} `this._fonttbl[key]`, where `key` is the state's
     *   `font`, else `this._deff`, else ''. Falsy when there is no font table.
     */
    _getCurrentFont() {
        const state = this._state;
        const f = state.font || this._deff || '';
        const finfo = this._fonttbl && this._fonttbl[f];
        return finfo;
    }
    /**
     * Emits output data unless a feature handler claims it.
     *
     * Each handler's `outputDataFilter` is tried in order; the first truthy
     * result stops processing. Otherwise the data is converted with the current
     * font and pushed downstream.
     *
     * @param {string|Buffer} data - Output data produced while handling tokens.
     */
    _pushOutput(data) {
        for (const feature of this._featureHandlers) {
            if (feature.outputDataFilter) {
                const handled = feature.outputDataFilter(this, data);
                if (handled) {
                    return;
                }
            }
        }
        const font = this._getCurrentFont();
        const [outStr, areSymbolFontCodepoints] = this._getOutputAsString(data, font);
        this._pushOutputData(outStr, areSymbolFontCodepoints);
    }
    /**
     * Dispatches one token through the feature handlers in three passes:
     * every `allTokenHandler`, then `tokenHandlers[token.type]`, then (only for
     * `token.type === 2`, presumably the control-word type) `controlHandlers[token.word]`.
     * A truthy handler result ends dispatch for this token.
     *
     * @param {object} token - Token to process.
     * @returns {*} Whatever a handler threw, or `undefined` when nothing was thrown.
     */
    _handleToken(token) {
        try {
            for (const feature of this._featureHandlers) {
                if (feature.allTokenHandler) {
                    const result = feature.allTokenHandler(this, token);
                    if (result) {
                        return;
                    }
                }
            }
            for (const feature of this._featureHandlers) {
                if (feature.tokenHandlers) {
                    const tokenHandler = feature.tokenHandlers[token.type];
                    if (tokenHandler) {
                        const result = tokenHandler(this, token);
                        if (result) {
                            return;
                        }
                    }
                }
            }
            if (token.type === 2) {
                for (const feature of this._featureHandlers) {
                    if (feature.controlHandlers && feature.controlHandlers[token.word]) {
                        const result = feature.controlHandlers[token.word](this, token);
                        if (result) {
                            return;
                        }
                    }
                }
            }
        }
        catch (err) {
            // Returned rather than rethrown so _transform can pass it to the callback.
            return err;
        }
    }
    /**
     * Transform hook: handles one token and reports any handler error.
     *
     * @param {object} token - Incoming token.
     * @param {string} encoding - Unused.
     * @param {Function} cb - Called with the error from `_handleToken`, if any.
     */
    _transform(token, encoding, cb) {
        const error = this._handleToken(token);
        cb(error);
    }
    /**
     * Flush hook: runs every feature's `preStreamFlushHandler`. The first throw
     * stops the loop and is passed to `cb`.
     *
     * @param {Function} cb - Called with the thrown value, or `undefined`.
     */
    _flush(cb) {
        let error;
        try {
            for (const feature of this._featureHandlers) {
                if (feature.preStreamFlushHandler) {
                    feature.preStreamFlushHandler(this);
                }
            }
        }
        catch (err) {
            error = err;
        }
        cb(error);
    }
}
exports.ProcessTokens = ProcessTokens;