rtf-parser
Version:
This is a general RTF parser. It takes a text stream and produces a document object representing the parsed document. On its own this isn't especially useful, but it is the building block for other tools that convert RTF into other formats.
423 lines (402 loc) • 11 kB
JavaScript
'use strict'
const assert = require('assert')
const util = require('util')
const Writable = require('readable-stream').Writable
const RTFGroup = require('./rtf-group.js')
const RTFParagraph = require('./rtf-paragraph.js')
const RTFSpan = require('./rtf-span.js')
const iconv = require('iconv-lite')
// Code pages accepted by the \ansicpg control word; any other value makes
// ctrl$ansicpg emit an error. The Windows code pages 874, 1255, 1256 and 1258
// are included so that documents declaring them are not rejected here while
// the same encodings are accepted through font charsets (see codeToCP).
const availableCP = [
  437, 737, 775, 850, 852, 853, 855, 857, 858, 860, 861, 863, 865, 866,
  869, 874, 932, 936, 949, 950, 1125,
  1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258
]
// Maps the numeric argument of \fcharset to the encoding name stored on the
// font and later used to decode hex-escaped bytes.
const codeToCP = {
  0: 'ASCII',
  2: 'SYMBOL',
  77: 'MacRoman',
  128: 'SHIFT_JIS',
  129: 'CP949', // Hangul
  130: 'JOHAB',
  134: 'CP936', // GB2312 simplified chinese
  136: 'BIG5',
  161: 'CP1253', // greek
  162: 'CP1254', // turkish
  163: 'CP1258', // vietnamese
  177: 'CP1255', // hebrew (Windows code page, not DOS CP862)
  178: 'CP1256', // arabic
  186: 'CP1257', // baltic
  204: 'CP1251', // russian
  222: 'CP874', // thai
  238: 'CP1250', // eastern european ("CP238" is not a real code page)
  254: 'CP437' // PC-437
}
/**
 * Object-mode Writable that interprets parser commands and populates the
 * document handed to the constructor.
 *
 * Every written command is routed by `cmd.type` to a `cmd$<type>` method
 * (dashes are camel-cased). Control words are routed a second time, by
 * `cmd.value`, to a `ctrl$<word>` method that receives `cmd.param`.
 */
class RTFInterpreter extends Writable {
  /**
   * @param {object} document - receives the content, fonts, colors, styles
   *   and margins produced while interpreting.
   */
  constructor (document) {
    super({objectMode: true})
    this.doc = document
    // NOTE(review): no parseTop method is visible in this class, so this is
    // presumably undefined; kept so the instance shape is unchanged.
    this.parserState = this.parseTop
    this.groupStack = []
    this.group = null
    this.once('prefinish', () => this.finisher())
    // hexchar commands are buffered here and decoded together in flushHexStore
    this.hexStore = []
  }

  /**
   * Dispatches one command to its `cmd$` handler.
   * An unknown command type is reported through the stream callback, which
   * is how Writable implementations are expected to surface errors.
   */
  _write (cmd, encoding, done) {
    const method = 'cmd$' + cmd.type.replace(/-(.)/g, (_, char) => char.toUpperCase())
    if (!this[method]) {
      return done(new Error(`Unknown RTF command ${cmd.type}, tried ${method}`))
    }
    this[method](cmd)
    done()
  }

  /**
   * Runs on 'prefinish': closes any groups still open, then copies onto the
   * document every style property whose value is identical across all
   * content items.
   */
  finisher () {
    while (this.groupStack.length) this.cmd$groupEnd()
    // With no content there is nothing to hoist; returning here avoids
    // overwriting every document style with undefined.
    if (!this.doc.content.length) return
    const initialStyle = this.doc.content[0].style
    for (const prop of Object.keys(this.doc.style)) {
      const shared = this.doc.content.every(para => para.style[prop] === initialStyle[prop])
      if (shared) this.doc.style[prop] = initialStyle[prop]
    }
  }

  // An RTF fragment may be missing the {\rtf1 header, in which case no group
  // has been opened yet; the document itself then acts as the current group.
  ensureGroup () {
    if (!this.group) this.group = this.doc
    return this.group
  }

  // True when the current group is the font table or one of its direct children.
  isInFontTable () {
    return this.group instanceof FontTable || this.group.parent instanceof FontTable
  }

  // Sets the family of the font currently being defined; no-op outside a font table.
  setFontFamily (family) {
    if (this.isInFontTable()) this.group.get('currentFont').family = family
  }

  // Decodes all buffered hex bytes with the group's charset and adds them as one span.
  flushHexStore () {
    if (this.hexStore.length === 0) return
    const hexstr = this.hexStore.map(cmd => cmd.value).join('')
    const group = this.ensureGroup()
    group.addContent(new RTFSpan({
      value: iconv.decode(Buffer.from(hexstr, 'hex'), group.get('charset'))
    }))
    this.hexStore.splice(0)
  }

  cmd$groupStart () {
    this.flushHexStore()
    if (this.group) this.groupStack.push(this.group)
    this.group = new RTFGroup(this.group || this.doc)
  }

  cmd$ignorable () {
    this.flushHexStore()
    this.group.ignorable = true
  }

  cmd$endParagraph () {
    this.flushHexStore()
    this.ensureGroup().addContent(new RTFParagraph())
  }

  /**
   * Closes the current group and hands its result to the enclosing group
   * (or to the document when no group encloses it): font and color tables
   * are stored as `fonts` / `colors`, other non-ignorable groups have their
   * content appended.
   */
  cmd$groupEnd () {
    this.flushHexStore()
    const endingGroup = this.group
    this.group = this.groupStack.pop()
    // A group end with no open group has nothing to close.
    if (!endingGroup) return
    const doc = this.group || this.doc
    if (endingGroup instanceof FontTable) {
      doc.fonts = endingGroup.table
    } else if (endingGroup instanceof ColorTable) {
      doc.colors = endingGroup.table
    } else if (endingGroup !== this.doc && !endingGroup.get('ignorable')) {
      for (const item of endingGroup.content) {
        doc.addContent(item)
      }
      process.emit('debug', 'GROUP END', endingGroup.type, endingGroup.get('ignorable'))
    }
  }

  cmd$text (cmd) {
    this.flushHexStore()
    this.ensureGroup().addContent(new RTFSpan(cmd))
  }

  /**
   * Routes a control word to its `ctrl$` handler. The first control word
   * seen in a group becomes that group's `type`. Unhandled words are only
   * reported as a 'debug' event on `process`.
   */
  cmd$controlWord (cmd) {
    this.flushHexStore()
    const group = this.ensureGroup()
    if (!group.type) group.type = cmd.value
    const method = 'ctrl$' + cmd.value.replace(/-(.)/g, (_, char) => char.toUpperCase())
    if (this[method]) {
      this[method](cmd.param)
    } else if (!this.group.get('ignorable')) {
      process.emit('debug', method, cmd.param)
    }
  }

  cmd$hexchar (cmd) {
    this.hexStore.push(cmd)
  }

  cmd$error (cmd) {
    this.emit('error', new Error('Error: ' + cmd.value + (cmd.row && cmd.col ? ' at line ' + cmd.row + ':' + cmd.col : '') + '.'))
  }

  // From here on the document itself is the current group.
  ctrl$rtf () {
    this.group = this.doc
  }

  // new line
  ctrl$line () {
    this.group.addContent(new RTFSpan({ value: '\n' }))
  }

  // tab
  ctrl$tab () {
    this.group.addContent(new RTFSpan({ value: '\t' }))
  }

  // alignment
  ctrl$qc () {
    this.group.style.align = 'center'
  }

  ctrl$qj () {
    this.group.style.align = 'justify'
  }

  ctrl$ql () {
    this.group.style.align = 'left'
  }

  ctrl$qr () {
    this.group.style.align = 'right'
  }

  // text direction
  ctrl$rtlch () {
    this.group.style.dir = 'rtl'
  }

  ctrl$ltrch () {
    this.group.style.dir = 'ltr'
  }

  // general style
  ctrl$par () {
    this.group.addContent(new RTFParagraph())
  }

  ctrl$pard () {
    this.group.resetStyle()
  }

  // Resets the character formatting tracked here to the document's values.
  ctrl$plain () {
    this.group.style.fontSize = this.doc.getStyle('fontSize')
    this.group.style.bold = this.doc.getStyle('bold')
    this.group.style.italic = this.doc.getStyle('italic')
    this.group.style.underline = this.doc.getStyle('underline')
  }

  // For the on/off toggles below only an explicit 0 parameter turns the style off.
  ctrl$b (set) {
    this.group.style.bold = set !== 0
  }

  ctrl$i (set) {
    this.group.style.italic = set !== 0
  }

  /**
   * Adds the UTF-16 code unit `num` as a span.
   * RTF represents unicode characters as signed 16-bit integers, so negative
   * values are mapped back to their unsigned code unit. Values written
   * unsigned (above 32767) are accepted as they are; writing those with
   * writeInt16LE would throw a RangeError.
   */
  ctrl$u (num) {
    const charBuf = Buffer.alloc(2)
    const codeUnit = num < 0 ? num + 0x10000 : num
    charBuf.writeUInt16LE(codeUnit, 0)
    this.group.addContent(new RTFSpan({value: iconv.decode(charBuf, 'ucs2')}))
  }

  ctrl$super () {
    this.group.style.valign = 'super'
  }

  ctrl$sub () {
    this.group.style.valign = 'sub'
  }

  ctrl$nosupersub () {
    this.group.style.valign = 'normal'
  }

  ctrl$strike (set) {
    this.group.style.strikethrough = set !== 0
  }

  ctrl$ul (set) {
    this.group.style.underline = set !== 0
  }

  ctrl$ulnone (set) {
    this.group.style.underline = false
  }

  // indentation; the cu* variants store their parameter multiplied by 100
  ctrl$fi (value) {
    this.group.style.firstLineIndent = value
  }

  ctrl$cufi (value) {
    this.group.style.firstLineIndent = value * 100
  }

  ctrl$li (value) {
    this.group.style.indent = value
  }

  ctrl$lin (value) {
    this.group.style.indent = value
  }

  ctrl$culi (value) {
    this.group.style.indent = value * 100
  }

  // encodings
  ctrl$ansi () {
    this.group.charset = 'ASCII'
  }

  ctrl$mac () {
    this.group.charset = 'MacRoman'
  }

  ctrl$pc () {
    this.group.charset = 'CP437'
  }

  ctrl$pca () {
    this.group.charset = 'CP850'
  }

  // Emits an 'error' event for code pages that are not listed in availableCP.
  ctrl$ansicpg (codepage) {
    if (availableCP.indexOf(codepage) === -1) {
      this.emit('error', new Error('Codepage ' + codepage + ' is not available.'))
    } else {
      this.group.charset = 'CP' + codepage
    }
  }

  // fonts
  ctrl$fonttbl () {
    this.group = new FontTable(this.group.parent)
  }

  /**
   * Inside a font table this starts the definition of font `num`; anywhere
   * else it selects font `num` for the group and adopts that font's charset
   * unless the charset is missing or plain ASCII.
   */
  ctrl$f (num) {
    if (this.group instanceof FontTable) {
      this.group.currentFont = this.group.table[num] = new Font()
    } else if (this.group.parent instanceof FontTable) {
      this.group.parent.currentFont = this.group.parent.table[num] = new Font()
    } else {
      this.group.style.font = num
      // The font may not be present in the table; then keep the group's charset.
      const font = this.group.getFont(num)
      let fontCharset = font && font.charset
      fontCharset = fontCharset && fontCharset !== 'ASCII' ? fontCharset : this.group.charset// default font charset
      this.group.charset = fontCharset
    }
  }

  // font families (only meaningful while a font table is being read)
  ctrl$fnil () {
    this.setFontFamily('nil')
  }

  ctrl$froman () {
    this.setFontFamily('roman')
  }

  ctrl$fswiss () {
    this.setFontFamily('swiss')
  }

  ctrl$fmodern () {
    this.setFontFamily('modern')
  }

  ctrl$fscript () {
    this.setFontFamily('script')
  }

  ctrl$fdecor () {
    this.setFontFamily('decor')
  }

  ctrl$ftech () {
    this.setFontFamily('tech')
  }

  ctrl$fbidi () {
    this.setFontFamily('bidi')
  }

  /**
   * Sets the charset of the font being defined. Code 1 takes the charset
   * visible from the current group; other codes are looked up in codeToCP.
   * Unknown codes produce an 'error' event.
   */
  ctrl$fcharset (code) {
    if (!this.isInFontTable()) return
    const charset = code === 1 ? this.group.get('charset') : codeToCP[code]
    if (charset == null) {
      return this.emit('error', new Error('Unsupported charset code #' + code))
    }
    this.group.get('currentFont').charset = charset
  }

  ctrl$fprq (pitch) {
    if (this.isInFontTable()) {
      this.group.get('currentFont').pitch = pitch
    }
  }

  // colors
  ctrl$colortbl () {
    this.group = new ColorTable(this.group.parent)
  }

  ctrl$red (value) {
    if (this.group instanceof ColorTable) {
      this.group.red = value
    }
  }

  ctrl$blue (value) {
    if (this.group instanceof ColorTable) {
      this.group.blue = value
    }
  }

  ctrl$green (value) {
    if (this.group instanceof ColorTable) {
      this.group.green = value
    }
  }

  ctrl$cf (value) {
    this.group.style.foreground = value
  }

  ctrl$cb (value) {
    this.group.style.background = value
  }

  ctrl$fs (value) {
    this.group.style.fontSize = value
  }

  // margins
  ctrl$margl (value) {
    this.doc.marginLeft = value
  }

  ctrl$margr (value) {
    this.doc.marginRight = value
  }

  ctrl$margt (value) {
    this.doc.marginTop = value
  }

  ctrl$margb (value) {
    this.doc.marginBottom = value
  }

  // unsupported (and we need to ignore content)
  ctrl$stylesheet (value) {
    this.group.ignorable = true
  }

  ctrl$info (value) {
    this.group.ignorable = true
  }

  ctrl$mmathPr (value) {
    this.group.ignorable = true
  }
}
/**
 * Group that collects font definitions. `table` holds the fonts and
 * `currentFont` is the entry that text content is appended to.
 */
class FontTable extends RTFGroup {
  constructor (parent) {
    super(parent)
    Object.assign(this, {
      table: [],
      // placeholder entry that receives text until a font is assigned
      currentFont: {family: 'roman', charset: 'ASCII', name: 'Serif'}
    })
  }

  // Text inside the font table is the font's name; a trailing ';' (plus any
  // whitespace after it) is stripped before appending.
  addContent (text) {
    const fragment = text.value.replace(/;\s*$/, '')
    const font = this.currentFont
    font.name = font.name + fragment
  }
}
/**
 * Plain record for one font table entry; every field starts out at its
 * empty value and is filled in afterwards.
 */
class Font {
  constructor () {
    Object.assign(this, {
      family: null,
      charset: null,
      name: '',
      pitch: 0
    })
  }
}
/**
 * Group that collects color definitions. The red/blue/green fields hold the
 * components of the entry being read; each ';' text item commits them to
 * `table` and resets them to 0.
 */
class ColorTable extends RTFGroup {
  constructor (parent) {
    super(parent)
    Object.assign(this, {table: [], red: 0, blue: 0, green: 0})
  }

  // The only content accepted in a color table is the ';' separator.
  addContent (text) {
    assert(text.value === ';', 'got: ' + util.inspect(text))
    const {red, blue, green} = this
    this.table.push({red, blue, green})
    Object.assign(this, {red: 0, blue: 0, green: 0})
  }
}
// The interpreter stream class is this module's sole export.
module.exports = RTFInterpreter