fetch-fic
Version: 
Package up delicious, delicious fanfic from various sources into epub ebooks ready for reading in your ereader of choice.
299 lines (290 loc) • 7.51 kB
JavaScript
'use strict'
const iconv = require('iconv-lite')
const Transform = require('stream').Transform
const pump = require('pump')
const normalizeHtml = require('./normalize-html.js')
const concat = require('concat-stream')
const Bluebird = require('bluebird')
class ParseRTF extends Transform {
  constructor () {
    super({objectMode: true})
    this.charset = 'ascii' // 'windows-1252'
    this.char = null
    this.path = []
    this.ctx = null
    this.buffer = ''
    this.cmd = ''
    this.arg = ''
    this.parserState = this.parserGeneral
    this.uc = 1
  }
  _transform (buf, encoding, done) {
    const text = buf.toString('ascii')
    for (let ii = 0; ii < text.length; ++ii) {
      this.parserState(text[ii])
    }
    done()
  }
  _flush (done) {
    this.endStr()
    this.ctx = this.path[0] || this.ctx
    if (this !== this.ctx && this.ctx) {
      this.push(this.ctx)
    }
    done()
  }
  parserGeneral (char) {
    if (char === '{') {
      this.endStr()
      this.path.push(this.ctx)
      if (this.ctx == null) {
        this.ctx = this
      } else {
        this.ctx = doesPushable({command: 'group', args: []})
        this.ctx.uc = this.uc
      }
    } else if (char === '}') {
      this.endStr()
      const finished = this.ctx
      this.ctx = this.path.pop()
      if (this.ctx) this.ctx.push(finished)
    } else if (char === '\\') {
      this.parserState = this.parserCmd0
    } else {
      if (char !== '\n') this.buffer += char
    }
  }
  parserCmd0 (char) {
    if (char === '\n') {
      this.endStr()
      this.ctx.push({command: 'fakeParagraph', args: []})
      this.parserState = this.parserGeneral
    } else if (char === '\'') {
      this.parserState = this.parserHexCharArgs
      this.arg = ''
    } else {
      this.cmd = char
      this.parserState = this.parserCmd1
    }
  }
  parserHexCharArgs (char) {
    this.arg += char
    if (this.arg.length === 2) {
      this.buffer += iconv.decode(new Buffer(this.arg, 'hex'), this.charset)
      this.parserState = this.parserGeneral
    }
  }
  parserCmd1 (char) {
    if (/^[a-zA-Z*]$/.test(char)) {
      this.cmd += char
    } else {
      this.arg = ''
      this.parserState = this.parserCmd2
      this.parserState(char)
    }
  }
  parserCmd2 (char) {
    if (/^[-0-9]$/.test(char)) {
      this.arg += char
    } else {
      if (this.cmd === 'ansicpg') {
        this.charset = 'windows-' + this.arg
      } else if (this.cmd === 'u') {
        var charBuf = Buffer.alloc ? Buffer.alloc(2) : new Buffer(2)
        charBuf.writeUInt16LE(Number(this.arg), 0)
        this.buffer += iconv.decode(charBuf, 'ucs2')
        if (this.ctx.uc) return this.skipChars(this.ctx.uc)
      } else {
        this.endStr()
        this.ctx.push({command: this.cmd, args: [this.arg]})
      }
      this.parserState = this.parserGeneral
      this.parserState(char)
    }
  }
  skipChars (num) {
    if (num > 1) {
      this.parserState = () => this.skipChars(num - 1)
    } else {
      this.parserState = this.parserGeneral
    }
  }
  endStr () {
    if (this.buffer === '') return
    (this.ctx || this).push({command: 'text', args: [this.buffer]})
    this.buffer = ''
  }
}
const Pushable = {
  push (value) {
    this.args.push(value)
  }
}
function doesPushable (obj) {
  return Object.assign(Object.create(Pushable), obj)
}
function evaluate (exp, state, content) {
  return htmlExpression[exp.command] && htmlExpression[exp.command](exp, state, content)
}
// We assume formatting expressions are paired as they have to be in HTML.
// They don't in RTF, so it's entirely possible for valid RTF to produce weird HTML.
// This will likely be cleaned up by the parse/serialize step, but still.
const htmlExpression = {
  group: (exp, state, content) => {
    const group = exp.args.shift()
    return evaluate(group, state, exp.args.map(arg => evaluate(arg, state)))
  },
  fakeParagraph: (exp, state) => {
    if (!state.inPara) return ''
    state.inPara = false
    return '</p>\n'
  },
  text: (exp) => {
    return exp.args[0]
  },
  b: (exp) => {
    if (exp.args[0] === '0') {
      return '</strong>'
    } else {
      return '<strong>'
    }
  },
  i: (exp) => {
    if (exp.args[0] === '0') {
      return '</em>'
    } else {
      return '<em>'
    }
  },
  ul: (exp) => {
    if (exp.args[0] === '0') {
      return '</span>'
    } else {
      return '<span style="text-decoration: underline">'
    }
  },
  ulnone: (exp) => {
    return '</span>'
  },
  field: (exp, state, content) => {
    const args = []
    for (let item of content) {
      args.push.apply(args, item)
    }
    const matchHyperlink = /HYPERLINK "(.*)"/
    const href = args[0].match(matchHyperlink)
    if (href) {
      return `<a href="${href[1]}">${args.slice(1).join('')}</a>`
    } else {
//      console.error('FIELD', args)
      return args[1]
    }
  },
  '*': (exp, state, content) => {
    return content.filter(c => c !== '')
  },
  fldinst: (exp, state, content) => {
    return ''
  },
  fldrslt: (exp, state, content) => {
    return content.filter(c => c !== '')
  },
  fs: (exp, state) => {
    state.style.size = Number(exp.args[0]) / 2
    return ''
  },
  pard: (exp, state) => {
    state.style = Object.assign({}, state.defaultStyle)
    return ''
  },
  qc: (exp, state) => {
    state.style.align = 'center'
    return ''
  },
  ql: (exp, state) => {
    state.style.align = 'left'
    return ''
  },
  qr: (exp, state) => {
    state.style.align = 'right'
    return ''
  },
  qj: (exp, state) => {
    state.style.align = 'justify'
    return ''
  },
  cf: exp => {
// foreground color
    return ''
  },
  cb: exp => {
// background color
    return ''
  }
}
class ToHTML extends Transform {
  constructor () {
    super({objectMode: true})
    this.inPara = null
    this.defaultStyle = {
      size: 10,
      align: 'left'
    }
    this.style = Object.assign({}, this.defaultStyle)
    this.lastStyle = this.styleStr()
  }
  styleStr () {
    let style = []
    if (this.style.size !== this.defaultStyle.size) {
      const em = this.style.size / this.defaultStyle.size
      style.push(`font-size: ${em}em;`)
    }
    if (this.style.align !== this.defaultStyle.align) {
      style.push(`text-align: ${this.style.align};`)
    }
    if (!style.length) return ''
    return ` style="${style.join(' ')}"`
  }
  _transform (exp, encoding, done) {
    const inPara = this.inPara
    const result = evaluate(exp, this)
    if (result) {
      let startP = ''
      if (!inPara) {
        this.inPara = true
        this.lastStyle = this.styleStr()
        startP = `<p${this.lastStyle}>`
      }
      if (/S/.test(result) && this.lastStyle !== this.styleStr()) {
        this.lastStyle = this.styleStr()
        this.push(`<span${this.lastStyle}>${startP}${result}</span>`)
      } else {
        this.push(`${startP}${result}`)
      }
    } else {
//      if (result == null) console.error(exp)
    }
    done()
  }
  _flush (done) {
    if (this.inPara) this.push('</p>\n')
    done()
  }
}
module.exports = function (rtf) {
  return Promise.resolve(rtf).then((rtf) => {
    var parser = new ParseRTF()
    return new Bluebird((resolve, reject) => {
      pump(
        parser,
        new ToHTML(),
        concat(data => {
          const emptyInlineTags = /<(em|strong|span)[^>]*>(\s*)<[/]\1>/g
          const html = normalizeHtml(data).replace(emptyInlineTags, '$2')
          resolve(html)
        }),
        err => err && reject(err))
      parser.end(rtf)
    })
  })
}