@rpidanny/pdf2md
Version:
A PDF to Markdown Converter
76 lines (67 loc) • 2.17 kB
JavaScript
// @flow
const ToLineItemTransformation = require('../ToLineItemTransformation')
const ParseResult = require('../../ParseResult')
const LineItem = require('../../LineItem')
const StashingStream = require('../../StashingStream')
const { REMOVED_ANNOTATION, ADDED_ANNOTATION } = require('../../Annotation')
// Converts vertical text to horizontal
module.exports = class VerticalToHorizontal extends ToLineItemTransformation {
constructor () {
super('Vertical to Horizontal Text')
}
transform (parseResult /*: ParseResult */) /*: ParseResult */ {
var foundVerticals = 0
parseResult.pages.forEach(page => {
const stream = new VerticalsStream()
stream.consumeAll(page.items)
page.items = stream.complete()
foundVerticals += stream.foundVerticals
})
return new ParseResult({
...parseResult,
messages: ['Converted ' + foundVerticals + ' verticals'],
})
}
}
// Stream that stashes single-character line items and, when a stash is flushed,
// merges stashes of more than five items into one combined line item.
// NOTE(review): when the shouldStash / doMatchesStash / doFlushStash hooks are
// invoked is decided by StashingStream, which is not visible here — confirm there.
class VerticalsStream extends StashingStream {
  constructor () {
    super()
    // Number of stashes that were merged into a combined line item so far
    this.foundVerticals = 0
  }

  // An item is a stash candidate when it consists of exactly one word holding exactly one character
  shouldStash (item) {
    return item.words.length === 1 && item.words[0].string.length === 1
  }

  // An item continues the stash when its y is more than 5 below the previous item's y
  // and the first words of both items have the same type
  doMatchesStash (lastItem, item) {
    return lastItem.y - item.y > 5 && lastItem.words[0].type === item.words[0].type
  }

  // Writes the stashed items to `results`.
  // Stashes of up to five items are passed through unchanged. Longer stashes are
  // emitted twice: every original item marked as removed, followed by one added
  // LineItem that carries all their words.
  doFlushStash (stash, results) {
    if (stash.length <= 5) { // add as singles
      results.push(...stash)
      return
    }

    // unite
    const combinedWords = []
    // Seed with +/-Infinity rather than fixed numbers so that coordinates beyond
    // any fixed bound (x > 999, negative y) still yield the true extremes.
    let minX = Infinity
    let maxY = -Infinity
    let sumWidth = 0
    let maxHeight = 0
    for (const oneCharacterLine of stash) {
      // The stashed item itself is re-annotated (mutated in place) before being emitted
      oneCharacterLine.annotation = REMOVED_ANNOTATION
      results.push(oneCharacterLine)
      combinedWords.push(oneCharacterLine.words[0])
      minX = Math.min(minX, oneCharacterLine.x)
      maxY = Math.max(maxY, oneCharacterLine.y)
      sumWidth += oneCharacterLine.width
      maxHeight = Math.max(maxHeight, oneCharacterLine.height)
    }
    // Remaining properties are inherited from the first stashed item
    results.push(new LineItem({
      ...stash[0],
      x: minX,
      y: maxY,
      width: sumWidth,
      height: maxHeight,
      words: combinedWords,
      annotation: ADDED_ANNOTATION,
    }))
    this.foundVerticals++
  }
}