UNPKG

@rpidanny/pdf2md

Version:
157 lines (143 loc) 4.73 kB
const TextItem = require('./TextItem') const Word = require('./Word') const WordType = require('./markdown/WordType') const WordFormat = require('./markdown/WordFormat') const LineItem = require('./LineItem') const StashingStream = require('./StashingStream') const ParsedElements = require('./ParsedElements') const { isNumber, isListItemCharacter } = require('../util/string-functions') const { sortByX } = require('../util/page-item-functions') // Converts text items which have been grouped to a line (through TextItemLineGrouper) to a single LineItem doing inline transformations like // 'whitespace removal', bold/emphasis annotation, link-detection, etc.. module.exports = class LineConverter { constructor (fontToFormats) { this.fontToFormats = fontToFormats } // returns a CombineResult compact (textItems /*: TextItem[] */) /*: LineItem */ { // we can't trust order of occurence, esp. footnoteLinks like to come last sortByX(textItems) const wordStream = new WordDetectionStream(this.fontToFormats) wordStream.consumeAll(textItems.map(item => new TextItem({ ...item }))) const words = wordStream.complete() var maxHeight = 0 var widthSum = 0 textItems.forEach(item => { maxHeight = Math.max(maxHeight, item.height) widthSum += item.width }) return new LineItem({ x: textItems[0].x, y: textItems[0].y, height: maxHeight, width: widthSum, words: words, parsedElements: new ParsedElements({ footnoteLinks: wordStream.footnoteLinks, footnotes: wordStream.footnotes, containLinks: wordStream.containLinks, formattedWords: wordStream.formattedWords, }), }) } } class WordDetectionStream extends StashingStream { constructor (fontToFormats) { super() this.fontToFormats = fontToFormats this.footnoteLinks = [] this.footnotes = [] this.formattedWords = 0 this.containLinks = false this.stashedNumber = false } shouldStash (item) { // eslint-disable-line no-unused-vars if (!this.firstY) { this.firstY = item.y } this.currentItem = item return true } onPushOnStash (item) { // eslint-disable-line no-unused-vars this.stashedNumber = isNumber(item.text.trim()) } doMatchesStash (lastItem, item) { const lastItemFormat = this.fontToFormats.get(lastItem.font) const itemFormat = this.fontToFormats.get(item.font) if (lastItemFormat !== itemFormat) { return false } const itemIsANumber = isNumber(item.text.trim()) return this.stashedNumber === itemIsANumber } doFlushStash (stash, results) { if (this.stashedNumber) { const joinedNumber = stash.map(item => item.text) .join('') .trim() if (stash[0].y > this.firstY) { // footnote link results.push(new Word({ string: `${joinedNumber}`, type: WordType.FOOTNOTE_LINK, })) this.footnoteLinks.push(parseInt(joinedNumber)) } else if (this.currentItem && this.currentItem.y < stash[0].y) { // footnote results.push(new Word({ string: `${joinedNumber}`, type: WordType.FOOTNOTE, })) this.footnotes.push(joinedNumber) } else { this.copyStashItemsAsText(stash, results) } } else { this.copyStashItemsAsText(stash, results) } } copyStashItemsAsText (stash, results) { const format = this.fontToFormats.get(stash[0].font) results.push(...this.itemsToWords(stash, format)) } itemsToWords (items, formatName) { const combinedText = combineText(items) const words = combinedText.split(' ') const format = formatName ? WordFormat.enumValueOf(formatName) : null return words.filter(w => w.trim().length > 0).map(word => { var type = null if (word.startsWith('http:')) { this.containLinks = true type = WordType.LINK } else if (word.startsWith('www.')) { this.containLinks = true word = `http://${word}` type = WordType.LINK } if (format) { this.formattedWords++ } return new Word({ string: word, type, format }) }) } } function combineText (textItems) { var text = '' var lastItem textItems.forEach(textItem => { var textToAdd = textItem.text if (!text.endsWith(' ') && !textToAdd.startsWith(' ')) { if (lastItem) { const xDistance = textItem.x - lastItem.x - lastItem.width if (xDistance > 5) { text += ' ' } } else { if (isListItemCharacter(textItem.text)) { textToAdd += ' ' } } } text += textToAdd lastItem = textItem }) return text }