@rpidanny/pdf2md
Version:
A PDF to Markdown Converter
63 lines (58 loc) • 2.11 kB
JavaScript
// @flow
const ToLineItemTransformation = require('../ToLineItemTransformation')
const ParseResult = require('../../ParseResult')
const LineItem = require('../../LineItem')
const Word = require('../../Word')
const { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } = require('../../Annotation')
const BlockType = require('../../markdown/BlockType')
const { isListItemCharacter, isNumberedListItem } = require('../../../util/string-functions')
// Detect items starting with -, •, etc...
module.exports = class DetectListItems extends ToLineItemTransformation {
constructor () {
super('Detect List Items')
}
transform (parseResult /*: ParseResult */) /*: ParseResult */ {
var foundListItems = 0
var foundNumberedItems = 0
parseResult.pages.forEach(page => {
const newItems = []
page.items.forEach(item => {
newItems.push(item)
if (!item.type) {
var text = item.text()
if (isListItemCharacter(item.words[0].string)) {
foundListItems++
if (item.words[0].string === '-') {
item.annotation = DETECTED_ANNOTATION
item.type = BlockType.LIST
} else {
item.annotation = REMOVED_ANNOTATION
const newWords = item.words.map(word => new Word({
...word,
}))
newWords[0].string = '-'
newItems.push(new LineItem({
...item,
words: newWords,
annotation: ADDED_ANNOTATION,
type: BlockType.LIST,
}))
}
} else if (isNumberedListItem(text)) { // TODO check that starts with 1 (kala chakra)
foundNumberedItems++
item.annotation = DETECTED_ANNOTATION
item.type = BlockType.LIST
}
}
})
page.items = newItems
})
return new ParseResult({
...parseResult,
messages: [
'Detected ' + foundListItems + ' plain list items.',
'Detected ' + foundNumberedItems + ' numbered list items.',
],
})
}
}