@rpidanny/pdf2md
Version:
A PDF to Markdown Converter
352 lines (323 loc) • 11.6 kB
JavaScript
// @flow
const ToLineItemTransformation = require('../ToLineItemTransformation')
const ParseResult = require('../../ParseResult')
const LineItem = require('../../LineItem')
const Word = require('../../Word')
const HeadlineFinder = require('../../HeadlineFinder')
const { REMOVED_ANNOTATION, ADDED_ANNOTATION } = require('../../Annotation')
const BlockType = require('../../markdown/BlockType')
const { headlineByLevel } = require('../../markdown/BlockType')
const { isDigit, isNumber, wordMatch, hasOnly } = require('../../../util/string-functions')
// Detect table of contents pages plus linked headlines
module.exports = class DetectTOC extends ToLineItemTransformation {
constructor () {
super('Detect TOC')
}
transform (parseResult /*: ParseResult */) /*: ParseResult */ {
const tocPages = []
const maxPagesToEvaluate = Math.min(20, parseResult.pages.length)
const linkLeveler = new LinkLeveler()
var tocLinks = []
var lastTocPage
var headlineItem
parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
var lineItemsWithDigits = 0
const unknownLines = new Set()
const pageTocLinks = []
var lastWordsWithoutNumber
var lastLine
// find lines with words containing only "." ...
const tocLines = page.items.filter(line => line.words.includes(word => hasOnly(word.string, '.')))
// ... and ending with a number per page
tocLines.forEach(line => {
var words = line.words.filter(word => !hasOnly(word.string, '.'))
const digits = []
while (words.length > 0 && isNumber(words[words.length - 1].string)) {
const lastWord = words.pop()
digits.unshift(lastWord.string)
}
if (digits.length === 0 && words.length > 0) {
const lastWord = words[words.length - 1]
while (isDigit(lastWord.string.charCodeAt(lastWord.string.length - 1))) {
digits.unshift(lastWord.string.charAt(lastWord.string.length - 1))
lastWord.string = lastWord.string.substring(0, lastWord.string.length - 1)
}
}
var endsWithDigit = digits.length > 0
if (endsWithDigit) {
endsWithDigit = true
if (lastWordsWithoutNumber) { // 2-line item ?
words.push(...lastWordsWithoutNumber)
lastWordsWithoutNumber = null
}
pageTocLinks.push(new TocLink({
pageNumber: parseInt(digits.join('')),
lineItem: new LineItem({ ...line, words }),
}))
lineItemsWithDigits++
} else {
if (!headlineItem) {
headlineItem = line
} else {
if (lastWordsWithoutNumber) {
unknownLines.add(lastLine)
}
lastWordsWithoutNumber = words
lastLine = line
}
}
})
// page has been processed
if (lineItemsWithDigits * 100 / page.items.length > 75) {
tocPages.push(page.index + 1)
lastTocPage = page
linkLeveler.levelPageItems(pageTocLinks)
tocLinks.push(...pageTocLinks)
const newBlocks = []
page.items.forEach((line) => {
if (!unknownLines.has(line)) {
line.annotation = REMOVED_ANNOTATION
}
newBlocks.push(line)
if (line === headlineItem) {
newBlocks.push(new LineItem({
...line,
type: BlockType.H2,
annotation: ADDED_ANNOTATION,
}))
}
})
page.items = newBlocks
} else {
headlineItem = null
}
})
// all pages have been processed
var foundHeadlines = tocLinks.length
const notFoundHeadlines = []
const foundBySize = []
const headlineTypeToHeightRange = {} // H1={min:23, max:25}
if (tocPages.length > 0) {
// Add TOC items
tocLinks.forEach(tocLink => {
lastTocPage.items.push(new LineItem({
words: [new Word({
string: ' '.repeat(tocLink.level * 3) + '-',
})].concat(tocLink.lineItem.words),
type: BlockType.TOC,
annotation: ADDED_ANNOTATION,
}))
})
// Add linked headers
const pageMapping = detectPageMappingNumber(parseResult.pages.filter(page => page.index > lastTocPage.index), tocLinks)
tocLinks.forEach(tocLink => {
var linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping]
var foundHealineItems
if (linkedPage) {
foundHealineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text())
if (!foundHealineItems) { // pages are off by 1 ?
linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping + 1]
if (linkedPage) {
foundHealineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text())
}
}
}
if (foundHealineItems) {
addHeadlineItems(linkedPage, tocLink, foundHealineItems, headlineTypeToHeightRange)
} else {
notFoundHeadlines.push(tocLink)
}
})
// Try to find linked headers by height
var fromPage = lastTocPage.index + 2
var lastNotFound = []
const rollupLastNotFound = (currentPageNumber) => {
if (lastNotFound.length > 0) {
lastNotFound.forEach(notFoundTocLink => {
const headlineType = headlineByLevel(notFoundTocLink.level + 2)
const heightRange = headlineTypeToHeightRange[headlineType.name]
if (heightRange) {
const [pageIndex, lineIndex] = findPageAndLineFromHeadline(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber)
if (lineIndex > -1) {
const page = parseResult.pages[pageIndex]
page.items[lineIndex].annotation = REMOVED_ANNOTATION
page.items.splice(lineIndex + 1, 0, new LineItem({
...notFoundTocLink.lineItem,
type: headlineType,
annotation: ADDED_ANNOTATION,
}))
foundBySize.push(notFoundTocLink)
}
}
})
lastNotFound = []
}
}
if (notFoundHeadlines.length > 0) {
tocLinks.forEach(tocLink => {
if (notFoundHeadlines.includes(tocLink)) {
lastNotFound.push(tocLink)
} else {
rollupLastNotFound(tocLink.pageNumber)
fromPage = tocLink.pageNumber
}
})
if (lastNotFound.length > 0) {
rollupLastNotFound(parseResult.pages.length)
}
}
}
const messages = []
messages.push('Detected ' + tocPages.length + ' table of content pages')
if (tocPages.length > 0) {
messages.push('TOC headline heights: ' + JSON.stringify(headlineTypeToHeightRange))
messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines)
}
if (notFoundHeadlines.length > 0) {
messages.push('Found TOC headlines (by size): ' + foundBySize.map(tocLink => tocLink.lineItem.text()))
messages.push('Missing TOC headlines: ' + notFoundHeadlines.filter(fTocLink => !foundBySize.includes(fTocLink)).map(tocLink => tocLink.lineItem.text() + '=>' + tocLink.pageNumber))
}
return new ParseResult({
...parseResult,
globals: {
...parseResult.globals,
tocPages,
headlineTypeToHeightRange,
},
messages,
})
}
}
/**
 * Finds out how a page number printed in the TOC translates to `page.index`.
 *
 * The TOC links are tried in order; for the first one whose headline text is
 * located on one of `pages`, the difference between that page's `index` and
 * the link's `pageNumber` is returned.
 *
 * @param pages    pages to search for the headlines
 * @param tocLinks TOC links providing `lineItem.text()` and `pageNumber`
 * @returns the offset to add to a TOC page number, or null when none of the
 *          links could be located
 */
function detectPageMappingNumber (pages, tocLinks) {
  for (const link of tocLinks) {
    const headlinePage = findPageWithHeadline(pages, link.lineItem.text())
    if (!headlinePage) continue
    return headlinePage.index - link.pageNumber
  }
  return null
}
/**
 * Returns the first page on which the given headline text can be found.
 *
 * @param pages    pages to search, in order
 * @param headline headline text to look for
 * @returns the first page for which findHeadlineItems reports a match, or
 *          null when no page matches
 */
function findPageWithHeadline (pages, headline) {
  for (let i = 0; i < pages.length; i++) {
    const candidate = pages[i]
    if (findHeadlineItems(candidate, headline)) {
      return candidate
    }
  }
  return null
}
/**
 * Searches a page for the line items that make up the given headline.
 *
 * The page's lines are fed one by one into a fresh HeadlineFinder; the search
 * stops at the first line for which `consume` returns a truthy value.
 *
 * @param page     page whose `items` are scanned in order
 * @param headline headline text handed to the HeadlineFinder
 * @returns `{ lineIndex, headlineItems }`, where `lineIndex` is the index of
 *          the line that completed the match and `headlineItems` is the value
 *          returned by the finder, or null when nothing matched
 */
function findHeadlineItems (page, headline) {
  const finder = new HeadlineFinder({ headline })
  const lines = page.items
  for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
    const headlineItems = finder.consume(lines[lineIndex])
    if (headlineItems) {
      return { lineIndex, headlineItems }
    }
  }
  return null
}
/**
 * Replaces the line items found for a TOC entry with a single headline item.
 *
 * The found items are annotated as removed, and a new LineItem is inserted
 * right after `foundItems.lineIndex`. It is based on the first found item,
 * carries the TOC entry's words, the largest height of the found items and
 * the headline type for `tocLink.level + 2`. The height is also merged into
 * the min/max range recorded for that headline type.
 *
 * @param page                      page whose `items` receive the new line
 * @param tocLink                   TOC link supplying `level` and `lineItem.words`
 * @param foundItems                `{ lineIndex, headlineItems }` describing the match
 * @param headlineTypeToHeightRange map of headline type name to `{ min, max }`,
 *                                  updated in place
 */
function addHeadlineItems (page, tocLink, foundItems, headlineTypeToHeightRange) {
  const { headlineItems } = foundItems
  for (const item of headlineItems) {
    item.annotation = REMOVED_ANNOTATION
  }
  const headlineType = headlineByLevel(tocLink.level + 2)
  let headlineHeight = 0
  for (const item of headlineItems) {
    headlineHeight = Math.max(headlineHeight, item.height)
  }
  const headlineLine = new LineItem({
    ...headlineItems[0],
    words: tocLink.lineItem.words,
    height: headlineHeight,
    type: headlineType,
    annotation: ADDED_ANNOTATION,
  })
  page.items.splice(foundItems.lineIndex + 1, 0, headlineLine)
  const knownRange = headlineTypeToHeightRange[headlineType.name]
  if (!knownRange) {
    // first headline of this type: the range starts as a single point
    headlineTypeToHeightRange[headlineType.name] = {
      min: headlineHeight,
      max: headlineHeight,
    }
    return
  }
  knownRange.min = Math.min(knownRange.min, headlineHeight)
  knownRange.max = Math.max(knownRange.max, headlineHeight)
}
/**
 * Looks for a line that could be the headline of a TOC entry, judged by line
 * height and word overlap instead of an exact text match.
 *
 * Pages are addressed by 1-based page number (`pages[number - 1]`); missing
 * pages are skipped. A line qualifies when it has no type and no annotation,
 * its height lies within `heightRange` (inclusive) and `wordMatch` against
 * the upper-cased TOC text yields at least 0.5.
 *
 * @param pages       all pages
 * @param tocLink     TOC link whose `lineItem.text()` is searched for
 * @param heightRange `{ min, max }` of acceptable line heights
 * @param fromPage    first page number to search (inclusive)
 * @param toPage      last page number to search (inclusive)
 * @returns `[pageIndex, lineIndex]` of the first qualifying line (with a
 *          0-based `pageIndex`), or `[-1, -1]` when there is none
 */
function findPageAndLineFromHeadline (pages, tocLink, heightRange, fromPage, toPage) {
  const linkText = tocLink.lineItem.text().toUpperCase()
  const hasHeadlineHeight = line => line.height >= heightRange.min && line.height <= heightRange.max
  const looksLikeHeadline = line =>
    !line.type && !line.annotation && hasHeadlineHeight(line) && wordMatch(linkText, line.text()) >= 0.5

  for (let pageNumber = fromPage; pageNumber <= toPage; pageNumber++) {
    const pageIndex = pageNumber - 1
    const page = pages[pageIndex]
    if (!page) continue
    const lineIndex = page.items.findIndex(looksLikeHeadline)
    if (lineIndex > -1) {
      return [pageIndex, lineIndex]
    }
  }
  return [-1, -1]
}
/**
 * Assigns a nesting level to TOC links.
 *
 * The strategy is chosen once, from the first batch of links passed to
 * `levelPageItems`, and reused for all later batches:
 * - by x position when the links use more than one distinct `lineItem.x`,
 * - otherwise by font when they use more than one distinct `lineItem.font`,
 * - otherwise every link gets level 0.
 */
class LinkLeveler {
  constructor () {
    this.levelByMethod = null
    this.uniqueFonts = []
  }

  /**
   * Sets `level` on every link of one page, choosing the levelling strategy
   * on the first call.
   */
  levelPageItems (tocLinks /*: TocLink[] */) {
    if (!this.levelByMethod) {
      const uniqueX = this.calculateUniqueX(tocLinks)
      if (uniqueX.length > 1) {
        this.levelByMethod = this.levelByXDiff
      } else {
        const uniqueFonts = this.calculateUniqueFonts(tocLinks)
        if (uniqueFonts.length > 1) {
          this.uniqueFonts = uniqueFonts
          this.levelByMethod = this.levelByFont
        } else {
          this.levelByMethod = this.levelToZero
        }
      }
    }
    this.levelByMethod(tocLinks)
  }

  /** Level = rank of the link's x position among the distinct x values of this batch. */
  levelByXDiff (tocLinks) {
    const uniqueX = this.calculateUniqueX(tocLinks)
    tocLinks.forEach(link => {
      link.level = uniqueX.indexOf(link.lineItem.x)
    })
  }

  /**
   * Level = position of the link's font in the list of known fonts.
   *
   * A font that was not present in the batch that fixed the strategy is
   * appended to the list and gets the next level. Without this, indexOf would
   * yield -1, and a negative level breaks consumers that derive an indent
   * from it (String#repeat throws a RangeError for a negative count).
   */
  levelByFont (tocLinks) {
    tocLinks.forEach(link => {
      const font = link.lineItem.font
      let level = this.uniqueFonts.indexOf(font)
      if (level < 0) {
        level = this.uniqueFonts.push(font) - 1
      }
      link.level = level
    })
  }

  /** Flat TOC: every link is level 0. */
  levelToZero (tocLinks) {
    tocLinks.forEach(link => {
      link.level = 0
    })
  }

  /** Returns the distinct `lineItem.x` values of the links, sorted ascending. */
  calculateUniqueX (tocLinks) {
    const uniqueX = [...new Set(tocLinks.map(link => link.lineItem.x))]
    uniqueX.sort((a, b) => a - b)
    return uniqueX
  }

  /** Returns the distinct `lineItem.font` values of the links, in order of first appearance. */
  calculateUniqueFonts (tocLinks) {
    return [...new Set(tocLinks.map(link => link.lineItem.font))]
  }
}
/**
 * One entry of the table of contents: the line item holding the entry's
 * words, the page number printed next to it, and its nesting level
 * (0 until a level is assigned).
 */
class TocLink {
  constructor (options) {
    const { lineItem, pageNumber } = options
    this.lineItem = lineItem
    this.pageNumber = pageNumber
    this.level = 0
  }
}