parse-pdf
Version:
Get the text content and metadata of a PDF
28 lines (20 loc) • 614 B
JavaScript
const pdfjs = require('pdfjs-dist')
/**
 * Read one page of a loaded PDF document and return its text as a single string.
 *
 * The `str` of every text item is appended in the order the items are
 * reported, with no separator inserted between them.
 *
 * @param {number} pageNum - Page number, passed straight to `doc.getPage`.
 * @param {object} doc - Document object exposing an async `getPage(pageNum)`.
 * @returns {Promise<string>} Concatenated text of the page ('' when it has no items).
 */
async function getPageText (pageNum, doc) {
  const pdfPage = await doc.getPage(pageNum)
  const { items } = await pdfPage.getTextContent()

  let pageText = ''
  for (const item of items) {
    pageText += item.str
  }
  return pageText
}
module.exports = async (contentBuffer, { customPdfjs } = {}) => {
let pdfjsAPI = pdfjs
if (customPdfjs) {
pdfjsAPI = customPdfjs
}
const doc = await pdfjsAPI.getDocument(contentBuffer).promise
const result = { pages: [] }
for (let i = 1; i < doc.numPages + 1; i++) {
result.pages.push({
text: await getPageText(i, doc)
})
}
return result
}