z-pdf-parser
Version:
112 lines (95 loc) • 3.63 kB
JavaScript
const DEFAULT_OPTIONS = require("./contants");
const { endPage, startPage } = require("./contants");
// Module-level cache for the pdf.js library: null until zPDFParser loads it with
// require() on its first call, after which the same loaded module is reused.
var PDFJS = null;
/**
 * Joins text items into plain text, grouping consecutive items that share the
 * same row coordinate (`item.transform[5]`) onto one line.
 *
 * Items on the same row as the previous item are appended after a single space;
 * an item whose row coordinate differs from the previous one starts a new line.
 * The very first item is emitted without any separator.
 *
 * @param {Array} textContent Array of text items, each with a `str` string and a
 *   `transform` array whose element at index 5 is used as the row coordinate.
 * @return {String} Text with formatted rows ("" for an empty array).
 */
function rowSplitter(textContent) {
  let previousRow;
  let isFirstItem = true;
  let text = "";

  for (const item of textContent) {
    const currentRow = item.transform[5];

    if (isFirstItem) {
      // Track "first item" explicitly: a row coordinate of 0 is a valid value
      // and must not be mistaken for "no previous row yet".
      text += item.str;
      isFirstItem = false;
    } else if (currentRow === previousRow) {
      text += ` ${item.str}`;
    } else {
      text += `\n${item.str}`;
    }

    previousRow = currentRow;
  }

  return text;
}
/**
 * Extracts the text of a single page as row-formatted plain text.
 *
 * Awaits `page.getTextContent()` and passes the resulting `items` array to
 * `rowSplitter`.
 *
 * @param {Object} page PDFPageProxy Object.
 * @return {Promise} Promise resolving to the string returned by `rowSplitter`.
 */
async function pageRenderer(page) {
  const { items } = await page.getTextContent();
  return rowSplitter(items);
}
/**
 * Builds the effective option set: each recognised option is taken from the
 * caller's object when it has the expected type, otherwise from DEFAULT_OPTIONS.
 * A fresh object is returned so neither the caller's object nor the shared
 * defaults are modified.
 *
 * @param {Object} [options] Options as passed to zPDFParser (may be omitted).
 * @return {Object} `{ version, pageRange, startPage, endPage }`.
 */
function resolveParserOptions(options) {
  const supplied = options == null ? {} : options;
  const pick = (key, expectedType) =>
    typeof supplied[key] === expectedType ? supplied[key] : DEFAULT_OPTIONS[key];

  return {
    version: pick("version", "string"),
    pageRange: pick("pageRange", "boolean"),
    startPage: pick("startPage", "number"),
    endPage: pick("endPage", "number"),
  };
}

/**
 * Renders pages `firstPage`..`lastPage` (both inclusive) one after another and
 * joins their text with a newline between pages.
 *
 * @param {Object} pdfData Loaded document exposing `getPage(pageNumber)`.
 * @param {Number} firstPage First page number to render.
 * @param {Number} lastPage Last page number to render (inclusive).
 * @return {Promise} Promise resolving to the concatenated page text.
 */
async function collectPageText(pdfData, firstPage, lastPage) {
  let text = "";
  for (let index = firstPage; index <= lastPage; index++) {
    const page = await pdfData.getPage(index);
    const renderedPage = await pageRenderer(page);
    text += index === firstPage ? renderedPage : `\n${renderedPage}`;
  }
  return text;
}

/**
 * Parses a PDF and returns its text together with basic document information.
 *
 * The pdf.js build is loaded on the first call and cached in the module-level
 * `PDFJS` variable, so `options.version` only selects the build on that first
 * call. Metadata is best-effort: if `getMetadata()` rejects, `info` and
 * `metadata` are returned as null.
 *
 * @param {Buffer} dataBuffer File Buffer (.pdf).
 * @param {Object} options Options { version: string, pageRange: true || false, startPage: number, endPage: number }.
 * @param {String} options.version pdf.js build directory to load. Falls back to DEFAULT_OPTIONS.version.
 * @param {Boolean} options.pageRange When false, all pages are parsed; when true, pages startPage..endPage (inclusive) are parsed. Falls back to DEFAULT_OPTIONS.pageRange.
 * @param {Number} options.startPage Starting page for parser. Falls back to DEFAULT_OPTIONS.startPage.
 * @param {Number} options.endPage End page for parser. Falls back to DEFAULT_OPTIONS.endPage.
 * @return {Promise} Promise resolving to `{ version, metadata, pageCount, info, text }`.
 *   When `pageRange` is true and `endPage` exceeds the document's page count, an
 *   error is logged with `console.error` and the promise resolves to `undefined`.
 */
async function zPDFParser(dataBuffer, options) {
  const settings = resolveParserOptions(options);

  if (!PDFJS) {
    PDFJS = require(`../pdfjs/pdfjs-dist/es5/${settings.version}/pdf.js`);
  }
  PDFJS.disableWorker = true;

  const pdfData = await PDFJS.getDocument(dataBuffer).promise;
  const pageCount = pdfData.numPages;
  // Deliberately tolerant: a document whose metadata cannot be read is still parsed.
  const metaData = await pdfData.getMetadata().catch(() => null);

  const response = {
    version: PDFJS.version,
    metadata: metaData ? metaData.metadata : null,
    pageCount: 0,
    info: metaData ? metaData.info : null,
    text: "",
  };

  if (settings.pageRange) {
    // The end-page bound only matters when a range was actually requested.
    if (settings.endPage > pageCount) {
      console.error(
        `Document have ${pageCount} page. End page cannot be higer then that.`
      );
      return undefined;
    }
    response.text = await collectPageText(
      pdfData,
      settings.startPage,
      settings.endPage
    );
    response.pageCount =
      Number.parseInt(settings.endPage, 10) -
      Number.parseInt(settings.startPage, 10) +
      1;
  } else {
    // Page numbers run from 1 to pageCount inclusive, so the last page is included.
    response.text = await collectPageText(pdfData, 1, pageCount);
    response.pageCount = pageCount;
  }

  return response;
}
module.exports = zPDFParser;