UNPKG

ppu-pdf

Version:

Easily extract text from digital PDF files with coordinates and font sizes included, and optionally group text by lines or render scanned PDFs to canvas/PNG.

1 lines 5.59 kB
import { createCanvas } from "@napi-rs/canvas";
import { createWriteStream, existsSync, mkdirSync } from "fs";
import { join } from "path";
import { CONSTANT } from "./pdf.constant";

/** Matches strings made only of single capital letters separated by one whitespace character, e.g. "T I T L E". */
const SPACED_CAPITALS_PATTERN = /^([A-Z]\s)+[A-Z]$/;

/** Comparator ordering words left-to-right by the left edge of their bounding box. */
const byLeftEdge = (a, b) => a.bbox.x0 - b.bbox.x0;

/**
 * Computes the smallest bounding box enclosing every word's `bbox`.
 *
 * @param {Iterable<{bbox: {x0: number, y0: number, x1: number, y1: number}}>} words
 * @returns {{x0: number, y0: number, x1: number, y1: number}} Union box; for an
 *   empty input the minimums stay at `Infinity` and the maximums at `-Infinity`.
 */
function unionBbox(words) {
  let x0 = Infinity;
  let y0 = Infinity;
  let x1 = -Infinity;
  let y1 = -Infinity;
  for (const { bbox } of words) {
    x0 = Math.min(x0, bbox.x0);
    y0 = Math.min(y0, bbox.y0);
    x1 = Math.max(x1, bbox.x1);
    y1 = Math.max(y1, bbox.y1);
  }
  return { x0, y0, x1, y1 };
}

/**
 * Groups words into lines. A word joins the first existing group whose current
 * vertical midpoint lies inside the word's own [y0, y1] span; otherwise it
 * starts a new group.
 *
 * The vertical extent of each group is cached and widened on every append, so
 * a group's words are not rescanned for each incoming word.
 *
 * @param {Array<{bbox: {y0: number, y1: number}}>} words
 * @returns {Array<Array<object>>} Word groups, in order of creation; words
 *   inside a group keep their input order.
 */
function groupWordsByMidline(words) {
  const groups = [];
  for (const word of words) {
    const { y0, y1 } = word.bbox;
    let target = groups.find((group) => {
      const midY = (group.y0 + group.y1) / 2;
      return y0 <= midY && y1 >= midY;
    });
    if (!target) {
      target = { words: [], y0: Infinity, y1: -Infinity };
      groups.push(target);
    }
    target.words.push(word);
    target.y0 = Math.min(target.y0, y0);
    target.y1 = Math.max(target.y1, y1);
  }
  return groups.map((group) => group.words);
}

/**
 * Shared helpers for PDF readers: canvas dumping, text sorting, line grouping
 * and a scanned-document heuristic.
 *
 * Words handled here are objects with a `text` string and a `bbox` of the form
 * `{x0, y0, x1, y1}`; `mergeLines` additionally reads `metadata.font.size`.
 */
export class PdfReaderCommon {
  /**
   * Writes a canvas as a PNG file under `<process.cwd()>/<foldername>/<filename>`,
   * creating the folder (recursively) when it does not exist.
   *
   * The canvas is first drawn onto a fresh `@napi-rs/canvas` canvas of the same
   * size, and that copy is encoded.
   *
   * @param {object} canvas - Source canvas; must expose `width`/`height` and be drawable via `drawImage`.
   * @param {string} filename - Output file name.
   * @param {string} foldername - Output folder, relative to the current working directory.
   * @returns {Promise<void>} Resolves once the stream has finished writing;
   *   rejects on any encoding, directory or stream error.
   */
  async saveCanvasToPng(canvas, filename, foldername) {
    const folderPath = join(process.cwd(), foldername);
    if (!existsSync(folderPath)) {
      mkdirSync(folderPath, { recursive: true });
    }

    const copy = createCanvas(canvas.width, canvas.height);
    copy.getContext("2d").drawImage(canvas, 0, 0);
    // Encode before opening the file so an encoding failure leaves nothing behind.
    const buffer = copy.toBuffer("image/png");
    const filePath = join(folderPath, filename);

    return new Promise((resolve, reject) => {
      const out = createWriteStream(filePath);
      // Without an 'error' listener a failed open/write would be thrown as an
      // unhandled 'error' event instead of rejecting this promise.
      out.on("error", reject);
      out.once("finish", () => resolve());
      // end() flushes the buffer and lets the stream close its file descriptor.
      out.end(buffer);
    });
  }

  /**
   * Saves every canvas of a page-indexed map as `<filename>-<index>.png`.
   * Pages are written one after another; missing indexes are skipped.
   *
   * @param {Map<number, object>} canvasMap - Canvases keyed by page index.
   * @param {string} filename - File name prefix.
   * @param {string} [foldername="out"] - Output folder, relative to the current working directory.
   * @param {number} [startIndex=0] - Index of the first page in the map.
   * @returns {Promise<void>}
   */
  async dumpCanvasMapCommon(canvasMap, filename, foldername = "out", startIndex = 0) {
    const end = canvasMap.size + startIndex;
    for (let i = startIndex; i < end; i++) {
      const canvas = canvasMap.get(i);
      if (canvas) {
        await this.saveCanvasToPng(canvas, `${filename}-${i}.png`, foldername);
      }
    }
  }

  /**
   * Sorts texts in reading order, in place: items whose `y0` differ by at most
   * half of their average height are ordered by `x0`, everything else by `y0`.
   *
   * NOTE(review): the tolerance makes the comparator not strictly transitive,
   * so the exact order of borderline items may depend on the sort algorithm.
   *
   * @param {Array<{bbox: object}>} texts - Mutated and returned.
   * @returns {Array<{bbox: object}>} The same array, sorted.
   */
  sortTextContent(texts) {
    return texts.sort((a, b) => {
      const heightA = Math.abs(a.bbox.y1 - a.bbox.y0);
      const heightB = Math.abs(b.bbox.y1 - b.bbox.y0);
      const threshold = ((heightA + heightB) / 2) * 0.5;
      const verticalDiff = Math.abs(a.bbox.y0 - b.bbox.y0);
      return verticalDiff <= threshold
        ? a.bbox.x0 - b.bbox.x0
        : a.bbox.y0 - b.bbox.y0;
    });
  }

  /**
   * Sorts texts in place by exact `y0`, breaking ties by `x0`.
   *
   * @param {Array<{bbox: object}>} texts - Mutated and returned.
   * @returns {Array<{bbox: object}>} The same array, sorted.
   */
  sortTextContentSimple(texts) {
    return texts.sort(
      (a, b) => a.bbox.y0 - b.bbox.y0 || a.bbox.x0 - b.bbox.x0,
    );
  }

  /**
   * Builds lines for every page index in `[startIndex, startIndex + pageTexts.size)`.
   * Indexes with no entry in `pageTexts` map to an empty array.
   *
   * @param {Map<number, {words: Array<object>}>} pageTexts - Page texts keyed by page index.
   * @param {number} [startIndex=0] - Index of the first page.
   * @returns {Map<number, Array<object>>} Lines (see `mergeLines`) keyed by page index.
   */
  getLinesFromTextsCommon(pageTexts, startIndex = 0) {
    const pageLines = new Map();
    const end = pageTexts.size + startIndex;
    for (let i = startIndex; i < end; i++) {
      const pdfText = pageTexts.get(i);
      pageLines.set(i, pdfText ? this.getLines(pdfText.words) : []);
    }
    return pageLines;
  }

  /**
   * Groups words into lines using the vertical-midpoint rule and merges each
   * group with `mergeLines`.
   *
   * @param {Array<object>} [words=[]]
   * @returns {Array<object>} Merged lines.
   */
  getLines(words = []) {
    return this.mergeLines(groupWordsByMidline(words));
  }

  /**
   * Turns word groups into line records. Each group is sorted left-to-right in
   * place.
   *
   * @param {Array<Array<object>>} lines - Word groups; words need `text`, `bbox` and `metadata.font.size`.
   * @returns {Array<{bbox: object, averageFontSize: number, dimension: {width: number, height: number}, words: Array<object>, text: string}>}
   */
  mergeLines(lines) {
    return lines.map((lineWords) => {
      const bbox = unionBbox(lineWords);
      lineWords.sort(byLeftEdge);
      const fontSizeTotal = lineWords.reduce(
        (sum, word) => sum + word.metadata.font.size,
        0,
      );
      return {
        bbox,
        averageFontSize: fontSizeTotal / lineWords.length,
        dimension: { width: bbox.x1 - bbox.x0, height: bbox.y1 - bbox.y0 },
        words: lineWords,
        text: lineWords.map((word) => word.text).join(" "),
      };
    });
  }

  /**
   * Groups words into lines using the vertical-midpoint rule and merges each
   * group with `mergeCompactLines`.
   *
   * @param {Array<object>} [words=[]]
   * @returns {Array<object>} Merged compact lines.
   */
  getCompactLines(words = []) {
    return this.mergeCompactLines(groupWordsByMidline(words));
  }

  /**
   * Turns word groups into compact line records whose words carry only `text`
   * and `bbox`. Each group is sorted left-to-right in place.
   *
   * @param {Array<Array<object>>} lines - Word groups.
   * @returns {Array<{bbox: object, words: Array<{text: string, bbox: object}>, text: string}>}
   */
  mergeCompactLines(lines) {
    return lines.map((lineWords) => {
      const bbox = unionBbox(lineWords);
      lineWords.sort(byLeftEdge);
      return {
        bbox,
        words: lineWords.map(({ text, bbox: wordBbox }) => ({
          text,
          bbox: wordBbox,
        })),
        text: lineWords.map((word) => word.text).join(" "),
      };
    });
  }

  /**
   * Legacy grouping: a word joins the first line whose first word has a `y0`
   * within `yTolerance` of the word's `y0`.
   *
   * @param {Array<object>} [words=[]]
   * @param {number} [yTolerance=5] - Maximum `y0` distance for two words to share a line.
   * @returns {Array<object>} Lines merged by `mergeCompactLinesOldAlgorithm`.
   */
  getCompactLinesOldAlgorithm(words = [], yTolerance = 5) {
    const lines = [];
    for (const word of words) {
      const line = lines.find(
        (candidate) =>
          Math.abs(candidate[0].bbox.y0 - word.bbox.y0) <= yTolerance,
      );
      if (line) {
        line.push(word);
      } else {
        lines.push([word]);
      }
    }
    return this.mergeCompactLinesOldAlgorithm(lines);
  }

  /**
   * Turns legacy word groups into line records. Each group is sorted
   * left-to-right in place; the returned `words` array is a copy holding the
   * same word objects.
   *
   * The bounding box maximums start from `-Infinity` (like the other merge
   * methods) so words with negative coordinates produce a correct box.
   *
   * @param {Array<Array<object>>} lines - Word groups.
   * @returns {Array<{bbox: object, words: Array<object>, text: string}>}
   */
  mergeCompactLinesOldAlgorithm(lines) {
    return lines.map((line) => {
      line.sort(byLeftEdge);
      const words = [...line];
      return {
        bbox: unionBbox(words),
        words,
        text: words.map((word) => word.text).join(" "),
      };
    });
  }

  /**
   * Builds compact lines for every page index in
   * `[startIndex, startIndex + pageTexts.size)`. Indexes with no entry in
   * `pageTexts` map to an empty array.
   *
   * @param {Map<number, {words: Array<object>}>} pageTexts - Page texts keyed by page index.
   * @param {string} [algorithm="middleY"] - `"y0"` selects the legacy grouping; any other value uses the midpoint rule.
   * @param {number} [startIndex=0] - Index of the first page.
   * @returns {Map<number, Array<object>>} Compact lines keyed by page index.
   */
  getCompactLinesFromTextsCommon(pageTexts, algorithm = "middleY", startIndex = 0) {
    const pageLines = new Map();
    const end = pageTexts.size + startIndex;
    for (let i = startIndex; i < end; i++) {
      const pdfText = pageTexts.get(i);
      let lines = [];
      if (pdfText) {
        const compactWords = this.mapWordsToCompactWords(pdfText.words);
        lines =
          algorithm === "y0"
            ? this.getCompactLinesOldAlgorithm(compactWords)
            : this.getCompactLines(compactWords);
      }
      pageLines.set(i, lines);
    }
    return pageLines;
  }

  /**
   * Strips words down to their `text` and `bbox` properties.
   *
   * @param {Array<{text: string, bbox: object}>} [words=[]]
   * @returns {Array<{text: string, bbox: object}>} New objects sharing the original `bbox` references.
   */
  mapWordsToCompactWords(words = []) {
    return words.map(({ text, bbox }) => ({ text, bbox }));
  }

  /**
   * Heuristic for scanned documents: returns `true` when the average number of
   * words per page, or the total extracted text length, is below its threshold.
   *
   * The average divides by `pageTexts.size`, so pages missing from the index
   * range still count towards the page total.
   *
   * @param {Map<number, {words: Array<{text: string}>}>} pageTexts - Page texts keyed by page index.
   * @param {{wordsPerPage?: number, textLength?: number}} [options] - Thresholds;
   *   each missing field falls back to `CONSTANT.WORDS_PER_PAGE_THRESHOLD` /
   *   `CONSTANT.TEXT_LENGTH_THRESHOLD`.
   * @param {number} [startIndex=0] - Index of the first page.
   * @returns {boolean}
   */
  isScannedCommon(pageTexts, options = {}, startIndex = 0) {
    // Resolve defaults per field so a partial options object does not leave a
    // threshold undefined (which would make its comparison always false).
    const {
      wordsPerPage = CONSTANT.WORDS_PER_PAGE_THRESHOLD,
      textLength = CONSTANT.TEXT_LENGTH_THRESHOLD,
    } = options ?? {};

    const totalPages = pageTexts.size;
    let totalWords = 0;
    let fullText = "";
    for (let i = startIndex; i < totalPages + startIndex; i++) {
      const page = pageTexts.get(i);
      if (!page) {
        continue;
      }
      const pageText = page.words.map((w) => w.text).join(" ");
      fullText += `${pageText} `;
      totalWords += pageText.split(/\s+/).filter((token) => token.length > 0).length;
    }

    const averageWordsPerPage = totalWords / totalPages;
    return averageWordsPerPage < wordsPerPage || fullText.length < textLength;
  }

  /**
   * Collapses runs of spaces into one. If the result consists solely of single
   * capital letters separated by one whitespace character (e.g. "T I T L E"),
   * all whitespace is removed; otherwise the result is trimmed.
   *
   * @param {string | null | undefined} str
   * @returns {string | null | undefined} The normalized string; `null` and
   *   `undefined` are returned unchanged.
   */
  normalizedText(str) {
    if (str == null) {
      return str;
    }
    const collapsed = str.replace(/ +/g, " ");
    if (SPACED_CAPITALS_PATTERN.test(collapsed)) {
      return collapsed.replace(/\s/g, "");
    }
    return collapsed.trim();
  }
}