UNPKG

ppu-pdf

Version:

Easily extract text from digital PDF files, including coordinates and font sizes, and optionally group text by lines or render scanned PDFs to canvas/PNG.

3 lines 6.19 kB
import { encode } from "@toon-format/toon";
import { CONSTANT } from "../pdf.constant.js";

/** Maximum |y0| difference for two words to share a line in the legacy "y0" algorithm. */
const LEGACY_LINE_Y0_TOLERANCE = 5;

/** Matches text made only of single capital letters separated by one whitespace each, e.g. "T I T L E". */
const SPACED_LETTER_PATTERN = /^([A-Z]\s)+[A-Z]$/;

/** Comparator ordering words by ascending bbox.x0. */
const byX0 = (a, b) => a.bbox.x0 - b.bbox.x0;

/**
 * Computes the smallest bbox enclosing every word's bbox.
 * For an empty list the result is the inverted box
 * { x0: Infinity, y0: Infinity, x1: -Infinity, y1: -Infinity }.
 *
 * @param {Array<{bbox: {x0: number, y0: number, x1: number, y1: number}}>} words
 * @returns {{x0: number, y0: number, x1: number, y1: number}}
 */
function unionBbox(words) {
  let x0 = Infinity;
  let y0 = Infinity;
  let x1 = -Infinity;
  let y1 = -Infinity;
  for (const { bbox } of words) {
    x0 = Math.min(x0, bbox.x0);
    y0 = Math.min(y0, bbox.y0);
    x1 = Math.max(x1, bbox.x1);
    y1 = Math.max(y1, bbox.y1);
  }
  return { x0, y0, x1, y1 };
}

/**
 * Groups words into lines: a word joins the first existing line whose vertical
 * midpoint lies within the word's [y0, y1] range, otherwise it starts a new line.
 *
 * The vertical extent of each line is maintained incrementally, which yields
 * the same min/max as rescanning every word of the line for each candidate.
 *
 * @param {Array<{bbox: {y0: number, y1: number}}>} words
 * @returns {Array<Array<object>>} Word groups, in order of line creation.
 */
function groupWordsByMidline(words) {
  const groups = [];
  for (const word of words) {
    const target = groups.find((group) => {
      const midY = (group.y0 + group.y1) / 2;
      return word.bbox.y0 <= midY && word.bbox.y1 >= midY;
    });
    if (target) {
      target.words.push(word);
      target.y0 = Math.min(target.y0, word.bbox.y0);
      target.y1 = Math.max(target.y1, word.bbox.y1);
    } else {
      groups.push({
        words: [word],
        y0: Math.min(Infinity, word.bbox.y0),
        y1: Math.max(-Infinity, word.bbox.y1),
      });
    }
  }
  return groups.map((group) => group.words);
}

/**
 * Counts the non-empty whitespace-separated tokens of a string.
 *
 * @param {string} text
 * @returns {number}
 */
function countWords(text) {
  return text.split(/\s+/).filter((word) => word.length > 0).length;
}

/**
 * Shared helpers for sorting, de-duplicating and grouping extracted PDF words,
 * and for deciding whether extracted text is sparse enough to treat as scanned.
 *
 * Words are objects exposing at least `text` and `bbox: {x0, y0, x1, y1}`;
 * some methods additionally read `metadata.pageNum` or `metadata.font.size`.
 * Page collections are Map-like objects (`size`, `get(index)`) whose values
 * expose a `words` array and whose keys are assumed to be the consecutive
 * integers starting at `startIndex`.
 */
export class BasePdfReaderCommon {
  /**
   * Sorts words in place into reading order with a vertical tolerance.
   * Two words whose y0 values differ by at most half of their average bbox
   * height are ordered by x0; otherwise they are ordered by y0.
   *
   * NOTE(review): a tolerance-based comparator is not guaranteed to be
   * transitive, so the resulting order may depend on the input order.
   *
   * @param {Array<object>} texts Words to sort (mutated).
   * @returns {Array<object>} The same array, sorted.
   */
  sortTextContent(texts) {
    return texts.sort((a, b) => {
      const heightA = Math.abs(a.bbox.y1 - a.bbox.y0);
      const heightB = Math.abs(b.bbox.y1 - b.bbox.y0);
      const threshold = ((heightA + heightB) / 2) * 0.5;
      const verticalDiff = Math.abs(a.bbox.y0 - b.bbox.y0);
      if (verticalDiff <= threshold) {
        return a.bbox.x0 - b.bbox.x0;
      }
      return a.bbox.y0 - b.bbox.y0;
    });
  }

  /**
   * Sorts words in place by y0, breaking exact ties by x0.
   *
   * @param {Array<object>} texts Words to sort (mutated).
   * @returns {Array<object>} The same array, sorted.
   */
  sortTextContentSimple(texts) {
    return texts.sort((a, b) => a.bbox.y0 - b.bbox.y0 || a.bbox.x0 - b.bbox.x0);
  }

  /**
   * Drops words that repeat an earlier word's text, bbox and page number,
   * keeping the first occurrence and the original order.
   *
   * @param {Array<object>} texts Words exposing `text`, `bbox` and `metadata.pageNum`.
   * @returns {Array<object>} New array without the duplicates.
   */
  removeFakeBold(texts) {
    const seen = new Set();
    const result = [];
    for (const word of texts) {
      const { x0, y0, x1, y1 } = word.bbox;
      // Every field is delimited; without a separator before pageNum,
      // (y1=1, page 12) and (y1=11, page 2) would produce the same key.
      const key = `${word.text}|${x0},${y0},${x1},${y1}|${word.metadata.pageNum}`;
      if (!seen.has(key)) {
        seen.add(key);
        result.push(word);
      }
    }
    return result;
  }

  /**
   * Builds the lines of every page via {@link BasePdfReaderCommon#getLines}.
   *
   * @param {Map<number, {words: Array<object>}>} pageTexts Pages keyed by index.
   * @param {number} [startIndex=0] Key of the first page.
   * @returns {Map<number, Array<object>>} Lines per page; a missing page maps to [].
   */
  getLinesFromTextsCommon(pageTexts, startIndex = 0) {
    const pageLines = new Map();
    const endIndex = pageTexts.size + startIndex;
    for (let i = startIndex; i < endIndex; i++) {
      const pdfText = pageTexts.get(i);
      pageLines.set(i, pdfText ? this.getLines(pdfText.words) : []);
    }
    return pageLines;
  }

  /**
   * Groups words into lines (a word joins the first line whose vertical
   * midpoint falls inside the word's y-range) and merges each group.
   *
   * @param {Array<object>} [words=[]]
   * @returns {Array<object>} Merged lines, see {@link BasePdfReaderCommon#mergeLines}.
   */
  getLines(words = []) {
    return this.mergeLines(groupWordsByMidline(words));
  }

  /**
   * Converts word groups into line records.
   * Each group array is sorted in place by x0 and reused as the line's `words`.
   *
   * @param {Array<Array<object>>} lines Word groups; words expose `metadata.font.size`.
   * @returns {Array<{bbox: object, averageFontSize: number, dimension: {width: number, height: number}, words: Array<object>, text: string}>}
   */
  mergeLines(lines) {
    return lines.map((lineWords) => {
      const bbox = unionBbox(lineWords);
      lineWords.sort(byX0);
      const fontSizeSum = lineWords.reduce((sum, word) => sum + word.metadata.font.size, 0);
      return {
        bbox,
        averageFontSize: fontSizeSum / lineWords.length,
        dimension: { width: bbox.x1 - bbox.x0, height: bbox.y1 - bbox.y0 },
        words: lineWords,
        text: lineWords.map((word) => word.text).join(" "),
      };
    });
  }

  /**
   * Same grouping as {@link BasePdfReaderCommon#getLines}, but produces compact
   * lines whose words carry only `text` and `bbox`.
   *
   * @param {Array<object>} [words=[]]
   * @returns {Array<{bbox: object, words: Array<{text: string, bbox: object}>, text: string}>}
   */
  getCompactLines(words = []) {
    return this.mergeCompactLines(groupWordsByMidline(words));
  }

  /**
   * Converts word groups into compact line records.
   * Each group array is sorted in place by x0.
   *
   * @param {Array<Array<object>>} lines Word groups.
   * @returns {Array<{bbox: object, words: Array<{text: string, bbox: object}>, text: string}>}
   */
  mergeCompactLines(lines) {
    return lines.map((lineWords) => {
      const bbox = unionBbox(lineWords);
      lineWords.sort(byX0);
      return {
        bbox,
        words: lineWords.map(({ text, bbox: wordBbox }) => ({ text, bbox: wordBbox })),
        text: lineWords.map((word) => word.text).join(" "),
      };
    });
  }

  /**
   * Legacy grouping: a word joins the first line whose first word's y0 is
   * within 5 units of the word's y0; otherwise it starts a new line.
   *
   * @param {Array<object>} [words=[]]
   * @returns {Array<{bbox: object, words: Array<object>, text: string}>}
   */
  getCompactLinesOldAlgorithm(words = []) {
    const lines = [];
    for (const word of words) {
      const line = lines.find(
        (candidate) => Math.abs(candidate[0].bbox.y0 - word.bbox.y0) <= LEGACY_LINE_Y0_TOLERANCE,
      );
      if (line) {
        line.push(word);
      } else {
        lines.push([word]);
      }
    }
    return this.mergeCompactLinesOldAlgorithm(lines);
  }

  /**
   * Converts legacy word groups into line records.
   * Each group array is sorted in place by x0; `words` is a new array holding
   * the same word objects in that order.
   *
   * @param {Array<Array<object>>} lines Word groups.
   * @returns {Array<{bbox: object, words: Array<object>, text: string}>}
   */
  mergeCompactLinesOldAlgorithm(lines) {
    return lines.map((line) => {
      const words = [...line.sort(byX0)];
      return {
        // Upper bounds start from -Infinity (not 0) so that boxes with
        // negative coordinates are enclosed correctly.
        bbox: unionBbox(words),
        words,
        text: words.map((word) => word.text).join(" "),
      };
    });
  }

  /**
   * Builds compact lines for every page.
   *
   * @param {Map<number, {words: Array<object>}>} pageTexts Pages keyed by index.
   * @param {string} [algorithm="middleY"] "y0" selects the legacy algorithm;
   *   any other value selects the midpoint-based grouping.
   * @param {number} [startIndex=0] Key of the first page.
   * @returns {Map<number, Array<object>>} Compact lines per page; a missing page maps to [].
   */
  getCompactLinesFromTextsCommon(pageTexts, algorithm = "middleY", startIndex = 0) {
    const pageLines = new Map();
    const endIndex = pageTexts.size + startIndex;
    for (let i = startIndex; i < endIndex; i++) {
      const pdfText = pageTexts.get(i);
      let lines = [];
      if (pdfText) {
        const compactWords = this.mapWordsToCompactWords(pdfText.words);
        lines =
          algorithm === "y0"
            ? this.getCompactLinesOldAlgorithm(compactWords)
            : this.getCompactLines(compactWords);
      }
      pageLines.set(i, lines);
    }
    return pageLines;
  }

  /**
   * Reduces words to `{ text, bbox }` objects (the bbox object is shared, not copied).
   *
   * @param {Array<object>} [words=[]]
   * @returns {Array<{text: string, bbox: object}>}
   */
  mapWordsToCompactWords(words = []) {
    return words.map((word) => ({ text: word.text, bbox: word.bbox }));
  }

  /**
   * Decides whether a document looks scanned: true when the average number of
   * words per page is below `options.wordsPerPage` or when the combined text
   * length is below `options.textLength`.
   *
   * @param {Map<number, {words: Array<object>}>} pageTexts Pages keyed by index.
   * @param {{wordsPerPage?: number, textLength?: number}} [options] Thresholds;
   *   a missing field falls back to the corresponding CONSTANT default.
   * @param {number} [startIndex=0] Key of the first page.
   * @returns {boolean}
   */
  isScannedCommon(
    pageTexts,
    options = {
      wordsPerPage: CONSTANT.WORDS_PER_PAGE_THRESHOLD,
      textLength: CONSTANT.TEXT_LENGTH_THRESHOLD,
    },
    startIndex = 0,
  ) {
    // Per-field defaults: a partially specified options object must not
    // silently disable a criterion by comparing against undefined.
    const {
      wordsPerPage = CONSTANT.WORDS_PER_PAGE_THRESHOLD,
      textLength = CONSTANT.TEXT_LENGTH_THRESHOLD,
    } = options;

    const totalPages = pageTexts.size;
    const endIndex = totalPages + startIndex;
    let totalWords = 0;
    // Only the length of the concatenated text matters, so track it directly.
    let fullTextLength = 0;

    for (let i = startIndex; i < endIndex; i++) {
      const page = pageTexts.get(i);
      if (!page) {
        continue;
      }
      const texts = page.words.map((word) => word.text).join(" ");
      fullTextLength += texts.length + 1; // +1 for the separator after each page
      totalWords += countWords(texts);
    }

    // With zero pages this is NaN, which compares false, so only the
    // text-length criterion decides the result in that case.
    const averageWordsPerPage = totalWords / totalPages;
    return averageWordsPerPage < wordsPerPage || fullTextLength < textLength;
  }

  /**
   * Collapses runs of spaces to one space, removes all whitespace from text
   * that consists solely of spaced single capital letters (e.g. "A B C" ->
   * "ABC"), and trims the result.
   *
   * @param {string | null | undefined} str
   * @returns {string | undefined} Normalized text, or undefined for null/undefined input.
   */
  normalizedText(str) {
    if (str == null) {
      return undefined;
    }
    const collapsed = str.replace(/ +/g, " ");
    const compacted = SPACED_LETTER_PATTERN.test(collapsed)
      ? collapsed.replace(/\s/g, "")
      : collapsed;
    return compacted.trim();
  }

  /**
   * Decides whether a single page looks scanned: true when the normalized
   * text has fewer words than `options.wordsPerPage` or is shorter than
   * `options.textLength`. Null/undefined text is treated as empty.
   *
   * @param {string | null | undefined} pageText
   * @param {{wordsPerPage?: number, textLength?: number}} [options] Thresholds;
   *   a missing field falls back to the corresponding CONSTANT default.
   * @returns {boolean}
   */
  isPageScannedCommon(
    pageText,
    options = {
      wordsPerPage: CONSTANT.WORDS_PER_PAGE_THRESHOLD,
      textLength: CONSTANT.TEXT_LENGTH_THRESHOLD,
    },
  ) {
    const {
      wordsPerPage = CONSTANT.WORDS_PER_PAGE_THRESHOLD,
      textLength = CONSTANT.TEXT_LENGTH_THRESHOLD,
    } = options;
    const normalizedText = this.normalizedText(pageText) ?? "";
    return countWords(normalizedText) < wordsPerPage || normalizedText.length < textLength;
  }

  /**
   * Encodes words as TOON, each word reduced to `{ text, bbox: [x0, y0, x1, y1] }`.
   *
   * @param {Array<object>} pdfWords
   * @param {boolean} enableToon When falsy, nothing is encoded.
   * @returns {string} The value returned by `encode`, or "" when disabled.
   */
  getToonWords(pdfWords, enableToon) {
    if (!enableToon) {
      return "";
    }
    const simplifiedWords = pdfWords.map(({ text, bbox }) => ({
      text,
      bbox: [bbox.x0, bbox.y0, bbox.x1, bbox.y1],
    }));
    return encode(simplifiedWords);
  }

  /**
   * Renders the lines of every page as one string. For each page, the lines
   * become an object keyed by line index whose values map
   * "x0,y0,x1,y1" -> word text; that object is TOON-encoded and appended
   * after a "# Page <i> lines:" header. A missing page encodes `{}`.
   *
   * @param {Map<number, {words: Array<object>}>} pageTexts Pages keyed by index.
   * @param {number} [startIndex=0] Key of the first page.
   * @returns {string}
   */
  getLinesFromTextsInToonCommon(pageTexts, startIndex = 0) {
    let pageLines = "";
    const endIndex = pageTexts.size + startIndex;
    for (let i = startIndex; i < endIndex; i++) {
      const pdfText = pageTexts.get(i);
      let lines = {};
      if (pdfText) {
        lines = Object.fromEntries(
          this.getLines(pdfText.words).map((line, index) => [
            `${index}`,
            // Words sharing an identical bbox overwrite each other; the last one wins.
            Object.fromEntries(
              line.words.map((word) => [
                `${word.bbox.x0},${word.bbox.y0},${word.bbox.x1},${word.bbox.y1}`,
                word.text,
              ]),
            ),
          ]),
        );
      }
      pageLines += `# Page ${i} lines: ${encode(lines)} `;
    }
    return pageLines;
  }
}