UNPKG

ppu-pdf

Version:

Easily extract text from digital PDF files, with coordinates and font sizes included, and optionally group text by line or render scanned PDFs to canvas/PNG.

1 lines 7.82 kB
import * as pdfjs from "pdfjs-dist";
import { BasePdfReaderCommon } from "../core/base-pdf-reader-common.js";
import { CONSTANT, PDF_READER_DEFAULT_OPTIONS } from "../pdf.constant.js";

/**
 * PDF reader backed by `pdfjs-dist` that renders pages onto DOM `<canvas>`
 * elements (it calls `document.createElement`, so it needs a DOM).
 *
 * Sorting, fake-bold removal, text normalisation, TOON output and the
 * `*Common` line/scan helpers are not defined here; they are presumably
 * inherited from `BasePdfReaderCommon` (verify against that class).
 */
export class PdfReaderLegacyWeb extends BasePdfReaderCommon {
  /** Effective options: defaults overlaid with the caller's options. */
  options;

  /** First page number used by every page loop in this class. */
  startIndex = 1;

  /**
   * @param {object} [options] Overrides for `PDF_READER_DEFAULT_OPTIONS`.
   *   Note that `fonts` is always reset to an empty array, regardless of
   *   what the caller passes.
   */
  constructor(options = {}) {
    super();
    this.options = { ...PDF_READER_DEFAULT_OPTIONS, ...options, fonts: [] };
  }

  /**
   * Opens a PDF document from binary data.
   *
   * @param {ArrayBuffer|ArrayLike<number>} data Raw PDF bytes; wrapped in a `Uint8Array`.
   * @returns {Promise<object>} The promise of the pdf.js loading task.
   */
  async open(data) {
    const uint8 = new Uint8Array(data);
    return pdfjs.getDocument({
      // `verbose` is coerced to a number before being handed to pdf.js.
      verbosity: +this.options.verbose,
      data: uint8,
    }).promise;
  }

  /**
   * Renders every page of the document to its own canvas.
   *
   * @param {object} doc Opened pdf.js document (`numPages`, `getPage`).
   * @returns {Promise<Map<number, HTMLCanvasElement>>} Canvases keyed by page number.
   */
  async renderAll(doc) {
    const canvasMap = new Map();
    const numOfPages = doc.numPages;
    const renderPromises = [];

    // Pages are fetched one after another; the renders themselves are
    // only awaited together at the end.
    for (let i = this.startIndex; i <= numOfPages; i++) {
      const page = await doc.getPage(i);
      renderPromises.push(this.getCanvas(canvasMap, i, page));
    }

    await Promise.all(renderPromises);
    return canvasMap;
  }

  /**
   * Runs OCR over previously rendered canvases.
   *
   * @param {object} ocrService Service exposing `initialize()` and `recognize(canvas)`.
   * @param {Map<number, HTMLCanvasElement>} canvasMap Canvases keyed by page number.
   * @returns {Promise<Map<number, object>>} Per-page results keyed by page number.
   */
  async getTextsScanned(ocrService, canvasMap) {
    await ocrService.initialize();

    const pages = new Map();
    const numOfPages = canvasMap.size;
    const ocrPromises = [];

    // Keys are probed from `startIndex` up to the map size; missing keys are skipped.
    for (let i = this.startIndex; i <= numOfPages; i++) {
      const canvas = canvasMap.get(i);
      if (canvas) {
        ocrPromises.push(this.extractOcrTexts(pages, i, canvas, ocrService));
      }
    }

    await Promise.all(ocrPromises);
    return pages;
  }

  /**
   * Creates a canvas for one page, registers it in `canvasMap` and starts rendering.
   *
   * Scale selection: `options.scale` wins when it is greater than 1; otherwise
   * a scale is derived from `normalizedWidth` when given; otherwise scale 1.
   *
   * @param {Map<number, HTMLCanvasElement>} canvasMap Receives the new canvas.
   * @param {number} pageNum Key under which the canvas is stored.
   * @param {object} page pdf.js page (`getViewport`, `render`).
   * @param {number} [normalizedWidth] Desired canvas width used to derive a scale.
   * @returns {Promise<void>} The promise of the pdf.js render task.
   */
  async getCanvas(canvasMap, pageNum, page, normalizedWidth) {
    let viewport = page.getViewport({ scale: 1 });

    if (this.options.scale && this.options.scale > 1) {
      viewport = page.getViewport({ scale: this.options.scale });
    } else if (normalizedWidth) {
      const ratio = normalizedWidth / viewport.width;
      const wholeScale = Math.floor(ratio);
      // Flooring a ratio below 1 used to yield scale 0 and therefore a
      // zero-sized canvas. Whole-number scaling is kept for enlargements;
      // the fractional ratio is used when shrinking.
      const normalizedScale = wholeScale >= 1 ? wholeScale : ratio;
      // A non-positive or non-finite scale cannot produce a usable
      // viewport, so the scale-1 viewport is kept in that case.
      if (Number.isFinite(normalizedScale) && normalizedScale > 0) {
        viewport = page.getViewport({ scale: normalizedScale });
      }
    }

    const width = Math.floor(viewport.width);
    const height = Math.floor(viewport.height);

    const canvas = document.createElement("canvas");
    canvas.width = width;
    canvas.height = height;

    const context = canvas.getContext("2d", { willReadFrequently: true });
    const renderContext = { intent: "print", canvasContext: context, viewport };

    // The canvas is registered before rendering has finished.
    canvasMap.set(pageNum, canvas);
    return page.render(renderContext).promise;
  }

  /**
   * Extracts the embedded (digital) text of every page.
   *
   * @param {object} pdf Opened pdf.js document (`numPages`, `getPage`).
   * @returns {Promise<Map<number, object>>} Per-page results keyed by page number.
   */
  async getTexts(pdf) {
    const pages = new Map();
    const numOfPages = pdf.numPages;
    const getTextContentPromises = [];

    for (let i = this.startIndex; i <= numOfPages; i++) {
      const page = await pdf.getPage(i);
      getTextContentPromises.push(this.extractTexts(pages, i, page));
    }

    await Promise.all(getTextContentPromises);
    return pages;
  }

  /**
   * Extracts, sorts, optionally merges and filters the text of one page,
   * then stores the result in `linesMap` under `pageNum`.
   *
   * @param {Map<number, object>} linesMap Receives `{words, fullText, confidence, toon}`.
   * @param {number} pageNum Page number used as the map key.
   * @param {object} page pdf.js page (`getViewport`, `getTextContent`).
   * @returns {Promise<void>}
   */
  async extractTexts(linesMap, pageNum, page) {
    const { height, transform } = page.getViewport({ scale: 1 });
    const pdfToken = await page.getTextContent();

    const textsMapped = this.mapTokenToPdfWord(pdfToken.items, transform, pageNum);
    let textsSorted = this.options.simpleSortAlgorithm
      ? this.sortTextContentSimple(textsMapped)
      : this.sortTextContent(textsMapped);

    if (!this.options.raw) {
      textsSorted = this.removeFakeBold(textsSorted);
    }

    const textsMerged = this.options.mergeCloseTextNeighbor
      ? this.mergeTextContent(textsSorted)
      : textsSorted;
    const textsFiltered = this.filterTextContent(textsMerged, height);
    const fullText = textsFiltered.map((word) => word.text).join(" ");

    linesMap.set(pageNum, {
      words: textsFiltered,
      fullText,
      // Embedded text is reported with a fixed confidence of 1.
      confidence: 1,
      toon: this.getToonWords(textsFiltered, this.options.enableToon),
    });
  }

  /**
   * Runs OCR on one canvas and stores the processed result in `linesMap`.
   * Any failure is caught: an empty result with confidence 0 is stored
   * instead, and a warning is logged only when `options.verbose` is set.
   *
   * @param {Map<number, object>} linesMap Receives `{words, fullText, confidence, toon}`.
   * @param {number} pageNum Page number used as the map key.
   * @param {HTMLCanvasElement} canvas Rendered page.
   * @param {object} ocrService Service exposing `recognize(canvas)`.
   * @returns {Promise<void>}
   */
  async extractOcrTexts(linesMap, pageNum, canvas, ocrService) {
    try {
      const ocrResult = await ocrService.recognize(canvas);
      const pdfWords = this.convertOcrToPdfWords(ocrResult, pageNum);

      let textsSorted = this.options.simpleSortAlgorithm
        ? this.sortTextContentSimple(pdfWords)
        : this.sortTextContent(pdfWords);

      if (!this.options.raw) {
        textsSorted = this.removeFakeBold(textsSorted);
      }

      const textsMerged = this.options.mergeCloseTextNeighbor
        ? this.mergeTextContent(textsSorted)
        : textsSorted;
      const canvasHeight = canvas.height;
      const textsFiltered = this.filterTextContent(textsMerged, canvasHeight);
      const fullText = textsFiltered.map((word) => word.text).join(" ");

      linesMap.set(pageNum, {
        words: textsFiltered,
        fullText,
        confidence: ocrResult.confidence,
        toon: this.getToonWords(textsFiltered, this.options.enableToon),
      });
    } catch (error) {
      if (this.options.verbose) {
        console.warn(`OCR failed for page ${pageNum}:`, error);
      }
      // Best effort: a failed page still gets an (empty) entry.
      linesMap.set(pageNum, { words: [], fullText: "", confidence: 0, toon: "" });
    }
  }

  /**
   * Converts an OCR result into the word shape used by the rest of the reader.
   *
   * @param {{lines?: Array<Array<{text: string, box: {x: number, y: number, width: number, height: number}}>>}} ocrResult
   *   OCR output; anything without an array `lines` yields `[]`.
   * @param {number} pageNum Page number stored in each word's metadata.
   * @returns {object[]} Flat list of words.
   */
  convertOcrToPdfWords(ocrResult, pageNum) {
    if (!ocrResult?.lines || !Array.isArray(ocrResult.lines)) {
      return [];
    }

    return ocrResult.lines.flatMap((line) => {
      // Non-array line entries contribute no words.
      if (!Array.isArray(line)) return [];

      return line.map((recognition) => {
        const { x, y, width, height } = recognition.box;
        return {
          text: recognition.text,
          bbox: {
            x0: Math.round(x),
            y0: Math.round(y),
            x1: Math.round(x + width),
            y1: Math.round(y + height),
          },
          dimension: { width: Math.round(width), height: Math.round(height) },
          metadata: {
            writing: "",
            direction: "",
            // The box height stands in for the font size.
            font: { name: "", size: height, family: "", weight: "", style: "" },
            hasEOL: false,
            pageNum,
          },
        };
      });
    });
  }

  /**
   * Maps pdf.js text items to words with viewport-space bounding boxes.
   *
   * @param {object[]} items Text items (`str`, `transform`, `width`, `height`, `dir`, `fontName`, `hasEOL`).
   * @param {number[]} transform Viewport transform matrix (6 elements).
   * @param {number} pageNum Page number stored in each word's metadata.
   * @returns {object[]} One word per item, in input order.
   */
  mapTokenToPdfWord(items, transform, pageNum) {
    const pdfWords = [];

    for (const token of items) {
      // Only the translation components of the combined matrix are needed.
      const [, , , , x, y] = pdfjs.Util.transform(transform, token.transform);

      // The scale is derived from transformed vs. raw x-translation. When the
      // raw x-translation is 0 that ratio is NaN or +/-Infinity and would
      // poison the bounding box, so fall back to the magnitude of the
      // viewport matrix's first column. That should equal the viewport
      // scale for a standard pdf.js rotation+scale matrix (TODO confirm).
      const ratio = x / token.transform[4];
      const scale = Number.isFinite(ratio)
        ? ratio
        : Math.hypot(transform[0], transform[1]);

      pdfWords.push({
        text: token.str,
        bbox: {
          x0: Math.round(x),
          y0: Math.round(y - token.height * scale),
          x1: Math.round(x + token.width * scale),
          y1: Math.round(y),
        },
        // `dimension` uses the unscaled item size.
        dimension: {
          width: Math.round(token.width),
          height: Math.round(token.height),
        },
        metadata: {
          writing: "",
          direction: token.dir,
          font: {
            name: token.fontName,
            size: Number(token.height.toFixed(4)),
            family: "",
            style: "",
            weight: "",
          },
          hasEOL: token.hasEOL,
          pageNum,
        },
      });
    }

    return pdfWords;
  }

  /**
   * Merges neighbouring words that sit on the same line into a single word.
   *
   * A word joins the current group when it starts within one font size of the
   * group's right edge, overlaps the group's vertical midpoint, has the same
   * font size (tolerance 0.01) and the group has not ended a line, or when
   * the group is a lone bullet character on the same line.
   *
   * @param {object[]} texts Words in reading order.
   * @returns {object[]} Merged words.
   */
  mergeTextContent(texts) {
    const result = [];
    let currentGroup = null;
    const UNORDERED_LIST = ["•", "-", "◦", "▪", "▫"];

    for (const content of texts) {
      const { text, dimension, metadata, bbox } = content;

      // Drop empty tokens that are zero-width or pure end-of-line markers.
      if (text === "" && (dimension.width === 0 || metadata.hasEOL)) continue;
      // Drop size-less single spaces unless they carry an end-of-line flag.
      if (text === " " && metadata.font.size === 0 && !metadata.hasEOL) continue;

      if (!currentGroup) {
        currentGroup = { ...content };
        continue;
      }

      const prevMiddleY = (currentGroup.bbox.y0 + currentGroup.bbox.y1) / 2;
      const isWithinXRange =
        bbox.x0 <= currentGroup.bbox.x1 + currentGroup.metadata.font.size;
      const isWithinYRange = bbox.y0 <= prevMiddleY && prevMiddleY <= bbox.y1;
      const hasSameFontSize =
        Math.abs(metadata.font.size - currentGroup.metadata.font.size) < 0.01;
      const groupText = currentGroup.text.trim();
      const isLeadingGroupAnUnorderedList =
        isWithinYRange &&
        groupText.length === 1 &&
        UNORDERED_LIST.includes(groupText);

      const shouldMerge =
        isLeadingGroupAnUnorderedList ||
        (isWithinXRange &&
          isWithinYRange &&
          hasSameFontSize &&
          !currentGroup.metadata.hasEOL);

      if (shouldMerge) {
        // Words closer than one unit are glued together without a space.
        const separator = bbox.x0 - currentGroup.bbox.x1 < 1 ? "" : " ";
        currentGroup = {
          text: `${currentGroup.text}${separator}${text}`,
          dimension: {
            width: bbox.x1 - currentGroup.bbox.x0,
            height: Math.max(currentGroup.dimension.height, dimension.height),
          },
          bbox: {
            x0: currentGroup.bbox.x0,
            y0: Math.min(currentGroup.bbox.y0, bbox.y0),
            x1: bbox.x1,
            y1: Math.max(currentGroup.bbox.y1, bbox.y1),
          },
          metadata: {
            writing: "",
            direction: metadata.direction,
            font: {
              name: metadata.font.name,
              // After a bullet, the following text's size is adopted.
              size: isLeadingGroupAnUnorderedList
                ? metadata.font.size
                : currentGroup.metadata.font.size,
              family: "",
              style: "",
              weight: "",
            },
            hasEOL: metadata.hasEOL,
            pageNum: metadata.pageNum,
          },
        };
      } else {
        result.push(currentGroup);
        currentGroup = { ...content };
      }

      // An end-of-line token always closes the current group.
      if (metadata.hasEOL) {
        if (currentGroup) {
          result.push(currentGroup);
        }
        currentGroup = null;
      }
    }

    if (currentGroup) {
      result.push(currentGroup);
    }

    return result;
  }

  /**
   * Drops words without a font size and, when enabled through the options,
   * words in the header or footer band; then assigns sequential ids and
   * normalises text unless `options.raw` is set.
   *
   * @param {object[]} texts Words to filter.
   * @param {number} height Page or canvas height the percentage thresholds apply to.
   * @returns {object[]} Filtered words with `id` set to their new index.
   */
  filterTextContent(texts, height) {
    const HEADER_THRESHOLD = height * this.options.headerFromHeightPercentage;
    const FOOTER_THRESHOLD = height * this.options.footerFromHeightPercentage;

    return texts
      .filter((el) => {
        const hasFontSize = el.metadata.font.size !== 0;
        // Both bands are tested against the word's top edge (`y0`).
        const isAfterHeader = el.bbox.y0 > HEADER_THRESHOLD;
        const isBeforeFooter = el.bbox.y0 < FOOTER_THRESHOLD;
        return (
          hasFontSize &&
          (!this.options.excludeHeader || isAfterHeader) &&
          (!this.options.excludeFooter || isBeforeFooter)
        );
      })
      .map((el, id) => ({
        ...el,
        id,
        text: !this.options.raw ? this.normalizedText(el.text) : el.text,
      }));
  }

  /** Delegates to `getLinesFromTextsCommon`, passing this reader's `startIndex`. */
  getLinesFromTexts(pageTexts) {
    return this.getLinesFromTextsCommon(pageTexts, this.startIndex);
  }

  /** Delegates to `getLinesFromTextsInToonCommon`, passing this reader's `startIndex`. */
  getLinesFromTextsInToon(pageTexts) {
    return this.getLinesFromTextsInToonCommon(pageTexts, this.startIndex);
  }

  /**
   * Delegates to `getCompactLinesFromTextsCommon`.
   *
   * @param {Map<number, object>} pageTexts Per-page results.
   * @param {string} [algorithm="middleY"] Algorithm name forwarded unchanged.
   */
  getCompactLinesFromTexts(pageTexts, algorithm = "middleY") {
    return this.getCompactLinesFromTextsCommon(
      pageTexts,
      algorithm,
      this.startIndex,
    );
  }

  /**
   * Delegates to `isScannedCommon`.
   *
   * @param {Map<number, object>} pageTexts Per-page results.
   * @param {{wordsPerPage: number, textLength: number}} [options] Thresholds;
   *   default to the values in `CONSTANT`.
   */
  isScanned(
    pageTexts,
    options = {
      wordsPerPage: CONSTANT.WORDS_PER_PAGE_THRESHOLD,
      textLength: CONSTANT.TEXT_LENGTH_THRESHOLD,
    },
  ) {
    return this.isScannedCommon(pageTexts, options, this.startIndex);
  }

  /**
   * Delegates to `isPageScannedCommon`.
   *
   * @param {object} pageText Result for a single page.
   * @param {{wordsPerPage: number, textLength: number}} [options] Thresholds;
   *   default to the values in `CONSTANT`.
   */
  isPageScanned(
    pageText,
    options = {
      wordsPerPage: CONSTANT.WORDS_PER_PAGE_THRESHOLD,
      textLength: CONSTANT.TEXT_LENGTH_THRESHOLD,
    },
  ) {
    return this.isPageScannedCommon(pageText, options);
  }

  /**
   * Releases the pdf.js document.
   *
   * @param {object} pdf Opened pdf.js document.
   * @returns {Promise<void>}
   */
  async destroy(pdf) {
    await pdf.destroy();
  }
}