UNPKG

ppu-pdf

Version:

Easily extract text from digital PDF files, with coordinates and font sizes included, and optionally group text by lines or render scanned PDFs to canvas/PNG.

1 lines 6.02 kB
import "pdfjs-dist/build/pdf.worker.min.mjs";
import "./pdfjs-workaround";
import { createCanvas, GlobalFonts } from "@napi-rs/canvas";
import { existsSync, readFileSync } from "fs";
import * as pdfjs from "pdfjs-dist/legacy/build/pdf.mjs";
import { NodeCanvasFactory } from "./canvas-factory";
import { PdfReaderCommon } from "./pdf-reader-common";
import { CONSTANT, PDF_READER_DEFAULT_OPTIONS } from "./pdf.constant";

/**
 * PDF reader built on the pdf.js legacy build and `@napi-rs/canvas`.
 *
 * Pages are addressed starting from `startIndex` (1). Sorting, text
 * normalisation and the `*Common` helpers are inherited from
 * `PdfReaderCommon`.
 */
export class PdfReaderLegacy extends PdfReaderCommon {
  /** Effective options: `PDF_READER_DEFAULT_OPTIONS` overlaid with the caller's. */
  options;

  /** Number of the first page; used as the loop start and passed to the common helpers. */
  startIndex = 1;

  /**
   * @param {object} [options] Overrides merged on top of `PDF_READER_DEFAULT_OPTIONS`.
   * @throws {Error} When a configured font's `path` does not exist on disk.
   */
  constructor(options = {}) {
    super();
    this.options = { ...PDF_READER_DEFAULT_OPTIONS, ...options };

    // `?? []` tolerates a caller explicitly passing `fonts: undefined`/`null`.
    for (const font of this.options.fonts ?? []) {
      if (!existsSync(font.path)) {
        // Report the offending path itself; interpolating the font object
        // would only print "[object Object]".
        throw new Error(`Invalid font path: [${font.name}] ${font.path}`);
      }
      GlobalFonts.registerFromPath(font.path, font.name);
    }
  }

  /**
   * Loads a PDF document.
   *
   * @param {string|ArrayBuffer|ArrayLike<number>} filename A file path (read
   *   synchronously from disk) or the raw bytes of the document.
   * @returns {Promise<object>} The `promise` of the pdf.js loading task.
   */
  async open(filename) {
    const data =
      typeof filename === "string"
        ? new Uint8Array(readFileSync(filename))
        : new Uint8Array(filename);

    return pdfjs.getDocument({
      verbosity: +this.options.verbose,
      CanvasFactory: NodeCanvasFactory,
      data,
    }).promise;
  }

  /**
   * Renders every page of the document onto its own canvas.
   *
   * @param {object} doc Document as returned by {@link PdfReaderLegacy#open}.
   * @returns {Promise<Map<number, object>>} Map of page number to canvas.
   */
  async renderAll(doc) {
    const canvasMap = new Map();
    const numOfPages = doc.numPages;
    const renderPromises = [];

    // Pages are fetched one at a time; the renders themselves are awaited together.
    for (let i = this.startIndex; i <= numOfPages; i++) {
      const page = await doc.getPage(i);
      renderPromises.push(this.getCanvas(canvasMap, i, page));
    }

    await Promise.all(renderPromises);
    return canvasMap;
  }

  /**
   * Creates a canvas for one page, stores it in `canvasMap` and renders the
   * page onto it.
   *
   * @param {Map<number, object>} canvasMap Receives the canvas under `pageNum`.
   * @param {number} pageNum Key used in `canvasMap`.
   * @param {object} page pdf.js page.
   * @param {number} [normalizedWidth] Target width used to derive a scale;
   *   only consulted when `options.scale` is not greater than 1.
   * @returns {Promise<void>} The `promise` of the page render task.
   */
  async getCanvas(canvasMap, pageNum, page, normalizedWidth) {
    let viewport = page.getViewport({ scale: 1 });

    if (this.options.scale && this.options.scale > 1) {
      viewport = page.getViewport({ scale: this.options.scale });
    } else if (normalizedWidth) {
      // NOTE(review): flooring yields a scale of 0 whenever normalizedWidth is
      // smaller than the page's scale-1 width — confirm this is intended.
      const normalizedScale = Math.floor(normalizedWidth / viewport.width);
      viewport = page.getViewport({ scale: normalizedScale });
    }

    const width = Math.floor(viewport.width);
    const height = Math.floor(viewport.height);
    const canvas = createCanvas(width, height);
    const context = canvas.getContext("2d");
    const renderContext = { intent: "print", canvasContext: context, viewport };

    // The canvas is registered before rendering starts.
    canvasMap.set(pageNum, canvas);
    return page.render(renderContext).promise;
  }

  /**
   * Extracts the words of every page of the document.
   *
   * @param {object} pdf Document as returned by {@link PdfReaderLegacy#open}.
   * @returns {Promise<Map<number, {words: object[]}>>} Map of page number to words.
   */
  async getTexts(pdf) {
    const pages = new Map();
    const numOfPages = pdf.numPages;
    const getTextContentPromises = [];

    for (let i = this.startIndex; i <= numOfPages; i++) {
      const page = await pdf.getPage(i);
      getTextContentPromises.push(this.extractTexts(pages, i, page));
    }

    await Promise.all(getTextContentPromises);
    return pages;
  }

  /**
   * Extracts, sorts, optionally merges and filters the text of a single page,
   * then stores the result in `linesMap` as `{ words }`.
   *
   * @param {Map<number, {words: object[]}>} linesMap Receives the page result.
   * @param {number} pageNum Key used in `linesMap`.
   * @param {object} page pdf.js page.
   * @returns {Promise<void>}
   */
  async extractTexts(linesMap, pageNum, page) {
    const { height, transform } = page.getViewport({ scale: 1 });
    const pdfToken = await page.getTextContent();

    const textsMapped = this.mapTokenToPdfWord(pdfToken.items, transform, pageNum);
    const textsSorted = this.options.simpleSortAlgorithm
      ? this.sortTextContentSimple(textsMapped)
      : this.sortTextContent(textsMapped);
    const textsMerged = this.options.mergeCloseTextNeighbor
      ? this.mergeTextContent(textsSorted)
      : textsSorted;
    const textsFiltered = this.filterTextContent(textsMerged, height);

    linesMap.set(pageNum, { words: textsFiltered });
  }

  /**
   * Converts pdf.js text items into word objects carrying a bounding box,
   * dimensions and font metadata.
   *
   * @param {object[]} items Text items (`str`, `transform`, `width`, `height`,
   *   `dir`, `fontName`, `hasEOL` are read).
   * @param {number[]} transform Viewport transform applied to each item transform.
   * @param {number} pageNum Page number recorded in each word's metadata.
   * @returns {object[]} One word per item, in input order.
   */
  mapTokenToPdfWord(items, transform, pageNum) {
    const pdfWords = [];

    for (const token of items) {
      const [, , , , x, y] = pdfjs.Util.transform(transform, token.transform);

      // The ratio is NaN/Infinity when the token's own x translation is 0,
      // which would poison the bounding box. extractTexts supplies a scale-1
      // viewport transform, so fall back to a factor of 1 in that case.
      const rawScale = x / token.transform[4];
      const scale = Number.isFinite(rawScale) ? rawScale : 1;

      pdfWords.push({
        text: token.str,
        bbox: {
          x0: x,
          y0: y - token.height * scale,
          x1: x + token.width * scale,
          y1: y,
        },
        dimension: { width: token.width, height: token.height },
        metadata: {
          writing: "",
          direction: token.dir,
          font: {
            name: token.fontName,
            size: Number(token.height.toFixed(4)),
            family: "",
            style: "",
            weight: "",
          },
          hasEOL: token.hasEOL,
          pageNum,
        },
      });
    }

    return pdfWords;
  }

  /**
   * Merges neighbouring words into groups.
   *
   * A word joins the current group when it is horizontally within one font
   * size of the group's right edge, the group's vertical middle lies inside
   * the word's y-range, the font sizes match (tolerance 0.01) and the group
   * has not ended its line — or when the group is a lone bullet glyph on the
   * same row. A word flagged `hasEOL` always closes the current group.
   *
   * @param {object[]} texts Words in reading order.
   * @returns {object[]} Merged words.
   */
  mergeTextContent(texts) {
    const result = [];
    let currentGroup = null;
    const UNORDERED_LIST = ["•", "-", "◦", "▪", "▫"];

    for (const content of texts) {
      const { text, dimension, metadata, bbox } = content;

      // Skip empty tokens and zero-size space tokens.
      if (text === "" && (dimension.width === 0 || metadata.hasEOL)) continue;
      if (text === " " && metadata.font.size === 0 && !metadata.hasEOL) continue;

      if (!currentGroup) {
        currentGroup = { ...content };
        continue;
      }

      const prevMiddleY = (currentGroup.bbox.y0 + currentGroup.bbox.y1) / 2;
      const isWithinXRange =
        bbox.x0 <= currentGroup.bbox.x1 + currentGroup.metadata.font.size;
      const isWithinYRange = bbox.y0 <= prevMiddleY && prevMiddleY <= bbox.y1;
      const hasSameFontSize =
        Math.abs(metadata.font.size - currentGroup.metadata.font.size) < 0.01;

      const leadingText = currentGroup.text.trim();
      const isLeadingGroupAnUnorderedList =
        isWithinYRange &&
        leadingText.length === 1 &&
        UNORDERED_LIST.includes(leadingText);

      const shouldMerge =
        isLeadingGroupAnUnorderedList ||
        (isWithinXRange &&
          isWithinYRange &&
          hasSameFontSize &&
          !currentGroup.metadata.hasEOL);

      if (shouldMerge) {
        // Insert a space only when there is a visible gap (>= 1 unit).
        const separator = bbox.x0 - currentGroup.bbox.x1 < 1 ? "" : " ";

        currentGroup = {
          text: currentGroup.text + separator + text,
          dimension: {
            width: bbox.x1 - currentGroup.bbox.x0,
            height: Math.max(currentGroup.dimension.height, dimension.height),
          },
          bbox: {
            x0: currentGroup.bbox.x0,
            y0: Math.min(currentGroup.bbox.y0, bbox.y0),
            x1: bbox.x1,
            y1: Math.max(currentGroup.bbox.y1, bbox.y1),
          },
          metadata: {
            writing: "",
            direction: metadata.direction,
            font: {
              name: metadata.font.name,
              // After a bullet, the following text determines the font size.
              size: isLeadingGroupAnUnorderedList
                ? metadata.font.size
                : currentGroup.metadata.font.size,
              family: "",
              style: "",
              weight: "",
            },
            hasEOL: metadata.hasEOL,
            pageNum: metadata.pageNum,
          },
        };
      } else {
        result.push(currentGroup);
        currentGroup = { ...content };
      }

      // End of line: flush whatever group is open.
      if (metadata.hasEOL) {
        if (currentGroup) {
          result.push(currentGroup);
        }
        currentGroup = null;
      }
    }

    if (currentGroup) {
      result.push(currentGroup);
    }

    return result;
  }

  /**
   * Drops words with a font size of 0 and, when `excludeHeader` /
   * `excludeFooter` are enabled, words whose `bbox.y0` falls outside the
   * header/footer thresholds. Surviving words get a sequential `id` and,
   * unless `options.raw` is set, normalised text.
   *
   * @param {object[]} texts Words of one page.
   * @param {number} height Page height the percentage options are multiplied by.
   * @returns {object[]} Filtered words.
   */
  filterTextContent(texts, height) {
    const HEADER_THRESHOLD = height * this.options.headerFromHeightPercentage;
    const FOOTER_THRESHOLD = height * this.options.footerFromHeightPercentage;

    return texts
      .filter((el) => {
        const hasFontSize = el.metadata.font.size !== 0;
        const isAfterHeader = el.bbox.y0 > HEADER_THRESHOLD;
        const isBeforeFooter = el.bbox.y0 < FOOTER_THRESHOLD;

        return (
          hasFontSize &&
          (!this.options.excludeHeader || isAfterHeader) &&
          (!this.options.excludeFooter || isBeforeFooter)
        );
      })
      .map((el, id) => ({
        ...el,
        id,
        text: this.options.raw ? el.text : this.normalizedText(el.text),
      }));
  }

  /**
   * Delegates to `getLinesFromTextsCommon` with this reader's `startIndex`.
   *
   * @param {Map<number, {words: object[]}>} pageTexts Result of `getTexts`.
   */
  getLinesFromTexts(pageTexts) {
    return this.getLinesFromTextsCommon(pageTexts, this.startIndex);
  }

  /**
   * Delegates to `getCompactLinesFromTextsCommon` with this reader's `startIndex`.
   *
   * @param {Map<number, {words: object[]}>} pageTexts Result of `getTexts`.
   * @param {string} [algorithm="middleY"] Forwarded unchanged to the common helper.
   */
  getCompactLinesFromTexts(pageTexts, algorithm = "middleY") {
    return this.getCompactLinesFromTextsCommon(pageTexts, algorithm, this.startIndex);
  }

  /**
   * Delegates to `isScannedCommon` with this reader's `startIndex`.
   *
   * @param {Map<number, {words: object[]}>} pageTexts Result of `getTexts`.
   * @param {{wordsPerPage: number, textLength: number}} [options] Thresholds;
   *   default to the values in `CONSTANT`.
   */
  isScanned(
    pageTexts,
    options = {
      wordsPerPage: CONSTANT.WORDS_PER_PAGE_THRESHOLD,
      textLength: CONSTANT.TEXT_LENGTH_THRESHOLD,
    },
  ) {
    return this.isScannedCommon(pageTexts, options, this.startIndex);
  }

  /**
   * Delegates to `dumpCanvasMapCommon` with this reader's `startIndex`.
   *
   * @param {Map<number, object>} canvasMap Result of `renderAll`.
   * @param {string} filename Forwarded unchanged to the common helper.
   * @param {string} [foldername="out"] Forwarded unchanged to the common helper.
   * @returns {Promise<void>}
   */
  async dumpCanvasMap(canvasMap, filename, foldername = "out") {
    // Awaited so that, if the helper is asynchronous, callers actually wait for
    // it and observe its errors; awaiting a non-promise is harmless.
    await this.dumpCanvasMapCommon(canvasMap, filename, foldername, this.startIndex);
  }

  /**
   * Releases the resources held by a loaded document.
   *
   * @param {object} pdf Document as returned by {@link PdfReaderLegacy#open}.
   * @returns {Promise<void>}
   */
  async destroy(pdf) {
    await pdf.destroy();
  }
}