ppu-pdf
Version:
Easily extract text from digital PDF files, with coordinates and font sizes included, and optionally group text by lines or render scanned PDFs to canvas/PNG.
1 line • 5.99 kB
JavaScript
import "./mupdf-workaround";
import { createCanvas, GlobalFonts, ImageData } from "@napi-rs/canvas";
import { existsSync, readFileSync } from "fs";
import { PdfReaderCommon } from "./pdf-reader-common";
import { CONSTANT, PDF_READER_DEFAULT_OPTIONS } from "./pdf.constant";

// mupdf is loaded dynamically (top-level await) after the workaround module above has run.
const mupdf = await import("mupdf/mupdfjs");

/**
 * PDF reader backed by mupdf for parsing/rendering and @napi-rs/canvas for
 * producing canvases. Sorting, line grouping, text normalisation and dumping
 * helpers are inherited from PdfReaderCommon (not defined in this file).
 */
export class PdfReader extends PdfReaderCommon {
  /** Effective options: PDF_READER_DEFAULT_OPTIONS overridden by the constructor argument. */
  options;

  /** Index of the first page; pages are addressed from 0 in this reader. */
  startIndex = 0;

  /**
   * @param {object} [options] - Overrides merged on top of PDF_READER_DEFAULT_OPTIONS.
   *   `options.fonts` entries are `{ path, name }` and are registered globally.
   * @throws {Error} When a configured font path does not exist on disk.
   */
  constructor(options = {}) {
    super();
    this.options = { ...PDF_READER_DEFAULT_OPTIONS, ...options };

    // `?? []` tolerates a caller explicitly passing `fonts: undefined`.
    for (const font of this.options.fonts ?? []) {
      if (!existsSync(font.path)) {
        throw new Error(`Invalid font path: [${font.name}] ${font.path}`);
      }
      GlobalFonts.registerFromPath(font.path, font.name);
    }
  }

  /**
   * Opens a PDF document.
   *
   * @param {string|ArrayBuffer|ArrayLike<number>} filename - A file path (read
   *   synchronously from disk) or the raw PDF bytes.
   * @returns The mupdf document returned by `PDFDocument.openDocument`.
   */
  open(filename) {
    const data =
      typeof filename === "string"
        ? new Uint8Array(readFileSync(filename))
        : new Uint8Array(filename);
    return mupdf.PDFDocument.openDocument(data, "application/pdf");
  }

  /**
   * Renders every page of the document to a canvas.
   *
   * @param doc - A document returned by {@link PdfReader#open}.
   * @param {number} [dpi=72] - Target resolution; 72 renders at the page's native size.
   * @returns {Promise<Map<number, object>>} Map from page index to rendered canvas.
   */
  async renderAll(doc, dpi = 72) {
    const canvasMap = new Map();
    const numOfPages = doc.countPages();

    const renderPromises = Array.from({ length: numOfPages }, (_, pageIndex) => {
      const page = new mupdf.PDFPage(doc, pageIndex);
      return this.getCanvas(canvasMap, pageIndex, page, dpi);
    });

    await Promise.all(renderPromises);
    return canvasMap;
  }

  /**
   * Rasterises a single page and stores the resulting canvas in `canvasMap`.
   * The page is destroyed once it has been drawn.
   *
   * @param {Map<number, object>} canvasMap - Receives the canvas under `pageNum`.
   * @param {number} pageNum - Key used in `canvasMap`.
   * @param page - A mupdf PDFPage; destroyed by this method.
   * @param {number} dpi - Target resolution (scale factor is `dpi / 72`).
   * @returns {Promise<void>}
   */
  async getCanvas(canvasMap, pageNum, page, dpi) {
    const pageDimension = page.getBounds();
    const scaleFactor = mupdf.Matrix.scale(dpi / 72, dpi / 72);
    const bbox = mupdf.Rect.transform(pageDimension, scaleFactor);

    // RGB pixmap without alpha, pre-filled with white.
    const pixmap = new mupdf.Pixmap(mupdf.ColorSpace.DeviceRGB, bbox, false);
    pixmap.clear(255);

    const device = new mupdf.DrawDevice(scaleFactor, pixmap);
    page.run(device, mupdf.Matrix.identity);
    device.close();
    page.destroy();

    const width = pixmap.getWidth();
    const height = pixmap.getHeight();

    // Copy the pixels out of the pixmap, then release it: everything below
    // works on the JS-side copy only.
    const rgb = new Uint8ClampedArray(pixmap.getPixels());
    pixmap.destroy();

    // Expand 3-byte RGB into the 4-byte RGBA layout ImageData expects (opaque alpha).
    const rgba = new Uint8ClampedArray(width * height * 4);
    for (let src = 0, dst = 0; src < rgb.length; src += 3, dst += 4) {
      rgba[dst] = rgb[src];
      rgba[dst + 1] = rgb[src + 1];
      rgba[dst + 2] = rgb[src + 2];
      rgba[dst + 3] = 255;
    }

    const imageData = new ImageData(rgba, width, height);
    imageData.colorSpace = "srgb";

    // Size the canvas to the rendered pixel dimensions.
    const canvas = createCanvas(imageData.width, imageData.height);
    const context = canvas.getContext("2d");
    context.putImageData(imageData, 0, 0);

    canvasMap.set(pageNum, canvas);
  }

  /**
   * Extracts the words of every page, starting at `startIndex`.
   *
   * @param doc - A document returned by {@link PdfReader#open}.
   * @returns {Promise<Map<number, {words: object[]}>>} Map from page index to its words.
   */
  async getTexts(doc) {
    const pages = new Map();
    const numOfPages = doc.countPages();
    const getTextContentPromises = [];

    for (let pageIndex = this.startIndex; pageIndex < numOfPages; pageIndex++) {
      const page = new mupdf.PDFPage(doc, pageIndex);
      getTextContentPromises.push(this.extractTexts(pages, pageIndex, page));
    }

    await Promise.all(getTextContentPromises);
    return pages;
  }

  /**
   * Extracts, sorts, optionally merges and filters the text of one page and
   * stores it in `linesMap` as `{ words }`. The page is destroyed afterwards.
   *
   * @param {Map<number, {words: object[]}>} linesMap - Receives the result under `pageNum`.
   * @param {number} pageNum - Key used in `linesMap` and recorded in each word's metadata.
   * @param page - A mupdf PDFPage; destroyed by this method.
   * @returns {Promise<void>}
   */
  async extractTexts(linesMap, pageNum, page) {
    // Fourth entry of the bounds is used as the page height for header/footer thresholds.
    const [, , , height] = page.getBounds();
    const docStructure = JSON.parse(
      page.toStructuredText("ignore-actualtext").asJSON()
    );
    page.destroy();

    const textsMapped = this.mapStructureToPdfWord(docStructure, pageNum);
    const textsSorted = this.options.simpleSortAlgorithm
      ? this.sortTextContentSimple(textsMapped)
      : this.sortTextContent(textsMapped);
    const textsMerged = this.options.mergeCloseTextNeighbor
      ? this.mergeTextContent(textsSorted)
      : textsSorted;
    const textsFiltered = this.filterTextContent(textsMerged, height);

    linesMap.set(pageNum, { words: textsFiltered });
  }

  /**
   * Converts the parsed structured-text JSON into a flat list of word objects.
   *
   * @param {{blocks: Array<{lines?: object[]}>}} structure - Parsed structured-text JSON.
   * @param {number} pageNum - Recorded in each word's metadata.
   * @returns {object[]} Words with `text`, `bbox`, `dimension` and `metadata`.
   */
  mapStructureToPdfWord(structure, pageNum) {
    // Blocks without a `lines` array contribute nothing.
    const rawTexts = structure.blocks.flatMap((block) => block.lines ?? []);

    return rawTexts.map((item) => {
      const { x, y, w, h } = item.bbox;
      return {
        text: item.text,
        bbox: { x0: x, y0: y, x1: x + w, y1: y + h },
        dimension: { width: w, height: h },
        metadata: {
          writing: item.wmode === 0 ? "horizontal" : "vertical",
          direction: "",
          font: item.font,
          hasEOL: false,
          pageNum,
        },
      };
    });
  }

  /**
   * Merges neighbouring text items that sit on the same visual row, are
   * horizontally close and share a font size. A lone bullet character is always
   * merged with the text that follows it on the same row.
   *
   * @param {object[]} texts - Words in reading order.
   * @returns {object[]} Merged words.
   */
  mergeTextContent(texts) {
    const UNORDERED_LIST = ["•", "-", "◦", "▪", "▫"];
    const result = [];
    let currentGroup = null;

    for (const content of texts) {
      const { text, dimension, metadata, bbox } = content;

      // Skip zero-width empties and zero-sized spaces; they carry no content.
      if (text === "" && dimension.width === 0) continue;
      if (text === " " && metadata.font.size === 0) continue;

      if (!currentGroup) {
        currentGroup = { ...content };
        continue;
      }

      const prevMiddleY = (currentGroup.bbox.y0 + currentGroup.bbox.y1) / 2;
      // Allow a horizontal gap of up to one font size between neighbours.
      const isWithinXRange =
        bbox.x0 <= currentGroup.bbox.x1 + currentGroup.metadata.font.size;
      // Same row when the group's vertical midpoint falls inside this item's box.
      const isWithinYRange = bbox.y0 <= prevMiddleY && prevMiddleY <= bbox.y1;
      const hasSameFontSize =
        Math.abs(metadata.font.size - currentGroup.metadata.font.size) < 0.01;

      const groupText = currentGroup.text.trim();
      const isLeadingGroupAnUnorderedList =
        isWithinYRange &&
        groupText.length === 1 &&
        UNORDERED_LIST.includes(groupText);

      const shouldMerge =
        isLeadingGroupAnUnorderedList ||
        (isWithinXRange && isWithinYRange && hasSameFontSize);

      if (!shouldMerge) {
        result.push(currentGroup);
        currentGroup = { ...content };
        continue;
      }

      // Insert a space only when there is a visible gap (>= 1 unit) between the boxes.
      const separator = bbox.x0 - currentGroup.bbox.x1 < 1 ? "" : " ";
      currentGroup = {
        text: currentGroup.text + separator + text,
        dimension: {
          width: bbox.x1 - currentGroup.bbox.x0,
          height: Math.max(currentGroup.dimension.height, dimension.height),
        },
        bbox: {
          x0: currentGroup.bbox.x0,
          y0: Math.min(currentGroup.bbox.y0, bbox.y0),
          x1: bbox.x1,
          y1: Math.max(currentGroup.bbox.y1, bbox.y1),
        },
        metadata: {
          writing: metadata.writing,
          direction: "",
          // After a bullet, the font of the following text describes the line.
          font: isLeadingGroupAnUnorderedList
            ? metadata.font
            : currentGroup.metadata.font,
          hasEOL: false,
          pageNum: metadata.pageNum,
        },
      };
    }

    if (currentGroup) {
      result.push(currentGroup);
    }
    return result;
  }

  /**
   * Drops blank/zero-sized words and, when enabled in the options, words in the
   * header or footer band; then assigns sequential ids and normalises the text
   * unless `options.raw` is set.
   *
   * @param {object[]} texts - Words to filter.
   * @param {number} height - Page height used to compute the header/footer thresholds.
   * @returns {object[]} Filtered words with an `id` and (optionally normalised) `text`.
   */
  filterTextContent(texts, height) {
    const headerThreshold = height * this.options.headerFromHeightPercentage;
    const footerThreshold = height * this.options.footerFromHeightPercentage;

    return texts
      .filter((el) => {
        const hasFontSize = el.metadata.font.size !== 0;
        const notEmptySpace = el.text.trim() !== "";
        const isAfterHeader = el.bbox.y0 > headerThreshold;
        const isBeforeFooter = el.bbox.y0 < footerThreshold;

        return (
          hasFontSize &&
          notEmptySpace &&
          (!this.options.excludeHeader || isAfterHeader) &&
          (!this.options.excludeFooter || isBeforeFooter)
        );
      })
      .map((el, id) => ({
        ...el,
        id,
        text: this.options.raw ? el.text : this.normalizedText(el.text),
      }));
  }

  /**
   * Delegates to the inherited `getLinesFromTextsCommon` with this reader's start index.
   *
   * @param pageTexts - Page map as produced by {@link PdfReader#getTexts}.
   */
  getLinesFromTexts(pageTexts) {
    return this.getLinesFromTextsCommon(pageTexts, this.startIndex);
  }

  /**
   * Delegates to the inherited `getCompactLinesFromTextsCommon` with this reader's start index.
   *
   * @param pageTexts - Page map as produced by {@link PdfReader#getTexts}.
   * @param {string} [algorithm="middleY"] - Grouping algorithm passed through to the helper.
   */
  getCompactLinesFromTexts(pageTexts, algorithm = "middleY") {
    return this.getCompactLinesFromTextsCommon(
      pageTexts,
      algorithm,
      this.startIndex
    );
  }

  /**
   * Delegates to the inherited `dumpCanvasMapCommon` with this reader's start index.
   *
   * @param {Map<number, object>} canvasMap - Canvases as produced by {@link PdfReader#renderAll}.
   * @param {string} filename - Passed through to the helper.
   * @param {string} [foldername="out"] - Passed through to the helper.
   * @returns {Promise<void>}
   */
  async dumpCanvasMap(canvasMap, filename, foldername = "out") {
    // Awaited so that, if the helper is asynchronous (it is not visible here),
    // callers observe its completion and any error instead of a floating promise.
    await this.dumpCanvasMapCommon(
      canvasMap,
      filename,
      foldername,
      this.startIndex
    );
  }

  /**
   * Delegates to the inherited `isScannedCommon` with this reader's start index.
   *
   * @param pageTexts - Page map as produced by {@link PdfReader#getTexts}.
   * @param {{wordsPerPage: number, textLength: number}} [options] - Thresholds;
   *   default to the values in CONSTANT.
   */
  isScanned(
    pageTexts,
    options = {
      wordsPerPage: CONSTANT.WORDS_PER_PAGE_THRESHOLD,
      textLength: CONSTANT.TEXT_LENGTH_THRESHOLD,
    }
  ) {
    return this.isScannedCommon(pageTexts, options, this.startIndex);
  }

  /**
   * Releases a document opened with {@link PdfReader#open}.
   *
   * @param doc - The document to destroy.
   */
  destroy(doc) {
    return doc.destroy();
  }

  /**
   * Releases a page object.
   *
   * @param page - The page to destroy.
   */
  destroyPage(page) {
    return page.destroy();
  }
}