ppu-pdf
Version:
Easily extract text from digital PDF files, with coordinates and font size included, and optionally group text by lines or render scanned PDFs to canvas/PNG.
4 lines • 9.82 kB
JavaScript
import "./mupdf-workaround.js";
import { createCanvas, GlobalFonts, ImageData } from "@napi-rs/canvas";
import { existsSync, readFileSync } from "fs";
import { PdfReaderCommon } from "./pdf-reader-common.js";
import { CONSTANT, PDF_READER_DEFAULT_OPTIONS } from "./pdf.constant.js";

// Loaded dynamically after the workaround module above has been evaluated.
const mupdf = await import("mupdf");

/**
 * PDF reader backed by mupdf (text extraction, rendering) and
 * @napi-rs/canvas (raster output). Sorting, line grouping, normalisation and
 * dumping are delegated to helpers inherited from PdfReaderCommon.
 */
export class PdfReader extends PdfReaderCommon {
  options;
  startIndex = 0;

  /**
   * @param {object} [options] - Overrides merged on top of
   *   PDF_READER_DEFAULT_OPTIONS. Every entry of `options.fonts`
   *   (`{ path, name }`) is registered with GlobalFonts.
   * @throws {Error} When a configured font path does not exist on disk.
   */
  constructor(options = {}) {
    super();
    this.options = { ...PDF_READER_DEFAULT_OPTIONS, ...options };

    for (const font of this.options.fonts ?? []) {
      if (!existsSync(font.path)) {
        // Report the path itself; interpolating the object prints "[object Object]".
        throw new Error(`Invalid font path: [${font.name}] ${font.path}`);
      }
      GlobalFonts.registerFromPath(font.path, font.name);
    }
  }

  /**
   * Opens a PDF document.
   *
   * @param {string|ArrayBuffer|ArrayLike<number>} filename - A file path
   *   (read synchronously) or the raw bytes of the document.
   * @returns The value returned by mupdf.PDFDocument.openDocument.
   */
  open(filename) {
    const data =
      typeof filename === "string"
        ? new Uint8Array(readFileSync(filename))
        : new Uint8Array(filename);
    return mupdf.PDFDocument.openDocument(data, "application/pdf");
  }

  /**
   * Renders every page of `doc` to a canvas.
   *
   * @param doc - Document as returned by `open`.
   * @param {number} [dpi=72] - Render resolution; 72 yields a scale of 1.
   * @returns {Promise<Map<number, object>>} Canvas per zero-based page index.
   */
  async renderAll(doc, dpi = 72) {
    const canvasMap = new Map();
    const numOfPages = doc.countPages();
    const renderPromises = Array.from({ length: numOfPages }, (_, i) =>
      this.getCanvas(canvasMap, i, doc.loadPage(i), dpi),
    );
    await Promise.all(renderPromises);
    return canvasMap;
  }

  /**
   * Runs OCR on every canvas in `canvasMap`.
   *
   * @param paddleOcrService - Object exposing `isInitialized()`,
   *   `initialize()` and `recognize(canvas)`; it is initialised on demand.
   * @param {Map<number, object>} canvasMap - Canvases keyed by page index.
   * @returns {Promise<Map<number, object>>} Page result per page index.
   */
  async getTextsScanned(paddleOcrService, canvasMap) {
    if (!paddleOcrService.isInitialized()) {
      await paddleOcrService.initialize();
    }

    const pages = new Map();
    const numOfPages = canvasMap.size;
    const ocrPromises = [];
    for (let i = this.startIndex; i < numOfPages; i++) {
      const canvas = canvasMap.get(i);
      if (canvas) {
        ocrPromises.push(this.extractOcrTexts(pages, i, canvas, paddleOcrService));
      }
    }
    await Promise.all(ocrPromises);
    return pages;
  }

  /**
   * Rasterises one page at `dpi` and stores the resulting canvas in
   * `canvasMap` under `pageNum`. The page is destroyed after drawing.
   *
   * @param {Map<number, object>} canvasMap - Receives the rendered canvas.
   * @param {number} pageNum - Key to store the canvas under.
   * @param page - mupdf page; must not be used by the caller afterwards.
   * @param {number} dpi - Render resolution (72 = scale 1).
   */
  async getCanvas(canvasMap, pageNum, page, dpi) {
    const pageDimension = page.getBounds();
    const scaleFactor = mupdf.Matrix.scale(dpi / 72, dpi / 72);
    const bbox = mupdf.Rect.transform(pageDimension, scaleFactor);

    // RGB pixmap without alpha, cleared to white before drawing.
    const pixmap = new mupdf.Pixmap(mupdf.ColorSpace.DeviceRGB, bbox, false);
    pixmap.clear(255);

    const device = new mupdf.DrawDevice(scaleFactor, pixmap);
    page.run(device, mupdf.Matrix.identity);
    device.close();
    page.destroy();

    const width = pixmap.getWidth();
    const height = pixmap.getHeight();

    // ImageData needs RGBA: expand the 3-byte pixels and force opaque alpha.
    const rgb = new Uint8ClampedArray(pixmap.getPixels());
    const rgba = new Uint8ClampedArray(width * height * 4);
    for (let src = 0, dst = 0; src < rgb.length; src += 3, dst += 4) {
      rgba[dst] = rgb[src];
      rgba[dst + 1] = rgb[src + 1];
      rgba[dst + 2] = rgb[src + 2];
      rgba[dst + 3] = 255;
    }

    const imageData = new ImageData(rgba, width, height);
    imageData.colorSpace = "srgb";

    // Allocate the canvas at the rendered pixel size directly instead of
    // creating it at the unscaled page size and resizing afterwards.
    const canvas = createCanvas(imageData.width, imageData.height);
    const context = canvas.getContext("2d");
    context.putImageData(imageData, 0, 0);
    canvasMap.set(pageNum, canvas);
  }

  /**
   * Extracts the embedded (digital) text of every page from `startIndex` on.
   *
   * @param doc - Document as returned by `open`.
   * @returns {Promise<Map<number, object>>} Page result per page index.
   */
  async getTexts(doc) {
    const pages = new Map();
    const numOfPages = doc.countPages();
    const getTextContentPromises = [];
    for (let i = this.startIndex; i < numOfPages; i++) {
      getTextContentPromises.push(this.extractTexts(pages, i, doc.loadPage(i)));
    }
    await Promise.all(getTextContentPromises);
    return pages;
  }

  /**
   * Extracts, sorts, optionally de-duplicates/merges and filters the text of
   * one page, then stores `{ words, fullText, confidence, toon }` in
   * `linesMap`. The page is always destroyed, even when extraction fails.
   *
   * @param {Map<number, object>} linesMap - Receives the page result.
   * @param {number} pageNum - Key to store the result under.
   * @param page - mupdf page; must not be used by the caller afterwards.
   */
  async extractTexts(linesMap, pageNum, page) {
    let height;
    let docStructure;
    try {
      [, , , height] = page.getBounds();
      docStructure = JSON.parse(
        page.toStructuredText("ignore-actualtext,collect-styles").asJSON(),
      );
    } finally {
      page.destroy();
    }

    const textsMapped = this.mapStructureToPdfWord(docStructure, pageNum);
    let textsSorted = this.options.simpleSortAlgorithm
      ? this.sortTextContentSimple(textsMapped)
      : this.sortTextContent(textsMapped);
    if (!this.options.raw) {
      textsSorted = this.removeFakeBold(textsSorted);
    }
    const textsMerged = this.options.mergeCloseTextNeighbor
      ? this.mergeTextContent(textsSorted)
      : textsSorted;
    const textsFiltered = this.filterTextContent(textsMerged, height);
    const fullText = textsFiltered.map((word) => word.text).join(" ");

    linesMap.set(pageNum, {
      words: textsFiltered,
      fullText,
      // Embedded text is taken as-is, so confidence is fixed at 1.
      confidence: 1,
      toon: this.getToonWords(textsFiltered, this.options.enableToon),
    });
  }

  /**
   * OCR counterpart of `extractTexts`. On any failure an empty page result
   * with confidence 0 is stored instead of rejecting; the error is logged
   * only when `options.verbose` is set.
   *
   * @param {Map<number, object>} linesMap - Receives the page result.
   * @param {number} pageNum - Key to store the result under.
   * @param canvas - Rendered page; its `height` drives header/footer filtering.
   * @param paddleOcrService - Object exposing `recognize(canvas)`.
   */
  async extractOcrTexts(linesMap, pageNum, canvas, paddleOcrService) {
    try {
      const ocrResult = await paddleOcrService.recognize(canvas);
      const pdfWords = this.convertOcrToPdfWords(ocrResult, pageNum);
      let textsSorted = this.options.simpleSortAlgorithm
        ? this.sortTextContentSimple(pdfWords)
        : this.sortTextContent(pdfWords);
      if (!this.options.raw) {
        textsSorted = this.removeFakeBold(textsSorted);
      }
      const textsMerged = this.options.mergeCloseTextNeighbor
        ? this.mergeTextContent(textsSorted)
        : textsSorted;
      const textsFiltered = this.filterTextContent(textsMerged, canvas.height);
      const fullText = textsFiltered.map((word) => word.text).join(" ");

      linesMap.set(pageNum, {
        words: textsFiltered,
        fullText,
        confidence: ocrResult.confidence,
        toon: this.getToonWords(textsFiltered, this.options.enableToon),
      });
    } catch (error) {
      if (this.options.verbose) {
        console.warn(`OCR failed for page ${pageNum}:`, error);
      }
      // Deliberate best effort: one failed page must not abort the others.
      linesMap.set(pageNum, { words: [], fullText: "", confidence: 0, toon: "" });
    }
  }

  /**
   * Converts an OCR result (`{ lines: Array<Array<{ text, box }>> }`) into
   * the word shape used by the rest of the pipeline. The box height doubles
   * as the font size because OCR provides no font information.
   *
   * @param ocrResult - OCR output; anything without a `lines` array yields [].
   * @param {number} pageNum - Stored in each word's metadata.
   * @returns {object[]} Flat list of words.
   */
  convertOcrToPdfWords(ocrResult, pageNum) {
    if (!ocrResult?.lines || !Array.isArray(ocrResult.lines)) {
      return [];
    }

    return ocrResult.lines.flatMap((line) => {
      if (!Array.isArray(line)) return [];
      return line.map((recognition) => {
        const { x, y, width, height } = recognition.box;
        return {
          text: recognition.text,
          bbox: {
            x0: Math.round(x),
            y0: Math.round(y),
            x1: Math.round(x + width),
            y1: Math.round(y + height),
          },
          dimension: { width: Math.round(width), height: Math.round(height) },
          metadata: {
            writing: "",
            direction: "",
            font: { name: "", size: height, family: "", weight: "", style: "" },
            hasEOL: false,
            pageNum,
          },
        };
      });
    });
  }

  /**
   * Flattens the parsed structured-text JSON (`blocks[].lines[]`) into words.
   *
   * @param structure - Parsed output of `toStructuredText(...).asJSON()`.
   * @param {number} pageNum - Stored in each word's metadata.
   * @returns {object[]} One word per structured-text line.
   */
  mapStructureToPdfWord(structure, pageNum) {
    return structure.blocks
      .map((block) => block.lines)
      .flat()
      .map((item) => {
        const { x, y, w, h } = item.bbox;
        return {
          text: item.text,
          bbox: {
            x0: Math.round(x),
            y0: Math.round(y),
            x1: Math.round(x + w),
            y1: Math.round(y + h),
          },
          dimension: { width: Math.round(w), height: Math.round(h) },
          metadata: {
            writing: item.wmode === 0 ? "horizontal" : "vertical",
            direction: "",
            font: item.font,
            hasEOL: false,
            pageNum,
          },
        };
      });
  }

  /**
   * Merges consecutive words that sit on the same visual line, are
   * horizontally close (within one font size) and share a font size. A lone
   * bullet glyph is always merged with the word following it on the line.
   *
   * @param {object[]} texts - Words in reading order.
   * @returns {object[]} Words after merging.
   */
  mergeTextContent(texts) {
    const UNORDERED_LIST = ["•", "-", "◦", "▪", "▫"];
    const result = [];
    let currentGroup = null;

    for (const content of texts) {
      const { text, dimension, metadata, bbox } = content;

      // Skip zero-width empties and zero-sized spaces: they carry no content.
      if (text === "" && dimension.width === 0) continue;
      if (text === " " && metadata.font.size === 0) continue;

      if (!currentGroup) {
        currentGroup = { ...content };
        continue;
      }

      const prevMiddleY = (currentGroup.bbox.y0 + currentGroup.bbox.y1) / 2;
      const isWithinXRange =
        bbox.x0 <= currentGroup.bbox.x1 + currentGroup.metadata.font.size;
      const isWithinYRange = bbox.y0 <= prevMiddleY && prevMiddleY <= bbox.y1;
      const hasSameFontSize =
        Math.abs(metadata.font.size - currentGroup.metadata.font.size) < 0.01;
      const leadingText = currentGroup.text.trim();
      const isLeadingGroupAnUnorderedList =
        isWithinYRange &&
        leadingText.length === 1 &&
        UNORDERED_LIST.includes(leadingText);

      if (
        isLeadingGroupAnUnorderedList ||
        (isWithinXRange && isWithinYRange && hasSameFontSize)
      ) {
        // Only insert a space when there is a visible gap between the boxes.
        const separator = bbox.x0 - currentGroup.bbox.x1 < 1 ? "" : " ";
        currentGroup = {
          text: currentGroup.text + separator + text,
          dimension: {
            width: bbox.x1 - currentGroup.bbox.x0,
            height: Math.max(currentGroup.dimension.height, dimension.height),
          },
          bbox: {
            x0: currentGroup.bbox.x0,
            y0: Math.min(currentGroup.bbox.y0, bbox.y0),
            x1: bbox.x1,
            y1: Math.max(currentGroup.bbox.y1, bbox.y1),
          },
          metadata: {
            writing: metadata.writing,
            direction: "",
            // A bullet's own font is irrelevant; adopt the text's font.
            font: isLeadingGroupAnUnorderedList
              ? metadata.font
              : currentGroup.metadata.font,
            hasEOL: false,
            pageNum: metadata.pageNum,
          },
        };
      } else {
        result.push(currentGroup);
        currentGroup = { ...content };
      }
    }

    if (currentGroup) {
      result.push(currentGroup);
    }
    return result;
  }

  /**
   * Drops words with zero font size or blank text and, when enabled via
   * `excludeHeader` / `excludeFooter`, words outside the header/footer
   * thresholds. Survivors get a sequential `id` and, unless `options.raw`
   * is set, text passed through the inherited `normalizedText`.
   *
   * @param {object[]} texts - Words to filter.
   * @param {number} height - Page (or canvas) height the percentages apply to.
   * @returns {object[]} Filtered words.
   */
  filterTextContent(texts, height) {
    const HEADER_THRESHOLD = height * this.options.headerFromHeightPercentage;
    const FOOTER_THRESHOLD = height * this.options.footerFromHeightPercentage;

    return texts
      .filter((el) => {
        const hasFontSize = el.metadata.font.size !== 0;
        const notEmptySpace = el.text.trim() !== "";
        const isAfterHeader = el.bbox.y0 > HEADER_THRESHOLD;
        const isBeforeFooter = el.bbox.y0 < FOOTER_THRESHOLD;
        return (
          hasFontSize &&
          notEmptySpace &&
          (!this.options.excludeHeader || isAfterHeader) &&
          (!this.options.excludeFooter || isBeforeFooter)
        );
      })
      .map((el, id) => ({
        ...el,
        id,
        text: !this.options.raw ? this.normalizedText(el.text) : el.text,
      }));
  }

  /** Delegates to `getLinesFromTextsCommon` with this reader's `startIndex`. */
  getLinesFromTexts(pageTexts) {
    return this.getLinesFromTextsCommon(pageTexts, this.startIndex);
  }

  /** Delegates to `getLinesFromTextsInToonCommon` with this reader's `startIndex`. */
  getLinesFromTextsInToon(pageTexts) {
    return this.getLinesFromTextsInToonCommon(pageTexts, this.startIndex);
  }

  /**
   * Delegates to `getCompactLinesFromTextsCommon`.
   *
   * @param pageTexts - Page results as produced by `getTexts`.
   * @param {string} [algorithm="middleY"] - Forwarded unchanged.
   */
  getCompactLinesFromTexts(pageTexts, algorithm = "middleY") {
    return this.getCompactLinesFromTextsCommon(pageTexts, algorithm, this.startIndex);
  }

  /**
   * Delegates to `dumpCanvasMapCommon`. The call is awaited so the returned
   * promise settles only after the helper finishes and its failures reach
   * the caller.
   *
   * @param {Map<number, object>} canvasMap - Canvases keyed by page index.
   * @param {string} filename - Forwarded unchanged.
   * @param {string} [foldername="out"] - Forwarded unchanged.
   */
  async dumpCanvasMap(canvasMap, filename, foldername = "out") {
    await this.dumpCanvasMapCommon(canvasMap, filename, foldername, this.startIndex);
  }

  /** Delegates to `isScannedCommon` with the default thresholds from CONSTANT. */
  isScanned(
    pageTexts,
    options = {
      wordsPerPage: CONSTANT.WORDS_PER_PAGE_THRESHOLD,
      textLength: CONSTANT.TEXT_LENGTH_THRESHOLD,
    },
  ) {
    return this.isScannedCommon(pageTexts, options, this.startIndex);
  }

  /** Delegates to `isPageScannedCommon` with the default thresholds from CONSTANT. */
  isPageScanned(
    pageText,
    options = {
      wordsPerPage: CONSTANT.WORDS_PER_PAGE_THRESHOLD,
      textLength: CONSTANT.TEXT_LENGTH_THRESHOLD,
    },
  ) {
    return this.isPageScannedCommon(pageText, options);
  }

  /**
   * Appends a text layer to each page listed in `pageTexts` (text render
   * mode 3, i.e. invisible per the PDF specification) and returns the
   * incrementally saved document bytes.
   *
   * @param doc - Document as returned by `open`; it is modified in place.
   * @param {Map<number, object>} pageTexts - Page results keyed by page index.
   * @param {object} [options] - `fontName` selects the mupdf font
   *   (defaults to "Helvetica").
   * @returns {Uint8Array} The saved document.
   */
  async rebuild(doc, pageTexts, options = {}) {
    const pdf = doc;

    for (const [pageNum, pageText] of pageTexts.entries()) {
      const page = pdf.loadPage(pageNum);
      const pageObj = page.getObject();
      const font = new mupdf.Font(options.fontName || "Helvetica");
      const fontResource = pdf.addCJKFont(font, "ja", 0, false);

      // Make sure /Resources/Font exists before registering the font as F1.
      let res = pageObj.get("Resources");
      if (!res.isDictionary()) {
        res = pdf.newDictionary();
        pageObj.put("Resources", res);
      }
      let resFont = res.get("Font");
      if (!resFont.isDictionary()) {
        resFont = pdf.newDictionary();
        res.put("Font", resFont);
      }
      resFont.put("F1", fontResource);

      const pageBounds = page.getBounds();
      const pageHeight = pageBounds[3] - pageBounds[1];

      let contentStream = "q 3 Tr\n";
      for (const word of pageText.words) {
        const x = word.bbox.x0;
        // Word boxes have a top-left origin; PDF content uses bottom-left.
        const y = pageHeight - word.bbox.y1;
        const fontSize = word.metadata.font.size || word.dimension.height;

        // Encode every UTF-16 code unit as a 4-digit hex group.
        let hexString = "";
        for (let i = 0; i < word.text.length; i++) {
          hexString += word.text.charCodeAt(i).toString(16).padStart(4, "0");
        }
        contentStream += `BT /F1 ${fontSize} Tf ${x} ${y} Td <${hexString}> Tj ET\n`;
      }
      contentStream += "Q\n";

      // Append the new stream while keeping any existing page contents.
      const extraContents = pdf.addStream(contentStream, {});
      const pageContents = pageObj.get("Contents");
      if (pageContents.isNull()) {
        pageObj.put("Contents", extraContents);
      } else if (pageContents.isArray()) {
        pageContents.push(extraContents);
      } else {
        const newPageContents = pdf.newArray();
        newPageContents.push(pageContents);
        newPageContents.push(extraContents);
        pageObj.put("Contents", newPageContents);
      }
    }

    return pdf.saveToBuffer("incremental").asUint8Array();
  }

  /** Destroys `doc` via its own `destroy()` and returns that call's result. */
  destroy(doc) {
    return doc.destroy();
  }

  /** Destroys `page` via its own `destroy()` and returns that call's result. */
  destroyPage(page) {
    return page.destroy();
  }
}