pdf2json
Version:
PDF file parser that converts PDF binaries to JSON and text, powered by porting a fork of PDF.JS to Node.js
3 lines (2 loc) • 10.1 kB
JavaScript
import fs from "node:fs";
import path from "node:path";
import PDFParser from "../../dist/pdfparser.js";

// Everything this CLI needs from the parser module hangs off its default export.
const { ParserStream, StringifyStream, pkInfo, _PARSER_SIG } = PDFParser;

/**
 * Minimal command line argument parser with short/long alias support.
 *
 * Options are registered with `alias(short, long, description)`; parsing
 * stores every recognised option under both its short and its long name.
 */
class CLIArgParser {
  args = [];
  aliases = {};
  usageMsg = "";
  parsedArgv = null;

  /**
   * @param {string[]} args - Raw arguments (typically `process.argv.slice(2)`).
   *   Anything that is not an array is ignored.
   */
  constructor(args) {
    if (Array.isArray(args)) {
      this.args = args;
    }
  }

  /**
   * Sets the usage banner printed at the top of the help text.
   * @param {string} usageMsg
   * @returns {CLIArgParser} this, for chaining.
   */
  usage(usageMsg) {
    this.usageMsg = `${usageMsg}\n\nOptions:\n`;
    return this;
  }

  /**
   * Registers an option.
   * @param {string} key - Short name (used as `-key`).
   * @param {string} name - Long name (used as `--name`).
   * @param {string} description - Help text.
   * @returns {CLIArgParser} this, for chaining.
   */
  alias(key, name, description) {
    this.aliases[key] = { name, description };
    return this;
  }

  /** Prints the usage banner followed by one line per registered option. */
  showHelp() {
    let helpMsg = this.usageMsg;
    for (const [key, { name, description }] of Object.entries(this.aliases)) {
      helpMsg += `-${key},--${name}\t ${description}\n`;
    }
    console.log(helpMsg);
  }

  /** Parsed arguments; parsed on first access and cached afterwards. */
  get argv() {
    return this.parsedArgv ? this.parsedArgv : this.parseArgv();
  }

  /**
   * @param {*} x
   * @returns {boolean} true for numbers and for strings that look like a
   *   hexadecimal or decimal (optionally exponent) number.
   */
  static isNumber(x) {
    if (typeof x === "number") {
      return true;
    }
    if (/^0x[0-9a-f]+$/i.test(x)) {
      return true;
    }
    return /^[-+]?(?:\d+(?:\.\d*)?|\.\d+)(e[-+]?\d+)?$/.test(x);
  }

  /**
   * Stores a value under `key` and under its alias counterpart (long name for
   * a short key, short key for a long name). Numeric-looking values are
   * converted to numbers.
   * @param {string} key
   * @param {string|boolean} val
   * @param {object} argv - Result object being built.
   */
  setArg(key, val, argv) {
    const value = CLIArgParser.isNumber(val) ? Number(val) : val;
    this.setKey(argv, key.split("."), value);

    const counterparts = key in this.aliases ? [this.aliases[key].name] : [];
    if (counterparts.length < 1) {
      for (const [shortKey, { name }] of Object.entries(this.aliases)) {
        if (key === name) {
          counterparts.push(shortKey);
          break;
        }
      }
    }
    counterparts.forEach((k) => this.setKey(argv, k.split("."), value));
  }

  /**
   * Assigns `value` at the (possibly nested) key path inside `obj`.
   * A repeated key accumulates its values into an array. `__proto__`
   * segments are rejected and built-in prototypes are never written to.
   * @param {object} obj
   * @param {string[]} keys - Path segments.
   * @param {*} value
   */
  setKey(obj, keys, value) {
    const isCorePrototype = (x) =>
      x === Object.prototype || x === Number.prototype || x === String.prototype;

    let target = obj;
    for (let idx = 0; idx < keys.length - 1; idx++) {
      const segment = keys[idx];
      if (segment === "__proto__") {
        return;
      }
      if (target[segment] === undefined) {
        target[segment] = {};
      }
      if (isCorePrototype(target[segment])) {
        target[segment] = {};
      }
      if (target[segment] === Array.prototype) {
        target[segment] = [];
      }
      target = target[segment];
    }

    const lastKey = keys[keys.length - 1];
    if (lastKey === "__proto__") {
      return;
    }
    if (isCorePrototype(target)) {
      target = {};
    }
    if (target === Array.prototype) {
      target = [];
    }

    if (target[lastKey] === undefined) {
      target[lastKey] = value;
    } else if (Array.isArray(target[lastKey])) {
      target[lastKey].push(value);
    } else {
      target[lastKey] = [target[lastKey], value];
    }
  }

  /**
   * Parses `this.args` into an object and caches it.
   *
   * - `--name value` / `-k value` store the value; a following literal
   *   `true`/`false` is stored as a boolean; otherwise the option is a
   *   `true` flag.
   * - A short option whose full text is a registered alias (e.g. `-si`) is
   *   stored under that alias; any other short option falls back to its
   *   last character, as before.
   * @returns {object} the parsed arguments.
   */
  parseArgv() {
    const { args } = this;
    const argv = {};
    const isBoolLiteral = (s) => typeof s === "string" && /^(true|false)$/.test(s);

    for (let idx = 0; idx < args.length; idx++) {
      const arg = args[idx];
      const next = args[idx + 1];
      const longMatch = arg.match(/^--(.+)/);

      if (longMatch) {
        const key = longMatch[1];
        // The boolean check must come first: a literal never starts with "-",
        // so testing it after the "looks like an option" check made it dead code.
        if (isBoolLiteral(next)) {
          this.setArg(key, next === "true", argv);
          idx++;
        } else if (next === undefined || /^-/.test(next)) {
          this.setArg(key, true, argv);
        } else {
          this.setArg(key, next, argv);
          idx++;
        }
      } else if (/^-[^-]+/.test(arg)) {
        const fullKey = arg.slice(1);
        const key = Object.prototype.hasOwnProperty.call(this.aliases, fullKey)
          ? fullKey
          : arg.slice(-1);
        if (key !== "-") {
          if (isBoolLiteral(next)) {
            this.setArg(key, next === "true", argv);
            idx++;
          } else if (next && !/^(-|--)[^-]/.test(next)) {
            this.setArg(key, next, argv);
            idx++;
          } else {
            this.setArg(key, true, argv);
          }
        }
      } else {
        console.warn("Unknow CLI options:", arg);
      }
    }

    this.parsedArgv = argv;
    return argv;
  }
}

// Option table for the CLI; descriptions are printed verbatim by showHelp().
const argParser = new CLIArgParser(process.argv.slice(2))
  .usage(`\n${_PARSER_SIG}\n\nUsage: ${pkInfo.name} -f|--file [-o|output_dir]`)
  .alias("v", "version", "Display version.")
  .alias("h", "help", "Display brief help information.")
  .alias("f", "file", "(required) Full path of input PDF file or a directory to scan for all PDF files.\n\t\t When specifying a PDF file name, it must end with .PDF, otherwise it would be treated as a input directory.")
  .alias("o", "output", "(optional) Full path of output directory, must already exist.\n\t\t Current JSON file in the output folder will be replaced when file name is same.")
  .alias("s", "silent", "(optional) when specified, will only log errors, otherwise verbose.")
  .alias("t", "fieldTypes", "(optional) when specified, will generate .fields.json that includes fields ids and types.")
  .alias("c", "content", "(optional) when specified, will generate .content.txt that includes text content from PDF.")
  .alias("m", "merge", "(optional) when specified, will generate .merged.json that includes auto-merged broken text blocks from PDF.")
  .alias("r", "stream", "(optional) when specified, will process and parse with buffer/object transform stream rather than file system.")
  .alias("si", "singleton", "(optional) when specified, only an instance of PDFParser will be initialized.");

const { argv } = argParser;
const wantsVersion = "v" in argv;
const wantsHelp = "h" in argv;
// Passed as the second argument of loadPDF: 0 when -s is given, 5 otherwise.
const verbosity = "s" in argv ? 0 : 5;
const hasInputArg = "f" in argv;
const wantsContent = "c" in argv;
const wantsFieldTypes = "t" in argv;
const wantsMerged = "m" in argv;
const useStream = "r" in argv;
const useSingleton = "si" in argv;
const inputPathArg = argv.f;

/**
 * Converts one PDF file to JSON (plus the optional auxiliary outputs) and
 * reports success/failure counts back to the owning CLI instance.
 */
class PDFProcessor {
  inputDir = "";
  inputFile = "";
  inputPath = "";
  outputDir = "";
  outputFile = "";
  outputPath = "";
  pdfParser = null;
  curCLI = null;

  /**
   * @param {string} inputDir - Directory containing the PDF.
   * @param {string} inputFile - PDF file name inside `inputDir`.
   * @param {object} curCLI - Owner; its `addResultCount` is called per result.
   */
  constructor(inputDir, inputFile, curCLI) {
    this.inputDir = path.normalize(inputDir);
    this.inputFile = inputFile;
    this.inputPath = path.join(this.inputDir, this.inputFile);
    // Output defaults to the input directory when -o is not given.
    this.outputDir = path.normalize(argv.o || inputDir);
    this.pdfParser = null;
    this.curCLI = curCLI;
  }

  /**
   * Derives an auxiliary output path from the primary one by swapping the
   * trailing `.json` only, so a `.json` elsewhere in the path is untouched.
   * @param {string} suffix - Replacement for the trailing `.json`.
   * @returns {string}
   */
  auxOutputPath(suffix) {
    return this.outputPath.replace(/\.json$/, suffix);
  }

  /**
   * Pipes one auxiliary parser stream into its output file.
   * @param {string} suffix - Output suffix replacing `.json`.
   * @param {function(object): object} openSource - Obtains the source stream from the parser.
   * @param {boolean} stringify - Whether to insert a StringifyStream before the sink.
   * @returns {Promise<*>} settled through the callbacks handed to
   *   `ParserStream.createOutputStream`; rejects when no parser exists.
   */
  writeAuxiliaryOutput(suffix, openSource, stringify) {
    return new Promise((resolve, reject) => {
      if (!this.pdfParser) {
        reject("PDFParser instance is not available.");
        return;
      }
      const sink = ParserStream.createOutputStream(this.auxOutputPath(suffix), resolve, reject);
      const source = openSource(this.pdfParser);
      (stringify ? source.pipe(new StringifyStream()) : source).pipe(sink);
    });
  }

  /** Writes `<name>.merged.json` from the parser's merged text blocks stream. */
  generateMergedTextBlocksStream() {
    return this.writeAuxiliaryOutput(".merged.json", (parser) => parser.getMergedTextBlocksStream(), true);
  }

  /** Writes `<name>.content.txt` from the parser's raw text content stream. */
  generateRawTextContentStream() {
    return this.writeAuxiliaryOutput(".content.txt", (parser) => parser.getRawTextContentStream(), false);
  }

  /** Writes `<name>.fields.json` from the parser's field types stream. */
  generateFieldsTypesStream() {
    return this.writeAuxiliaryOutput(".fields.json", (parser) => parser.getAllFieldsTypesStream(), true);
  }

  /**
   * Starts every auxiliary output requested on the command line (-t, -c, -m).
   * @returns {Promise<object[]>} `Promise.allSettled` results, one per output.
   */
  processAdditionalStreams() {
    const pending = [];
    if (wantsFieldTypes) {
      pending.push(this.generateFieldsTypesStream());
    }
    if (wantsContent) {
      pending.push(this.generateRawTextContentStream());
    }
    if (wantsMerged) {
      pending.push(this.generateMergedTextBlocksStream());
    }
    return Promise.allSettled(pending);
  }

  /** Counts a success, then settles with the auxiliary output results. */
  onPrimarySuccess(resolve, reject) {
    this.curCLI.addResultCount(false);
    this.processAdditionalStreams()
      .then((results) => resolve(results))
      .catch((err) => reject(err));
  }

  /**
   * Counts a failure and rejects with the error.
   * A missing error value must still count as a failure, because
   * `addResultCount` treats a falsy argument as a success.
   */
  onPrimaryError(err, reject) {
    this.curCLI.addResultCount(err || true);
    reject(err);
  }

  /**
   * Creates the parser (unless singleton mode already has one) and routes
   * its `pdfParser_dataError` event to `onPrimaryError`.
   */
  ensureParser(reject) {
    if (useSingleton && this.pdfParser) {
      return;
    }
    this.pdfParser = new PDFParser(null, wantsContent);
    this.pdfParser.on("pdfParser_dataError", (evtData) => this.onPrimaryError(evtData.parserError, reject));
  }

  /**
   * Stream mode: read file -> parser stream -> stringify -> output file.
   * @returns {Promise<object[]>} auxiliary output results on success.
   */
  parseOnePDFStream() {
    return new Promise((resolve, reject) => {
      this.ensureParser(reject);

      const outputStream = fs.createWriteStream(this.outputPath, { encoding: "utf8" });
      outputStream.on("finish", () => this.onPrimarySuccess(resolve, reject));
      outputStream.on("error", (err) => this.onPrimaryError(err, reject));

      console.info(`Transcoding Stream ${this.inputFile} to - ${this.outputPath}`);

      const inputStream = fs.createReadStream(this.inputPath);
      // pipe() does not forward errors; without a listener a read failure
      // would surface as an uncaught exception instead of a failed result.
      inputStream.on("error", (err) => {
        outputStream.destroy();
        this.onPrimaryError(err, reject);
      });
      inputStream
        .pipe(this.pdfParser.createParserStream())
        .pipe(new StringifyStream())
        .pipe(outputStream);
    });
  }

  /**
   * File mode: let the parser load the PDF, then write the JSON it reports
   * through `pdfParser_dataReady`.
   * @returns {Promise<object[]>} auxiliary output results on success.
   */
  parseOnePDF() {
    return new Promise((resolve, reject) => {
      this.ensureParser(reject);

      this.pdfParser.on("pdfParser_dataReady", (evtData) => {
        fs.writeFile(this.outputPath, JSON.stringify(evtData), "utf8", (err) => {
          if (err) {
            this.onPrimaryError(err, reject);
          } else {
            this.onPrimarySuccess(resolve, reject);
          }
        });
      });

      console.info(`Transcoding File ${this.inputFile} to - ${this.outputPath}`);
      this.pdfParser.loadPDF(this.inputPath, verbosity);
    });
  }

  /**
   * Checks input/output locations and computes `outputFile`/`outputPath`.
   * Every validation failure is recorded with the CLI's `addResultCount`.
   * @returns {Promise<string>} empty string when valid, otherwise the error message.
   */
  async validateParams() {
    const fail = (msg) => {
      this.curCLI.addResultCount(msg);
      return msg;
    };

    if (!fs.existsSync(this.inputDir)) {
      return fail(`Input error: input directory doesn't exist - ${this.inputDir}.`);
    }
    if (!fs.existsSync(this.inputPath)) {
      return fail(`Input error: input file doesn't exist - ${this.inputPath}.`);
    }

    if (!fs.existsSync(this.outputDir)) {
      try {
        await fs.promises.mkdir(this.outputDir, { recursive: true });
      } catch {
        // Deliberately not rethrown: the existence check below reports the failure.
      }
      if (!fs.existsSync(this.outputDir)) {
        return fail(`Input error: output directory doesn't exist and fails to create - ${this.outputDir}.`);
      }
    }

    const ext = path.extname(this.inputFile);
    if (ext.toLowerCase() !== ".pdf") {
      return fail(`Input error: input file name doesn't have pdf extention - ${this.inputFile}.`);
    }

    // basename() matches its suffix case-sensitively, so strip the extension
    // exactly as written ("DOC.PDF" -> "DOC.json", not "DOC.PDF.json").
    this.outputFile = `${path.basename(this.inputPath, ext)}.json`;
    this.outputPath = path.normalize(`${this.outputDir}/${this.outputFile}`);

    if (fs.existsSync(this.outputPath)) {
      console.warn(`Output file will be replaced - ${this.outputPath}`);
      return "";
    }

    // Probe writability by creating and immediately removing the target file.
    try {
      const fd = fs.openSync(this.outputPath, "wx");
      fs.closeSync(fd);
      fs.unlinkSync(this.outputPath);
    } catch {
      return fail(`Input error: can not write to ${this.outputPath}`);
    }
    return "";
  }

  /** Releases the parser and clears the path fields and the CLI reference. */
  destroy() {
    this.inputDir = "";
    this.inputFile = "";
    this.inputPath = "";
    this.outputDir = "";
    this.outputPath = "";
    if (this.pdfParser) {
      this.pdfParser.destroy();
    }
    this.pdfParser = null;
    this.curCLI = null;
  }

  /**
   * Validates, then parses in stream (-r) or file mode.
   * @returns {Promise<object[]>} auxiliary output results; rejects with the
   *   validation message (a string) or with the parse/write error.
   */
  async processFile() {
    const validationError = await this.validateParams();
    if (validationError !== "") {
      // Rejected as a plain string on purpose: callers interpolate it directly.
      throw validationError;
    }
    return useStream ? this.parseOnePDFStream() : this.parseOnePDF();
  }

  getOutputFile = () => path.join(this.outputDir, this.outputFile);
}

/**
 * Command line front end: validates the arguments, processes one file or
 * every PDF in a directory, and prints a summary.
 */
class PDFCLI {
  inputCount = 0;
  successCount = 0;
  failedCount = 0;
  warningCount = 0;
  statusMsgs = [];

  constructor() {
    this.inputCount = 0;
    this.successCount = 0;
    this.failedCount = 0;
    this.warningCount = 0;
    this.statusMsgs = [];
  }

  /**
   * Handles -v / -h and validates the -f argument.
   * @returns {{success: boolean, error?: string}} `success: false` without an
   *   `error` means "nothing to do" (version or help was printed).
   */
  initialize() {
    try {
      if (wantsVersion) {
        console.log(pkInfo.version);
        return { success: false };
      }
      if (wantsHelp) {
        argParser.showHelp();
        return { success: false };
      }
      if (!hasInputArg) {
        return { success: false, error: "-f|--file parameter is required to specify input directory or file." };
      }
      // Must precede the string check: a repeated -f yields an array, which
      // the string check would otherwise report with the wrong message.
      if (Array.isArray(inputPathArg)) {
        return {
          success: false,
          error: `-f|--file parameter can only be specified once. Received multiple values: ${inputPathArg.join(", ")}`,
        };
      }
      if (typeof inputPathArg !== "string" || inputPathArg.trim() === "") {
        return { success: false, error: "-f|--file parameter must have a valid path value." };
      }
      if (!fs.existsSync(inputPathArg)) {
        return { success: false, error: `Input path does not exist: ${inputPathArg}` };
      }
      return { success: true };
    } catch (err) {
      const { message } = err instanceof Error ? err : new Error(String(err));
      return { success: false, error: `Exception during initialization: ${message}` };
    }
  }

  /**
   * Entry point. Exits the process with 1 on an argument error and with 0
   * after -v / -h; otherwise processes the input and prints the summary.
   */
  async start() {
    const initResult = this.initialize();
    if (!initResult.success) {
      if (initResult.error) {
        argParser.showHelp();
        console.error(`\nError: ${initResult.error}`);
        process.exit(1);
      }
      process.exit(0);
    }

    console.log(_PARSER_SIG);
    console.time(_PARSER_SIG);

    let errorMsg;
    let hasException = false;
    const failedBefore = this.failedCount;
    try {
      const stats = fs.statSync(inputPathArg);
      if (stats.isFile()) {
        this.inputCount = 1;
        await this.processOneFile(path.dirname(inputPathArg), path.basename(inputPathArg));
      } else if (stats.isDirectory()) {
        await this.processOneDirectory(path.normalize(inputPathArg));
      }
    } catch (err) {
      hasException = true;
      const { message } = err instanceof Error ? err : new Error(String(err));
      errorMsg = `Exception during processing: ${message}`;
      this.addStatusMsg(true, errorMsg);
      // The processor already counts its own failures; only count here when
      // nothing was recorded, so one failed file is not reported as two.
      if (this.failedCount === failedBefore) {
        this.failedCount++;
      }
    } finally {
      this.complete(hasException, errorMsg);
    }
  }

  /**
   * Prints status messages and the summary line, then (on the next tick)
   * stops the timer and exits with 1 if anything failed.
   * @param {boolean} [hasError=false]
   * @param {string} [errorMsg]
   */
  complete(hasError = false, errorMsg) {
    const log = hasError || this.failedCount > 0 ? console.error : console.log;
    if (errorMsg) {
      log(`\nError: ${errorMsg}`);
    }
    if (this.statusMsgs.length > 0) {
      log(this.statusMsgs);
    }
    log(`\n${this.inputCount} input files\t${this.successCount} success\t${this.failedCount} fail\t${this.warningCount} warning`);

    process.nextTick(() => {
      console.timeEnd(_PARSER_SIG);
      if (hasError || this.failedCount > 0) {
        process.exit(1);
      }
    });
  }

  /**
   * Processes a single PDF and records a status message for it.
   * @param {string} inputDir
   * @param {string} inputFile
   * @returns {Promise<object[]>} auxiliary output results; rejects with the failure.
   */
  async processOneFile(inputDir, inputFile) {
    let processor = null;
    try {
      processor = new PDFProcessor(inputDir, inputFile, this);
      const results = await processor.processFile();
      this.addStatusMsg(false, `${path.join(inputDir, inputFile)} => ${processor.getOutputFile()}`);
      // NOTE(review): rejected auxiliary outputs are dropped here without any message.
      results.forEach((result) => {
        if (result.status === "fulfilled") {
          this.addStatusMsg(false, `+ ${result.value}`);
        }
      });
      return results;
    } catch (err) {
      this.addStatusMsg(err, `${path.join(inputDir, inputFile)} => ${err}`);
      throw err;
    } finally {
      if (processor) {
        processor.destroy();
      }
    }
  }

  /**
   * Processes several files of one directory concurrently.
   * @returns {Promise<object[]>} `Promise.allSettled` results, one per file.
   */
  processFiles(inputDir, files) {
    return Promise.allSettled(files.map((file) => this.processOneFile(inputDir, file)));
  }

  /**
   * Processes every `.pdf` entry of a directory whose name does not start
   * with a punctuation character or a space.
   * @param {string} inputDir
   * @returns {Promise<object[]|string>} per-file results, or the string
   *   "no pdf files found"; rejects when the directory cannot be read.
   */
  async processOneDirectory(inputDir) {
    let entries;
    try {
      entries = await fs.promises.readdir(inputDir);
    } catch (err) {
      this.addStatusMsg(true, `[${inputDir}] - ${err.toString()}`);
      throw err;
    }

    const unallowedLeadingChars = "!@#$%^&*()+=[]\\';,/{}|\":<>?~`.-_ ";
    const pdfFiles = entries.filter(
      (name) =>
        name.slice(-4).toLowerCase() === ".pdf" &&
        !unallowedLeadingChars.includes(name.substring(0, 1))
    );

    this.inputCount = pdfFiles.length;
    if (this.inputCount > 0) {
      return this.processFiles(inputDir, pdfFiles);
    }
    this.addStatusMsg(true, `[${inputDir}] - No PDF files found`);
    return "no pdf files found";
  }

  /**
   * Queues a line for the final report.
   * @param {*} error - Truthy marks the line as an error.
   * @param {string} oneMsg
   */
  addStatusMsg(error, oneMsg) {
    this.statusMsgs.push(error ? `✗ Error : ${oneMsg}` : `✓ Success : ${oneMsg}`);
  }

  /**
   * Increments the failure counter for a truthy argument, the success counter otherwise.
   * @param {*} error
   */
  addResultCount(error) {
    if (error) {
      this.failedCount++;
    } else {
      this.successCount++;
    }
  }
}

export { PDFCLI as default };
//# sourceMappingURL=pdfparser_cli.js.map