UNPKG

pdf2json

Version:

PDF file parser that converts PDF binaries to JSON and text, powered by porting a fork of PDF.JS to Node.js

3 lines (2 loc) 12.3 kB
/**
 * pdf2json command-line interface.
 *
 * Parses `process.argv`, then converts a single PDF file (or every PDF file in
 * a directory) to JSON, optionally emitting `.fields.json`, `.content.txt` and
 * `.merged.json` companions. The default export is the CLI driver class; call
 * `start()` on an instance to run it.
 *
 * This is a readable re-formatting of the minified build. The trailing
 * `sourceMappingURL` comment of the minified file is intentionally omitted
 * because that map does not describe this formatting.
 */
import fs from "node:fs";
import path from "node:path";
import PDFParser from "../../dist/pdfparser.js";

const { ParserStream, StringifyStream, pkInfo, _PARSER_SIG } = PDFParser;

const JSON_EXT = ".json";

/**
 * Returns true when `key` must not be followed or assigned while building the
 * parsed-arguments object, so that user supplied dotted keys cannot reach
 * `Object.prototype` or the global `Object` constructor.
 *
 * @param {object} target - Object the key would be looked up on.
 * @param {string} key - One segment of a dotted option name.
 * @returns {boolean}
 */
function isUnsafeKey(target, key) {
  return (
    key === "__proto__" ||
    (key === "constructor" && typeof target[key] === "function")
  );
}

/**
 * Builds the path of a companion output file from the main JSON output path by
 * swapping only the trailing `.json` extension. A path that does not end in
 * `.json` is returned unchanged.
 *
 * @param {string} jsonPath - Main output path.
 * @param {string} suffix - Replacement extension, e.g. ".fields.json".
 * @returns {string}
 */
function deriveOutputPath(jsonPath, suffix) {
  if (!jsonPath.endsWith(JSON_EXT)) {
    return jsonPath;
  }
  return `${jsonPath.slice(0, -JSON_EXT.length)}${suffix}`;
}

/**
 * Minimal command-line argument parser with short/long aliases and help text.
 */
class CLIArgParser {
  args = [];
  aliases = {};
  usageMsg = "";
  examplesMsg = "";
  parsedArgv = null;

  /**
   * @param {string[]} args - Raw arguments; anything that is not an array is ignored.
   */
  constructor(args) {
    if (Array.isArray(args)) {
      this.args = args;
    }
  }

  /**
   * Sets the usage banner shown at the top of the help text.
   * @param {string} usageMsg
   * @returns {this}
   */
  usage(usageMsg) {
    this.usageMsg = `${usageMsg}\n\nOptions:\n`;
    return this;
  }

  /**
   * Registers a short key together with its long name and description.
   * @param {string} key - Short flag (without the dash).
   * @param {string} name - Long option name (without the dashes).
   * @param {string} description - Help text for the option.
   * @returns {this}
   */
  alias(key, name, description) {
    this.aliases[key] = { name, description };
    return this;
  }

  /**
   * Sets the text appended after the option list in the help output.
   * @param {string} examplesMsg
   * @returns {this}
   */
  examples(examplesMsg) {
    this.examplesMsg = examplesMsg;
    return this;
  }

  /** Prints usage, the registered options and the examples to stdout. */
  showHelp() {
    let help = this.usageMsg;
    for (const [key, { name, description }] of Object.entries(this.aliases)) {
      help += ` -${key}, --${name}\t${description}\n`;
    }
    if (this.examplesMsg) {
      help += this.examplesMsg;
    }
    console.log(help);
  }

  /** Parsed arguments; parsing happens once and the result is cached. */
  get argv() {
    return this.parsedArgv ? this.parsedArgv : this.parseArgv();
  }

  /**
   * Tells whether a value is a number or a string in hex / decimal notation.
   * @param {*} value
   * @returns {boolean}
   */
  static isNumber(value) {
    if (typeof value === "number") {
      return true;
    }
    if (/^0x[0-9a-f]+$/i.test(value)) {
      return true;
    }
    return /^[-+]?(?:\d+(?:\.\d*)?|\.\d+)(e[-+]?\d+)?$/.test(value);
  }

  /**
   * Stores a value under `key` and under its alias counterpart (long name for
   * a short key, short key for a long name). Numeric-looking values are
   * converted with `Number()`.
   *
   * @param {string} key - Option name as resolved by the parser.
   * @param {*} rawValue - Value to store.
   * @param {object} argv - Object being populated.
   */
  setArg(key, rawValue, argv) {
    const value = CLIArgParser.isNumber(rawValue) ? Number(rawValue) : rawValue;
    this.setKey(argv, key.split("."), value);

    const counterparts = key in this.aliases ? [this.aliases[key].name] : [];
    if (counterparts.length < 1) {
      for (const [shortKey, entry] of Object.entries(this.aliases)) {
        if (key === entry.name) {
          counterparts.push(shortKey);
          break;
        }
      }
    }
    counterparts.forEach((other) => this.setKey(argv, other.split("."), value));
  }

  /**
   * Assigns `value` at the nested location described by `keys`. Repeated
   * assignments to the same location are collected into an array. Key paths
   * that would touch a prototype or a constructor function are ignored.
   *
   * @param {object} obj - Root object.
   * @param {string[]} keys - Path segments.
   * @param {*} value - Value to assign.
   */
  setKey(obj, keys, value) {
    let target = obj;
    for (let idx = 0; idx < keys.length - 1; idx++) {
      const segment = keys[idx];
      if (isUnsafeKey(target, segment)) {
        return;
      }
      if (target[segment] === undefined) {
        target[segment] = {};
      }
      if (
        target[segment] === Object.prototype ||
        target[segment] === Number.prototype ||
        target[segment] === String.prototype
      ) {
        target[segment] = {};
      }
      if (target[segment] === Array.prototype) {
        target[segment] = [];
      }
      target = target[segment];
    }

    const lastKey = keys[keys.length - 1];
    if (isUnsafeKey(target, lastKey)) {
      return;
    }
    if (
      target === Object.prototype ||
      target === Number.prototype ||
      target === String.prototype
    ) {
      target = {};
    }
    if (target === Array.prototype) {
      target = [];
    }

    if (target[lastKey] === undefined) {
      target[lastKey] = value;
    } else if (Array.isArray(target[lastKey])) {
      target[lastKey].push(value);
    } else {
      target[lastKey] = [target[lastKey], value];
    }
  }

  /**
   * Resolves the text following a single dash into option keys.
   *
   * A registered short key or long name resolves to one short key. Any other
   * multi-character text is treated as a group of single-character flags;
   * unknown characters produce a warning and are dropped. An unregistered
   * single character is returned as-is.
   *
   * @param {string} flag - Text after the leading dash.
   * @returns {string[]}
   */
  resolveShortFlags(flag) {
    if (flag in this.aliases) {
      return [flag];
    }
    for (const [shortKey, entry] of Object.entries(this.aliases)) {
      if (flag === entry.name) {
        return [shortKey];
      }
    }
    if (flag.length > 1) {
      const resolved = [];
      for (const ch of flag) {
        if (ch in this.aliases) {
          resolved.push(ch);
        } else {
          console.warn(`Unknown short flag: -${ch} (in -${flag})`);
        }
      }
      return resolved;
    }
    return flag.length === 1 ? [flag] : [];
  }

  /**
   * Parses `this.args` into an object and caches it.
   *
   * An option takes the following argument as its value unless that argument
   * looks like another option; otherwise it is stored as `true`. The literal
   * words "true"/"false" are stored as strings, like any other value.
   *
   * @returns {object}
   */
  parseArgv() {
    const { args } = this;
    const argv = {};

    for (let idx = 0; idx < args.length; idx++) {
      const arg = args[idx];
      const next = args[idx + 1];
      const longMatch = /^--(.+)/.exec(arg);

      if (longMatch) {
        const key = longMatch[1];
        if (next === undefined || /^-/.test(next)) {
          this.setArg(key, true, argv);
        } else {
          this.setArg(key, next, argv);
          idx++;
        }
      } else if (/^-[^-]+/.test(arg)) {
        const keys = this.resolveShortFlags(arg.slice(1));
        if (keys.length === 1 && keys[0] !== "-") {
          if (next && !/^(-|--)[^-]/.test(next)) {
            this.setArg(keys[0], next, argv);
            idx++;
          } else {
            this.setArg(keys[0], true, argv);
          }
        } else {
          // Grouped flags never take a value.
          for (const key of keys) {
            this.setArg(key, true, argv);
          }
        }
      } else {
        console.warn("Unknown CLI options:", arg);
      }
    }

    this.parsedArgv = argv;
    return argv;
  }
}

const EXAMPLES_TEXT =
  "\nExamples:\n\n" +
  " Parse a single PDF to JSON:\n pdf2json -f input.pdf\n\n" +
  " Parse with a specific output directory:\n pdf2json -f input.pdf -o ./output\n\n" +
  " Parse and generate all output formats (JSON + fields + text + merged):\n pdf2json -f input.pdf -o ./output -t -c -m\n\n" +
  " Parse an entire directory of PDFs:\n pdf2json -f ./pdf_folder -o ./output -s\n\n" +
  " Parse using stream mode (lower memory for large files):\n pdf2json -f input.pdf -o ./output -r\n\n" +
  " Get structured JSON summary for scripting:\n pdf2json -f input.pdf -o ./output --json\n\n" +
  " Batch directory parse, silent, all outputs:\n pdf2json -f ./pdf_folder -o ./output -s -t -c -m -r\n\n" +
  "Exit Codes:\n" +
  " 0 All files parsed successfully\n" +
  " 1 One or more files failed to parse\n" +
  " 2 Invalid arguments or usage error\n" +
  " 3 I/O error (file not found, permission denied)\n";

const cliParser = new CLIArgParser(process.argv.slice(2))
  .usage(`\n${_PARSER_SIG}\n\nUsage: ${pkInfo.name} -f <file_or_dir> [options]`)
  .alias("f", "file", "(required) Path to a PDF file or a directory of PDF files to parse.")
  .alias("o", "output", "Output directory for generated files. Created automatically if it\n\t\t\tdoes not exist. Defaults to the same directory as the input file.")
  .alias("s", "silent", "Suppress informational output; only errors are printed.")
  .alias("t", "fieldTypes", "Generate a .fields.json file with form field ids and types.")
  .alias("c", "content", "Generate a .content.txt file with extracted text content.")
  .alias("m", "merge", "Generate a .merged.json file with auto-merged broken text blocks.")
  .alias("r", "stream", "Use stream-based parsing (read/transform/write pipeline)\n\t\t\tinstead of loading the entire file into memory first.")
  .alias("si", "singleton", "Reuse a single PDFParser instance across all files in a\n\t\t\tdirectory (reduces memory allocation for batch processing).")
  .alias("j", "json", "Output a structured JSON summary to stdout with version, file\n\t\t\tpaths, stats, and errors. Implies -s. Note: the PDF engine may\n\t\t\tprint warnings to stdout; pipe through `grep '^{'` to isolate JSON.")
  .alias("q", "quiet", "Suppress all non-error output, including the timer and status\n\t\t\tmessages. Stricter than -s.")
  .alias("v", "version", "Print the version number and exit.")
  .alias("h", "help", "Print this help message and exit.")
  .examples(EXAMPLES_TEXT);

const { argv } = cliParser;

// Flags are detected by presence only; their values are not inspected.
const wantsVersion = "v" in argv;
const wantsHelp = "h" in argv;
// Value handed to PDFParser#loadPDF as its second argument.
const verbosity = "s" in argv || "q" in argv || "j" in argv ? 0 : 5;
const hasInputPath = "f" in argv;
const wantsContent = "c" in argv;
const wantsFieldTypes = "t" in argv;
const wantsMerged = "m" in argv;
const useStream = "r" in argv;
const useSingleton = "si" in argv;
const jsonSummary = "j" in argv;
const quiet = "q" in argv;
const inputPathArg = argv.f;
const muteInfo = quiet || jsonSummary;

/** console.log that is silenced by -q / -j. */
function logInfo(...parts) {
  if (!muteInfo) {
    console.log(...parts);
  }
}

/** console.warn that is silenced by -q / -j. */
function logWarn(...parts) {
  if (!muteInfo) {
    console.warn(...parts);
  }
}

/**
 * Converts one PDF file and writes its JSON output plus the optional
 * companion files selected on the command line.
 */
class PDFProcessor {
  inputDir = "";
  inputFile = "";
  inputPath = "";
  outputDir = "";
  outputFile = "";
  outputPath = "";
  pdfParser = null;
  _curCLI = null;

  /** The CLI driver that receives success/failure counts. */
  get curCLI() {
    return this._curCLI;
  }

  /**
   * @param {string} inputDir - Directory containing the input file.
   * @param {string} inputFile - Input file name.
   * @param {object} curCLI - Driver whose `addResultCount` is called.
   * @param {object} [pdfParser] - Parser to reuse; one is created lazily when omitted.
   */
  constructor(inputDir, inputFile, curCLI, pdfParser) {
    this.inputDir = path.normalize(inputDir);
    this.inputFile = inputFile;
    this.inputPath = path.join(this.inputDir, this.inputFile);
    // Output defaults to the input directory when -o is not given.
    this.outputDir = path.normalize(argv.o || inputDir);
    this.pdfParser = pdfParser || null;
    this._curCLI = curCLI;
  }

  /**
   * Pipes one of the parser's companion streams into a file next to the JSON
   * output.
   *
   * @param {string} suffix - Extension replacing the trailing ".json".
   * @param {string} getterName - Name of the parser method returning the source stream.
   * @param {boolean} stringify - Whether to pipe through a StringifyStream first.
   * @returns {Promise<*>} Settles through the callbacks handed to
   *   `ParserStream.createOutputStream`.
   */
  writeCompanionStream(suffix, getterName, stringify) {
    return new Promise((resolve, reject) => {
      if (!this.pdfParser) {
        reject(new Error("PDFParser instance is not available."));
        return;
      }
      const sink = ParserStream.createOutputStream(
        deriveOutputPath(this.outputPath, suffix),
        resolve,
        reject
      );
      const source = this.pdfParser[getterName]();
      if (stringify) {
        source.pipe(new StringifyStream()).pipe(sink);
      } else {
        source.pipe(sink);
      }
    });
  }

  /** Writes the `.merged.json` companion file. */
  generateMergedTextBlocksStream() {
    return this.writeCompanionStream(".merged.json", "getMergedTextBlocksStream", true);
  }

  /** Writes the `.content.txt` companion file. */
  generateRawTextContentStream() {
    return this.writeCompanionStream(".content.txt", "getRawTextContentStream", false);
  }

  /** Writes the `.fields.json` companion file. */
  generateFieldsTypesStream() {
    return this.writeCompanionStream(".fields.json", "getAllFieldsTypesStream", true);
  }

  /**
   * Starts every companion output requested on the command line.
   * @returns {Promise<object[]>} `Promise.allSettled` results, in the order
   *   fields, content, merged (only the requested ones).
   */
  processAdditionalStreams() {
    const pending = [];
    if (wantsFieldTypes) {
      pending.push(this.generateFieldsTypesStream());
    }
    if (wantsContent) {
      pending.push(this.generateRawTextContentStream());
    }
    if (wantsMerged) {
      pending.push(this.generateMergedTextBlocksStream());
    }
    return Promise.allSettled(pending);
  }

  /** Creates a parser unless one was supplied to the constructor. */
  initParser() {
    if (!this.pdfParser) {
      this.pdfParser = new PDFParser(null, wantsContent);
    }
  }

  /**
   * Converts the input through a read -> parse -> stringify -> write pipeline.
   * @returns {Promise<object[]>} Results of `processAdditionalStreams()`.
   */
  parseOnePDFStream() {
    return new Promise((resolve, reject) => {
      this.initParser();
      this.pdfParser.once("pdfParser_dataError", (evt) => {
        this.curCLI.addResultCount(true);
        reject(evt.parserError);
      });

      const fail = (err) => {
        this.curCLI.addResultCount(true);
        reject(err);
      };

      const output = fs.createWriteStream(this.outputPath, { encoding: "utf8" });
      output.on("finish", () => {
        this.curCLI.addResultCount(false);
        this.processAdditionalStreams()
          .then((results) => resolve(results))
          .catch((err) => reject(err));
      });
      output.on("error", fail);

      logInfo(`Transcoding Stream ${this.inputFile} to - ${this.outputPath}`);

      const input = fs.createReadStream(this.inputPath);
      // pipe() does not forward errors; without a listener an unreadable input
      // would raise an uncaught exception instead of failing this file.
      input.on("error", fail);
      input
        .pipe(this.pdfParser.createParserStream())
        .pipe(new StringifyStream())
        .pipe(output);
    });
  }

  /**
   * Converts the input by loading it through the parser and writing the
   * resulting data as JSON.
   * @returns {Promise<object[]>} Results of `processAdditionalStreams()`.
   */
  parseOnePDF() {
    return new Promise((resolve, reject) => {
      this.initParser();
      this.pdfParser.once("pdfParser_dataError", (evt) => {
        this.curCLI.addResultCount(true);
        reject(evt.parserError);
      });
      this.pdfParser.once("pdfParser_dataReady", async (data) => {
        try {
          await fs.promises.writeFile(this.outputPath, JSON.stringify(data), "utf8");
          this.curCLI.addResultCount(false);
          resolve(await this.processAdditionalStreams());
        } catch (err) {
          this.curCLI.addResultCount(true);
          reject(err);
        }
      });

      logInfo(`Transcoding File ${this.inputFile} to - ${this.outputPath}`);
      this.pdfParser.loadPDF(this.inputPath, verbosity);
    });
  }

  /**
   * Checks the input/output locations, creates the output directory when
   * missing, and computes `outputFile` / `outputPath`.
   *
   * @returns {Promise<string>} Empty string when valid, otherwise an error message.
   */
  async validateParams() {
    let errorMsg = "";

    if (!fs.existsSync(this.inputDir)) {
      errorMsg = `Input error: input directory doesn't exist - ${this.inputDir}.`;
    } else if (!fs.existsSync(this.inputPath)) {
      errorMsg = `Input error: input file doesn't exist - ${this.inputPath}.`;
    } else if (!fs.existsSync(this.outputDir)) {
      try {
        await fs.promises.mkdir(this.outputDir, { recursive: true });
      } catch {
        errorMsg = `Input error: output directory doesn't exist and fails to create - ${this.outputDir}.`;
      }
    }

    if (errorMsg !== "") {
      this.curCLI.addResultCount(true);
      return errorMsg;
    }

    const ext = path.extname(this.inputFile).toLowerCase();
    if (ext !== ".pdf") {
      return `Input error: input file name doesn't have pdf extension - ${this.inputFile}.`;
    }

    // NOTE(review): basename() is given the lower-cased extension, so an input
    // named "A.PDF" yields "A.PDF.json" — confirm whether that is intended.
    this.outputFile = `${path.basename(this.inputPath, ext)}.json`;
    this.outputPath = path.normalize(`${this.outputDir}/${this.outputFile}`);
    if (fs.existsSync(this.outputPath)) {
      logWarn(`Output file will be replaced - ${this.outputPath}`);
    }
    return errorMsg;
  }

  /**
   * Clears references held by this processor. The parser itself is destroyed
   * only when it is not the shared (-si) instance.
   */
  destroy() {
    this.inputDir = "";
    this.inputFile = "";
    this.inputPath = "";
    this.outputDir = "";
    this.outputPath = "";
    if (this.pdfParser && !useSingleton) {
      this.pdfParser.destroy();
    }
    this.pdfParser = null;
    this._curCLI = null;
  }

  /**
   * Validates, then converts the file in stream (-r) or load mode.
   * @returns {Promise<object[]>}
   * @throws {Error} When validation fails.
   */
  async processFile() {
    const errorMsg = await this.validateParams();
    if (errorMsg !== "") {
      throw new Error(errorMsg);
    }
    return useStream ? this.parseOnePDFStream() : this.parseOnePDF();
  }

  /** Path of the main JSON output file. */
  getOutputFile = () => path.join(this.outputDir, this.outputFile);
}

/**
 * CLI driver: validates the arguments, processes the input file or directory,
 * prints a summary and exits the process with a status code.
 */
class PDFCLI {
  inputCount = 0;
  successCount = 0;
  failedCount = 0;
  statusMsgs = [];
  outputPaths = [];
  errorMessages = [];
  startTime = 0;
  sharedParser = null;

  /**
   * Handles -v / -h and validates the -f argument.
   *
   * @returns {{success: boolean, exitCode?: number, error?: string}}
   *   `success: false` without `error` means "nothing more to do".
   */
  initialize() {
    try {
      if (wantsVersion) {
        console.log(pkInfo.version);
        return { success: false };
      }
      if (wantsHelp) {
        cliParser.showHelp();
        return { success: false };
      }
      if (!hasInputPath) {
        return {
          success: false,
          exitCode: 2,
          error: "-f|--file parameter is required to specify input directory or file.",
        };
      }
      if (Array.isArray(inputPathArg)) {
        return {
          success: false,
          exitCode: 2,
          error: `-f|--file parameter can only be specified once. Received multiple values: ${inputPathArg.join(", ")}`,
        };
      }
      if (typeof inputPathArg !== "string" || inputPathArg.trim() === "") {
        return {
          success: false,
          exitCode: 2,
          error: "-f|--file parameter must have a valid path value.",
        };
      }
      if (!fs.existsSync(inputPathArg)) {
        return {
          success: false,
          exitCode: 3,
          error: `Input path does not exist: ${inputPathArg}`,
        };
      }
      return { success: true };
    } catch (err) {
      const cause = err instanceof Error ? err : new Error(String(err));
      return {
        success: false,
        exitCode: 2,
        error: `Exception during initialization: ${cause.message}`,
      };
    }
  }

  /**
   * Runs the CLI. Always ends by calling `complete()`, which exits the process.
   * @returns {Promise<void>}
   */
  async start() {
    const init = this.initialize();
    if (!init.success) {
      if (init.error) {
        cliParser.showHelp();
        console.error(`\nError: ${init.error}`);
        process.exit(init.exitCode ?? 2);
      }
      process.exit(0);
    }

    this.startTime = Date.now();
    logInfo(_PARSER_SIG);
    if (!muteInfo) {
      console.time(_PARSER_SIG);
    }
    if (useSingleton) {
      // NOTE(review): directory mode starts all files at once against this
      // one instance — confirm the parser supports overlapping loads.
      this.sharedParser = new PDFParser(null, wantsContent);
    }

    let errorMsg;
    let hadException = false;
    let exitCode = 0;
    const failedBefore = this.failedCount;

    try {
      const stats = fs.statSync(inputPathArg);
      if (stats.isFile()) {
        this.inputCount = 1;
        await this.processOneFile(path.dirname(inputPathArg), path.basename(inputPathArg));
      } else if (stats.isDirectory()) {
        await this.processOneDirectory(path.normalize(inputPathArg));
      }
    } catch (err) {
      hadException = true;
      const cause = err instanceof Error ? err : new Error(String(err));
      errorMsg = `Exception during processing: ${cause.message}`;
      this.addStatusMsg(true, errorMsg);
      this.errorMessages.push(errorMsg);
      // The per-file processor already records most failures through
      // addResultCount(); count here only when nothing was recorded, so one
      // failing file is not reported as two failures.
      if (this.failedCount === failedBefore) {
        this.failedCount++;
      }
      // The rejection value may be undefined or a non-object.
      const code = err?.code;
      exitCode = code === "ENOENT" || code === "EACCES" || code === "EPERM" ? 3 : 1;
    } finally {
      if (exitCode === 0 && this.failedCount > 0) {
        exitCode = 1;
      }
      this.complete(hadException, errorMsg, exitCode);
    }
  }

  /**
   * Prints the summary (JSON with -j, text otherwise), releases the shared
   * parser and exits the process on the next tick.
   *
   * @param {boolean} [hadException=false] - Whether processing threw.
   * @param {string} [errorMsg] - Message of that exception.
   * @param {number} [exitCode=0] - Process exit status.
   */
  complete(hadException = false, errorMsg, exitCode = 0) {
    if (jsonSummary) {
      const summary = {
        version: pkInfo.version,
        input: inputPathArg,
        outputs: this.outputPaths,
        stats: {
          input: this.inputCount,
          success: this.successCount,
          failed: this.failedCount,
        },
        errors: this.errorMessages,
        elapsedMs: Date.now() - this.startTime,
      };
      console.log(JSON.stringify(summary));
    } else {
      // Failures go to stderr unconditionally; a clean run honours -q.
      const print = hadException || this.failedCount > 0 ? console.error : logInfo;
      if (errorMsg) {
        print(`\nError: ${errorMsg}`);
      }
      if (this.statusMsgs.length > 0) {
        print(this.statusMsgs);
      }
      print(`\n${this.inputCount} input files\t${this.successCount} success\t${this.failedCount} fail`);
    }

    if (this.sharedParser) {
      this.sharedParser.destroy();
      this.sharedParser = null;
    }

    process.nextTick(() => {
      if (!muteInfo) {
        console.timeEnd(_PARSER_SIG);
      }
      process.exit(exitCode);
    });
  }

  /**
   * Converts one file and records its outputs and status messages.
   *
   * @param {string} inputDir
   * @param {string} inputFile
   * @returns {Promise<object[]>} Settled results of the companion outputs.
   * @throws Re-throws whatever the conversion rejected with.
   */
  async processOneFile(inputDir, inputFile) {
    const processor = new PDFProcessor(inputDir, inputFile, this, this.sharedParser);
    try {
      const results = await processor.processFile();
      const jsonFile = processor.getOutputFile();
      this.addStatusMsg(false, `${path.join(inputDir, inputFile)} => ${jsonFile}`);
      this.outputPaths.push({ type: "json", path: jsonFile });

      results.forEach((result) => {
        if (result.status !== "fulfilled" || !result.value) {
          return;
        }
        const outPath = String(result.value);
        this.addStatusMsg(false, `+ ${outPath}`);

        const ext = path.extname(outPath);
        let type = "unknown";
        if (ext === ".json" && outPath.includes(".fields.")) {
          type = "fields";
        } else if (ext === ".json" && outPath.includes(".merged.")) {
          type = "merged";
        } else if (ext === ".txt") {
          type = "content";
        }
        this.outputPaths.push({ type, path: outPath });
      });
      return results;
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      this.addStatusMsg(true, `${path.join(inputDir, inputFile)} => ${message}`);
      this.errorMessages.push(message);
      throw err;
    } finally {
      processor.destroy();
    }
  }

  /**
   * Starts conversion of every listed file and waits for all of them.
   * @param {string} inputDir
   * @param {string[]} files
   * @returns {Promise<object[]>} `Promise.allSettled` results.
   */
  processFiles(inputDir, files) {
    return Promise.allSettled(files.map((file) => this.processOneFile(inputDir, file)));
  }

  /**
   * Converts every non-hidden `*.pdf` entry (case-insensitive) of a directory.
   * @param {string} inputDir
   * @returns {Promise<object[]>} Settled results, or an empty array when the
   *   directory holds no PDF files.
   */
  async processOneDirectory(inputDir) {
    const entries = await fs.promises.readdir(inputDir);
    const pdfFiles = entries.filter((entry) => {
      if (entry.slice(-4).toLowerCase() !== ".pdf") {
        return false;
      }
      if (entry.startsWith(".")) {
        logWarn(`Skipping hidden file: ${entry}`);
        return false;
      }
      return true;
    });

    this.inputCount = pdfFiles.length;
    if (this.inputCount > 0) {
      return this.processFiles(inputDir, pdfFiles);
    }
    this.addStatusMsg(true, `[${inputDir}] - No PDF files found`);
    return [];
  }

  /**
   * Queues a line for the final text summary.
   * @param {boolean} isError
   * @param {string} message
   */
  addStatusMsg(isError, message) {
    this.statusMsgs.push(isError ? `✗ Error : ${message}` : `✓ Success : ${message}`);
  }

  /**
   * Increments the failure or success counter.
   * @param {boolean} isError
   */
  addResultCount(isError) {
    if (isError) {
      this.failedCount++;
    } else {
      this.successCount++;
    }
  }
}

export { PDFCLI as default };