UNPKG

@langchain/community

Version:
1 lines 8.67 kB
{"version":3,"file":"pdf.cjs","names":["BaseDocumentLoader","Document"],"sources":["../../../src/document_loaders/web/pdf.ts"],"sourcesContent":["import { Document } from \"@langchain/core/documents\";\nimport { BaseDocumentLoader } from \"@langchain/core/document_loaders/base\";\n\ntype PDFLoaderV1Imports = {\n isV2: false;\n getDocument: typeof import(\"pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js\").getDocument;\n version: typeof import(\"pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js\").version;\n};\n\ntype PDFLoaderV2Imports = {\n isV2: true;\n PDFParse: typeof import(\"pdf-parse\").PDFParse;\n};\n\ntype PDFLoaderImportsResult = PDFLoaderV1Imports | PDFLoaderV2Imports;\n\nconst PDF_PARSE_V1_IMPORT_PATH = \"pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js\";\n\n/**\n * A document loader for loading data from PDFs.\n * @example\n * ```typescript\n * const loader = new WebPDFLoader(new Blob());\n * const docs = await loader.load();\n * console.log({ docs });\n * ```\n */\nexport class WebPDFLoader extends BaseDocumentLoader {\n protected blob: Blob;\n\n protected splitPages = true;\n\n private pdfjs: typeof PDFLoaderImports;\n\n protected parsedItemSeparator: string;\n\n constructor(\n blob: Blob,\n {\n splitPages = true,\n pdfjs = PDFLoaderImports,\n parsedItemSeparator = \"\",\n } = {}\n ) {\n super();\n this.blob = blob;\n this.splitPages = splitPages ?? this.splitPages;\n this.pdfjs = pdfjs;\n this.parsedItemSeparator = parsedItemSeparator;\n }\n\n /**\n * Loads the contents of the PDF as documents.\n * @returns An array of Documents representing the retrieved data.\n */\n async load(): Promise<Document[]> {\n const raw = new Uint8Array(await this.blob.arrayBuffer());\n const pdfjsResult = await this.pdfjs();\n\n if (pdfjsResult.isV2) {\n return this.parseWithV2(raw, pdfjsResult.PDFParse);\n }\n\n const { getDocument, version } = pdfjsResult;\n const parsedPdf = await getDocument({\n data: raw,\n useWorkerFetch: false,\n isEvalSupported: false,\n useSystemFonts: true,\n }).promise;\n const meta = await parsedPdf.getMetadata().catch(() => null);\n\n const documents: Document[] = [];\n\n for (let i = 1; i <= parsedPdf.numPages; i += 1) {\n const page = await parsedPdf.getPage(i);\n const content = await page.getTextContent();\n\n if (content.items.length === 0) {\n continue;\n }\n\n // Eliminate excessive newlines\n // Source: https://github.com/albertcui/pdf-parse/blob/7086fc1cc9058545cdf41dd0646d6ae5832c7107/lib/pdf-parse.js#L16\n let lastY;\n const textItems = [];\n for (const item of content.items) {\n if (\"str\" in item) {\n if (lastY === item.transform[5] || !lastY) {\n textItems.push(item.str);\n } else {\n textItems.push(`\\n${item.str}`);\n }\n lastY = item.transform[5];\n }\n }\n const text = textItems.join(this.parsedItemSeparator);\n\n documents.push(\n new Document({\n pageContent: text,\n metadata: {\n pdf: {\n version,\n info: meta?.info,\n metadata: meta?.metadata,\n totalPages: parsedPdf.numPages,\n },\n loc: {\n pageNumber: i,\n },\n },\n })\n );\n }\n\n if (this.splitPages) {\n return documents;\n }\n\n if (documents.length === 0) {\n return [];\n }\n\n return [\n new Document({\n pageContent: documents.map((doc) => doc.pageContent).join(\"\\n\\n\"),\n metadata: {\n pdf: {\n version,\n info: meta?.info,\n metadata: meta?.metadata,\n totalPages: parsedPdf.numPages,\n },\n },\n }),\n ];\n }\n\n private async parseWithV2(\n raw: Uint8Array,\n PDFParseClass: typeof import(\"pdf-parse\").PDFParse\n ): Promise<Document[]> {\n const parser = new PDFParseClass({ data: raw });\n\n try {\n const textResult = await parser.getText();\n const infoResult = await parser.getInfo();\n\n const documents: Document[] = [];\n\n for (const page of textResult.pages) {\n if (!page.text || page.text.trim().length === 0) {\n continue;\n }\n\n documents.push(\n new Document({\n pageContent: page.text,\n metadata: {\n pdf: {\n version: infoResult.metadata?.format || \"unknown\",\n info: infoResult.info,\n metadata: infoResult.metadata,\n totalPages: textResult.total,\n },\n loc: {\n pageNumber: page.num,\n },\n },\n })\n );\n }\n\n if (this.splitPages) {\n return documents;\n }\n\n if (documents.length === 0) {\n return [];\n }\n\n return [\n new Document({\n pageContent: documents.map((doc) => doc.pageContent).join(\"\\n\\n\"),\n metadata: {\n pdf: {\n version: infoResult.metadata?.format || \"unknown\",\n info: infoResult.info,\n metadata: infoResult.metadata,\n totalPages: textResult.total,\n },\n },\n }),\n ];\n } finally {\n await parser.destroy();\n }\n }\n}\n\nasync function PDFLoaderImports(): Promise<PDFLoaderImportsResult> {\n try {\n const pdfParseModule = await import(\"pdf-parse\");\n if (\"PDFParse\" in pdfParseModule) {\n return { isV2: true as const, PDFParse: pdfParseModule.PDFParse };\n }\n } catch {\n // Fall back to the pdf-parse v1 import path below.\n }\n\n try {\n const { default: mod } = await import(\n /* @vite-ignore */ PDF_PARSE_V1_IMPORT_PATH\n );\n const { getDocument, version } = mod;\n return { isV2: false as const, getDocument, version };\n } catch (e) {\n console.error(e);\n throw new Error(\n \"Failed to load pdf-parse. Please install pdf-parse v1 or v2, e.g. `npm install pdf-parse@^1` or `npm install pdf-parse@^2`.\"\n );\n }\n}\n"],"mappings":";;;;;;AAgBA,MAAM,2BAA2B;;;;;;;;;;AAWjC,IAAa,eAAb,cAAkCA,sCAAAA,mBAAmB;CACnD;CAEA,aAAuB;CAEvB;CAEA;CAEA,YACE,MACA,EACE,aAAa,MACb,QAAQ,kBACR,sBAAsB,OACpB,EAAE,EACN;AACA,SAAO;AACP,OAAK,OAAO;AACZ,OAAK,aAAa,cAAc,KAAK;AACrC,OAAK,QAAQ;AACb,OAAK,sBAAsB;;;;;;CAO7B,MAAM,OAA4B;EAChC,MAAM,MAAM,IAAI,WAAW,MAAM,KAAK,KAAK,aAAa,CAAC;EACzD,MAAM,cAAc,MAAM,KAAK,OAAO;AAEtC,MAAI,YAAY,KACd,QAAO,KAAK,YAAY,KAAK,YAAY,SAAS;EAGpD,MAAM,EAAE,aAAa,YAAY;EACjC,MAAM,YAAY,MAAM,YAAY;GAClC,MAAM;GACN,gBAAgB;GAChB,iBAAiB;GACjB,gBAAgB;GACjB,CAAC,CAAC;EACH,MAAM,OAAO,MAAM,UAAU,aAAa,CAAC,YAAY,KAAK;EAE5D,MAAM,YAAwB,EAAE;AAEhC,OAAK,IAAI,IAAI,GAAG,KAAK,UAAU,UAAU,KAAK,GAAG;GAE/C,MAAM,UAAU,OADH,MAAM,UAAU,QAAQ,EAAE,EACZ,gBAAgB;AAE3C,OAAI,QAAQ,MAAM,WAAW,EAC3B;GAKF,IAAI;GACJ,MAAM,YAAY,EAAE;AACpB,QAAK,MAAM,QAAQ,QAAQ,MACzB,KAAI,SAAS,MAAM;AACjB,QAAI,UAAU,KAAK,UAAU,MAAM,CAAC,MAClC,WAAU,KAAK,KAAK,IAAI;QAExB,WAAU,KAAK,KAAK,KAAK,MAAM;AAEjC,YAAQ,KAAK,UAAU;;GAG3B,MAAM,OAAO,UAAU,KAAK,KAAK,oBAAoB;AAErD,aAAU,KACR,IAAIC,0BAAAA,SAAS;IACX,aAAa;IACb,UAAU;KACR,KAAK;MACH;MACA,MAAM,MAAM;MACZ,UAAU,MAAM;MAChB,YAAY,UAAU;MACvB;KACD,KAAK,EACH,YAAY,GACb;KACF;IACF,CAAC,CACH;;AAGH,MAAI,KAAK,WACP,QAAO;AAGT,MAAI,UAAU,WAAW,EACvB,QAAO,EAAE;AAGX,SAAO,CACL,IAAIA,0BAAAA,SAAS;GACX,aAAa,UAAU,KAAK,QAAQ,IAAI,YAAY,CAAC,KAAK,OAAO;GACjE,UAAU,EACR,KAAK;IACH;IACA,MAAM,MAAM;IACZ,UAAU,MAAM;IAChB,YAAY,UAAU;IACvB,EACF;GACF,CAAC,CACH;;CAGH,MAAc,YACZ,KACA,eACqB;EACrB,MAAM,SAAS,IAAI,cAAc,EAAE,MAAM,KAAK,CAAC;AAE/C,MAAI;GACF,MAAM,aAAa,MAAM,OAAO,SAAS;GACzC,MAAM,aAAa,MAAM,OAAO,SAAS;GAEzC,MAAM,YAAwB,EAAE;AAEhC,QAAK,MAAM,QAAQ,WAAW,OAAO;AACnC,QAAI,CAAC,KAAK,QAAQ,KAAK,KAAK,MAAM,CAAC,WAAW,EAC5C;AAGF,cAAU,KACR,IAAIA,0BAAAA,SAAS;KACX,aAAa,KAAK;KAClB,UAAU;MACR,KAAK;OACH,SAAS,WAAW,UAAU,UAAU;OACxC,MAAM,WAAW;OACjB,UAAU,WAAW;OACrB,YAAY,WAAW;OACxB;MACD,KAAK,EACH,YAAY,KAAK,KAClB;MACF;KACF,CAAC,CACH;;AAGH,OAAI,KAAK,WACP,QAAO;AAGT,OAAI,UAAU,WAAW,EACvB,QAAO,EAAE;AAGX,UAAO,CACL,IAAIA,0BAAAA,SAAS;IACX,aAAa,UAAU,KAAK,QAAQ,IAAI,YAAY,CAAC,KAAK,OAAO;IACjE,UAAU,EACR,KAAK;KACH,SAAS,WAAW,UAAU,UAAU;KACxC,MAAM,WAAW;KACjB,UAAU,WAAW;KACrB,YAAY,WAAW;KACxB,EACF;IACF,CAAC,CACH;YACO;AACR,SAAM,OAAO,SAAS;;;;AAK5B,eAAe,mBAAoD;AACjE,KAAI;EACF,MAAM,iBAAiB,MAAM,OAAO;AACpC,MAAI,cAAc,eAChB,QAAO;GAAE,MAAM;GAAe,UAAU,eAAe;GAAU;SAE7D;AAIR,KAAI;EACF,MAAM,EAAE,SAAS,QAAQ,MAAM;;GACV;;EAErB,MAAM,EAAE,aAAa,YAAY;AACjC,SAAO;GAAE,MAAM;GAAgB;GAAa;GAAS;UAC9C,GAAG;AACV,UAAQ,MAAM,EAAE;AAChB,QAAM,IAAI,MACR,8HACD"}