/*
 * @langchain/community
 * Version:
 * Third-party integrations for LangChain.js
 * 152 lines (151 loc) • 4.74 kB
 * JavaScript
 */
// Set Symbol.toStringTag on the CommonJS exports object to "Module".
Object.defineProperty(exports, Symbol.toStringTag, { value: "Module" });
// Bundler runtime helpers; only `__exportAll` is used in this file.
const require_runtime = require("../../_virtual/_rolldown/runtime.cjs");
// LangChain core modules: `Document` (built for every page) and
// `BaseDocumentLoader` (the base class WebPDFLoader extends).
let _langchain_core_documents = require("@langchain/core/documents");
let _langchain_core_document_loaders_base = require("@langchain/core/document_loaders/base");
//#region src/document_loaders/web/pdf.ts
// Object produced by the runtime helper from a map of export name -> getter
// function; re-exported below as `pdf_exports`.
// NOTE(review): presumably a namespace-style object exposing WebPDFLoader
// lazily — confirm against the `__exportAll` implementation.
var pdf_exports = /* @__PURE__ */ require_runtime.__exportAll({ WebPDFLoader: () => WebPDFLoader });
// Module path that PDFLoaderImports dynamically imports as its fallback when
// importing "pdf-parse" fails or the imported module has no `PDFParse` export.
const PDF_PARSE_V1_IMPORT_PATH = "pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js";
/**
 * A document loader for loading data from PDFs.
 *
 * The PDF bytes are read from a Blob and handed to whichever backend the
 * `pdfjs` factory resolves to: a `PDFParse` class (when the resolved object
 * has `isV2` set) or a pdf.js style `getDocument` function.
 * @example
 * ```typescript
 * const loader = new WebPDFLoader(new Blob());
 * const docs = await loader.load();
 * console.log({ docs });
 * ```
 */
var WebPDFLoader = class extends _langchain_core_document_loaders_base.BaseDocumentLoader {
  /** Blob whose bytes are parsed as a PDF. */
  blob;
  /** When true, one Document is returned per page; otherwise a single merged Document. */
  splitPages = true;
  /** Async factory that resolves to the parsing backend. */
  pdfjs;
  /** String used to join the text items of a page (pdf.js path only). */
  parsedItemSeparator;
  /**
   * @param blob Blob containing the PDF data.
   * @param options Optional settings.
   * @param options.splitPages Return one Document per page (default `true`);
   *   `null`/`undefined` keep the default.
   * @param options.pdfjs Async factory resolving to either
   *   `{ isV2: true, PDFParse }` or `{ getDocument, version }`
   *   (default: `PDFLoaderImports`).
   * @param options.parsedItemSeparator Separator inserted between text items
   *   when a page's text is assembled (default `""`).
   */
  constructor(blob, { splitPages = true, pdfjs = PDFLoaderImports, parsedItemSeparator = "" } = {}) {
    super();
    this.blob = blob;
    this.splitPages = splitPages ?? this.splitPages;
    this.pdfjs = pdfjs;
    this.parsedItemSeparator = parsedItemSeparator;
  }
  /**
   * Loads the contents of the PDF as documents.
   *
   * Pages whose text content has no items are skipped. With `splitPages`
   * disabled, the page texts are joined with a blank line into one Document,
   * or an empty array is returned when no page produced a Document.
   * @returns An array of Documents representing the retrieved data.
   */
  async load() {
    const raw = new Uint8Array(await this.blob.arrayBuffer());
    const pdfjsResult = await this.pdfjs();
    if (pdfjsResult.isV2) return this.parseWithV2(raw, pdfjsResult.PDFParse);
    const { getDocument, version } = pdfjsResult;
    const parsedPdf = await getDocument({
      data: raw,
      useWorkerFetch: false,
      isEvalSupported: false,
      useSystemFonts: true
    }).promise;
    try {
      // Metadata is optional: a failing getMetadata() must not abort the load.
      const meta = await parsedPdf.getMetadata().catch(() => null);
      // Returns a fresh object on every call so Documents never share metadata.
      const buildPdfMetadata = () => ({
        version,
        info: meta?.info,
        metadata: meta?.metadata,
        totalPages: parsedPdf.numPages
      });
      const documents = [];
      for (let i = 1; i <= parsedPdf.numPages; i += 1) {
        const content = await (await parsedPdf.getPage(i)).getTextContent();
        if (content.items.length === 0) continue;
        let lastY;
        const textItems = [];
        for (const item of content.items) {
          if (!("str" in item)) continue;
          const y = item.transform[5];
          // A change of vertical position starts a new line. Compare against
          // `undefined` explicitly: a truthiness check would treat a baseline
          // of y = 0 as "no previous item" and drop the line break after it.
          const startsNewLine = lastY !== undefined && lastY !== y;
          textItems.push(startsNewLine ? `\n${item.str}` : item.str);
          lastY = y;
        }
        const text = textItems.join(this.parsedItemSeparator);
        documents.push(new _langchain_core_documents.Document({
          pageContent: text,
          metadata: {
            pdf: buildPdfMetadata(),
            loc: { pageNumber: i }
          }
        }));
      }
      if (this.splitPages) return documents;
      if (documents.length === 0) return [];
      return [new _langchain_core_documents.Document({
        pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
        metadata: { pdf: buildPdfMetadata() }
      })];
    } finally {
      // Release the parsed document, mirroring parseWithV2's parser.destroy().
      // Optional call: a custom `pdfjs` factory may return a document without it.
      await parsedPdf.destroy?.();
    }
  }
  /**
   * Parses the PDF with a `PDFParse`-style class and builds the Documents.
   *
   * Pages with missing or whitespace-only text are skipped. The parser is
   * always destroyed, whether parsing succeeds or throws.
   * @param raw The PDF bytes as a Uint8Array.
   * @param PDFParseClass Constructor accepting `{ data }` whose instances
   *   provide `getText()`, `getInfo()` and `destroy()`.
   * @returns An array of Documents; shaped by `splitPages` exactly as in `load()`.
   */
  async parseWithV2(raw, PDFParseClass) {
    const parser = new PDFParseClass({ data: raw });
    try {
      const textResult = await parser.getText();
      const infoResult = await parser.getInfo();
      // Returns a fresh object on every call so Documents never share metadata.
      const buildPdfMetadata = () => ({
        version: infoResult.metadata?.format || "unknown",
        info: infoResult.info,
        metadata: infoResult.metadata,
        totalPages: textResult.total
      });
      const documents = [];
      for (const page of textResult.pages) {
        if (!page.text || page.text.trim().length === 0) continue;
        documents.push(new _langchain_core_documents.Document({
          pageContent: page.text,
          metadata: {
            pdf: buildPdfMetadata(),
            loc: { pageNumber: page.num }
          }
        }));
      }
      if (this.splitPages) return documents;
      if (documents.length === 0) return [];
      return [new _langchain_core_documents.Document({
        pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
        metadata: { pdf: buildPdfMetadata() }
      })];
    } finally {
      await parser.destroy();
    }
  }
};
/**
 * Resolves the PDF parsing backend by dynamic import.
 *
 * First tries the `pdf-parse` package and, if the imported module has a
 * `PDFParse` export, returns it flagged as v2. Otherwise (or if that import
 * fails) falls back to importing the module at `PDF_PARSE_V1_IMPORT_PATH`.
 * @returns Either `{ isV2: true, PDFParse }` or
 *   `{ isV2: false, getDocument, version }`.
 * @throws {Error} When the fallback import fails; the underlying failure is
 *   logged and attached as `cause`.
 */
async function PDFLoaderImports() {
  try {
    const pdfParseModule = await import("pdf-parse");
    if ("PDFParse" in pdfParseModule) {
      return {
        isV2: true,
        PDFParse: pdfParseModule.PDFParse
      };
    }
  } catch {
    // Deliberately ignored: a failed import here just means the fallback
    // path below is attempted, and that path reports the failure.
  }
  try {
    const { default: mod } = await import(
      /* @vite-ignore */
      PDF_PARSE_V1_IMPORT_PATH
    );
    const { getDocument, version } = mod;
    return {
      isV2: false,
      getDocument,
      version
    };
  } catch (e) {
    console.error(e);
    // Keep the original failure reachable from the user-facing error.
    throw new Error("Failed to load pdf-parse. Please install pdf-parse v1 or v2, e.g. `npm install pdf-parse@^1` or `npm install pdf-parse@^2`.", { cause: e });
  }
}
//#endregion
// CommonJS exports: the loader class itself, plus an enumerable accessor
// property that returns the pdf_exports object.
exports.WebPDFLoader = WebPDFLoader;
Object.defineProperty(exports, "pdf_exports", {
  get: () => pdf_exports,
  enumerable: true
});
//# sourceMappingURL=pdf.cjs.map