@langchain/community
Version:
Third-party integrations for LangChain.js
91 lines (90 loc) • 3.05 kB
JavaScript
import { test, expect } from "@jest/globals";
import * as url from "node:url";
import * as path from "node:path";
import * as fs from "node:fs/promises";
import { WebPDFLoader } from "../web/pdf.js";
/**
 * Loads the 1706.03762.pdf fixture through WebPDFLoader from an in-memory
 * Blob and checks the document count, the first document's text, and the
 * first document's metadata.
 */
test("Test Web PDF loader from blob", async () => {
  // Resolve the fixture relative to this test file, not the process cwd.
  const testDir = path.dirname(url.fileURLToPath(import.meta.url));
  const filePath = path.resolve(testDir, "./example_data/1706.03762.pdf");
  const pdfBlob = new Blob([await fs.readFile(filePath)], {
    type: "application/pdf",
  });
  const loader = new WebPDFLoader(pdfBlob);

  const docs = await loader.load();

  // 15 documents, matching "totalPages" in the snapshot below
  // (presumably one document per page; confirm against the loader).
  expect(docs.length).toBe(15);
  expect(docs[0].pageContent).toContain("Attention Is All You Need");
  // Nested indentation inside the snapshot is significant: Jest removes only
  // the common leading indent and then compares the text exactly against the
  // serialized value, which nests with two spaces per level.
  expect(docs[0].metadata).toMatchInlineSnapshot(`
    {
      "loc": {
        "pageNumber": 1,
      },
      "pdf": {
        "info": {
          "Author": "",
          "CreationDate": "D:20171207010315Z",
          "Creator": "LaTeX with hyperref package",
          "IsAcroFormPresent": false,
          "IsXFAPresent": false,
          "Keywords": "",
          "ModDate": "D:20171207010315Z",
          "PDFFormatVersion": "1.5",
          "Producer": "pdfTeX-1.40.17",
          "Subject": "",
          "Title": "",
          "Trapped": {
            "name": "False",
          },
        },
        "metadata": null,
        "totalPages": 15,
        "version": "1.10.100",
      },
    }
  `);
});
/**
 * Same checks as the Blob-based load of 1706.03762.pdf, but with the pdf.js
 * implementation supplied explicitly through the loader's `pdfjs` option.
 */
test("Test Web PDF loader with custom pdfjs", async () => {
  // Resolve the fixture relative to this test file, not the process cwd.
  const testDir = path.dirname(url.fileURLToPath(import.meta.url));
  const filePath = path.resolve(testDir, "./example_data/1706.03762.pdf");
  const pdfBlob = new Blob([await fs.readFile(filePath)], {
    type: "application/pdf",
  });
  const loader = new WebPDFLoader(pdfBlob, {
    // `pdfjs` is a function returning a dynamic import of the pdf.js build;
    // the "version" field in the snapshot below matches this v1.10.100 build.
    pdfjs: () => import("pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js"),
  });

  const docs = await loader.load();

  // 15 documents, matching "totalPages" in the snapshot below
  // (presumably one document per page; confirm against the loader).
  expect(docs.length).toBe(15);
  expect(docs[0].pageContent).toContain("Attention Is All You Need");
  // Nested indentation inside the snapshot is significant: Jest removes only
  // the common leading indent and then compares the text exactly against the
  // serialized value, which nests with two spaces per level.
  expect(docs[0].metadata).toMatchInlineSnapshot(`
    {
      "loc": {
        "pageNumber": 1,
      },
      "pdf": {
        "info": {
          "Author": "",
          "CreationDate": "D:20171207010315Z",
          "Creator": "LaTeX with hyperref package",
          "IsAcroFormPresent": false,
          "IsXFAPresent": false,
          "Keywords": "",
          "ModDate": "D:20171207010315Z",
          "PDFFormatVersion": "1.5",
          "Producer": "pdfTeX-1.40.17",
          "Subject": "",
          "Title": "",
          "Trapped": {
            "name": "False",
          },
        },
        "metadata": null,
        "totalPages": 15,
        "version": "1.10.100",
      },
    }
  `);
});
/**
 * Loads the resume fixture with `splitPages: false` and checks that a single
 * document comes back whose text splits into fewer than 100 lines.
 */
test("Test Web PDF loader lines", async () => {
  // Locate the fixture next to this test file.
  const thisFile = url.fileURLToPath(import.meta.url);
  const resumePath = path.resolve(
    path.dirname(thisFile),
    "./example_data/Jacob_Lee_Resume_2023.pdf"
  );
  const resumeBytes = await fs.readFile(resumePath);
  const resumeBlob = new Blob([resumeBytes], { type: "application/pdf" });

  const pdfLoader = new WebPDFLoader(resumeBlob, { splitPages: false });
  const loaded = await pdfLoader.load();

  // With page splitting disabled the loader is expected to return one document.
  expect(loaded.length).toBe(1);

  // Count newline-separated lines in the single document's text.
  const lineCount = loaded[0].pageContent.split("\n").length;
  expect(lineCount).toBeLessThan(100);
});