// @langchain/community
// Version:
// Third-party integrations for LangChain.js
// 77 lines (76 loc) • 2.72 kB
// JavaScript
import { __exportAll } from "../../_virtual/_rolldown/runtime.js";
import { Document } from "@langchain/core/documents";
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
//#region src/document_loaders/fs/epub.ts
// Export map for this module, built by the bundler's `__exportAll` runtime helper.
// The entry is a thunk, so `EPubLoader` is only read when the thunk is invoked;
// that is what allows this statement to appear before the class is assigned below.
var epub_exports = /* @__PURE__ */ __exportAll({ EPubLoader: () => EPubLoader });
/**
 * Document loader for EPUB files, built on `BaseDocumentLoader`.
 *
 * Depending on the `splitChapters` option it yields either one `Document`
 * per chapter or a single `Document` holding every chapter's text.
 */
var EPubLoader = class extends BaseDocumentLoader {
  splitChapters;

  /**
   * @param filePath Location of the EPUB; handed to `EPub.createAsync` and
   * recorded as `source` in the metadata of every produced document.
   * @param options Optional settings. `splitChapters` (default `true`)
   * selects one document per chapter instead of a single merged document.
   */
  constructor(filePath, { splitChapters = true } = {}) {
    super();
    this.filePath = filePath;
    this.splitChapters = splitChapters;
  }

  /**
   * Converts every readable chapter of an opened EPUB to plain text.
   *
   * Entries of `epub.flow` that have no `id`, or whose raw content comes
   * back empty, are left out of the result.
   * @param epub The opened EPUB object to read chapters from.
   * @returns A promise resolving to `{ pageContent, metadata }` records in
   * flow order; `metadata.chapter` is present only for titled chapters.
   */
  async parse(epub) {
    const { htmlToText } = await HtmlToTextImport();

    // Fetches one chapter's raw HTML, or null when there is nothing to read.
    const readChapter = async (entry) => {
      if (!entry.id) return null;
      const html = await epub.getChapterRawAsync(entry.id);
      if (!html) return null;
      return { html, title: entry.title };
    };

    // All chapters are requested up front; Promise.all keeps flow order.
    const rawChapters = await Promise.all(epub.flow.map(readChapter));

    const chapters = [];
    for (const raw of rawChapters) {
      if (!raw) continue;
      const metadata = raw.title ? { chapter: raw.title } : {};
      chapters.push({ pageContent: htmlToText(raw.html), metadata });
    }
    return chapters;
  }

  /**
   * Opens the EPUB at `filePath` and turns its chapters into documents.
   * @returns A promise resolving to an array of `Document` instances; the
   * array is empty when no chapter produced any content.
   */
  async load() {
    const { EPub } = await EpubImport();
    const book = await EPub.createAsync(this.filePath);
    const chapters = await this.parse(book);
    const baseMetadata = { source: this.filePath };

    if (chapters.length === 0) return [];

    if (!this.splitChapters) {
      // Merged mode: one document, chapters separated by a blank line.
      const pageContent = chapters.map((item) => item.pageContent).join("\n\n");
      return [new Document({ pageContent, metadata: baseMetadata })];
    }

    const documents = [];
    for (const item of chapters) {
      documents.push(new Document({
        pageContent: item.pageContent,
        metadata: { ...baseMetadata, ...item.metadata }
      }));
    }
    return documents;
  }
};
/**
 * Loads the optional `epub2` dependency on demand via dynamic `import()`.
 *
 * @returns A promise resolving to an object holding the module's `EPub` export.
 * @throws {Error} When the dynamic import fails. The message carries the
 * install hint and the underlying failure is attached as `cause`, so a
 * missing package can be told apart from a package that fails while loading.
 */
async function EpubImport() {
  let epubModule;
  try {
    epubModule = await import("epub2");
  } catch (err) {
    throw new Error("Failed to load epub2. Please install it with eg. `npm install epub2`.", { cause: err });
  }
  const { EPub } = epubModule;
  return { EPub };
}
/**
 * Loads the optional `html-to-text` dependency on demand via dynamic `import()`.
 *
 * @returns A promise resolving to an object holding the module's `htmlToText` export.
 * @throws {Error} When the dynamic import fails. The message carries the
 * install hint and the underlying failure is attached as `cause`, so a
 * missing package can be told apart from a package that fails while loading.
 */
async function HtmlToTextImport() {
  let htmlToTextModule;
  try {
    htmlToTextModule = await import("html-to-text");
  } catch (err) {
    throw new Error("Failed to load html-to-text. Please install it with eg. `npm install html-to-text`.", { cause: err });
  }
  const { htmlToText } = htmlToTextModule;
  return { htmlToText };
}
//#endregion
export { EPubLoader, epub_exports };
//# sourceMappingURL=epub.js.map