@langchain/community
Version:
Third-party integrations for LangChain.js
219 lines (218 loc) • 8.9 kB
JavaScript
Object.defineProperty(exports, Symbol.toStringTag, { value: "Module" });
const require_runtime = require("../../_virtual/_rolldown/runtime.cjs");
let _langchain_core_utils_env = require("@langchain/core/utils/env");
let _langchain_core_documents = require("@langchain/core/documents");
let _langchain_core_document_loaders_base = require("@langchain/core/document_loaders/base");
let _langchain_classic_document_loaders_fs_directory = require("@langchain/classic/document_loaders/fs/directory");
//#region src/document_loaders/fs/unstructured.ts
var unstructured_exports = /* @__PURE__ */ require_runtime.__exportAll({
UNSTRUCTURED_API_FILETYPES: () => UNSTRUCTURED_API_FILETYPES,
UnknownHandling: () => _langchain_classic_document_loaders_fs_directory.UnknownHandling,
UnstructuredDirectoryLoader: () => UnstructuredDirectoryLoader,
UnstructuredLoader: () => UnstructuredLoader
});
const UNSTRUCTURED_API_FILETYPES = [
".txt",
".text",
".pdf",
".docx",
".doc",
".jpg",
".jpeg",
".eml",
".html",
".htm",
".md",
".pptx",
".ppt",
".msg",
".rtf",
".xlsx",
".xls",
".odt",
".epub"
];
/**
* A document loader that uses the Unstructured API to load unstructured
* documents. It supports both the new syntax with options object and the
* legacy syntax for backward compatibility. The load() method sends a
* partitioning request to the Unstructured API and retrieves the
* partitioned elements. It creates a Document instance for each element
* and returns an array of Document instances.
*
* It accepts either a filepath or an object containing a buffer and a filename
* as input.
*/
var UnstructuredLoader = class extends _langchain_core_document_loaders_base.BaseDocumentLoader {
filePath;
buffer;
fileName;
apiUrl = "https://api.unstructured.io/general/v0/general";
apiKey;
strategy = "hi_res";
encoding;
ocrLanguages = [];
coordinates;
pdfInferTableStructure;
xmlKeepTags;
skipInferTableTypes;
hiResModelName;
includePageBreaks;
chunkingStrategy;
multiPageSections;
combineUnderNChars;
newAfterNChars;
maxCharacters;
extractImageBlockTypes;
overlap;
overlapAll;
constructor(filepathOrBufferOptions, unstructuredOptions = {}) {
super();
const isLegacySyntax = typeof unstructuredOptions === "string";
if (typeof filepathOrBufferOptions === "object") {
this.buffer = filepathOrBufferOptions.buffer;
this.fileName = filepathOrBufferOptions.fileName;
} else if (isLegacySyntax) {
this.filePath = unstructuredOptions;
this.apiUrl = filepathOrBufferOptions;
} else this.filePath = filepathOrBufferOptions;
if (!isLegacySyntax) {
const options = unstructuredOptions;
this.apiKey = options.apiKey ?? (0, _langchain_core_utils_env.getEnvironmentVariable)("UNSTRUCTURED_API_KEY");
this.apiUrl = options.apiUrl ?? (0, _langchain_core_utils_env.getEnvironmentVariable)("UNSTRUCTURED_API_URL") ?? this.apiUrl;
this.strategy = options.strategy ?? this.strategy;
this.encoding = options.encoding;
this.ocrLanguages = options.ocrLanguages ?? this.ocrLanguages;
this.coordinates = options.coordinates;
this.pdfInferTableStructure = options.pdfInferTableStructure;
this.xmlKeepTags = options.xmlKeepTags;
this.skipInferTableTypes = options.skipInferTableTypes;
this.hiResModelName = options.hiResModelName;
this.includePageBreaks = options.includePageBreaks;
this.chunkingStrategy = options.chunkingStrategy;
this.multiPageSections = options.multiPageSections;
this.combineUnderNChars = options.combineUnderNChars;
this.newAfterNChars = options.newAfterNChars;
this.maxCharacters = options.maxCharacters;
this.extractImageBlockTypes = options.extractImageBlockTypes;
this.overlap = options.overlap;
this.overlapAll = options.overlapAll ?? false;
}
}
async _partition() {
let buffer = this.buffer;
let fileName = this.fileName;
if (!buffer) {
const { readFile, basename } = await this.imports();
buffer = await readFile(this.filePath);
fileName = basename(this.filePath);
}
const formData = new FormData();
formData.append("files", new Blob([buffer]), fileName);
formData.append("strategy", this.strategy);
this.ocrLanguages.forEach((language) => {
formData.append("ocr_languages", language);
});
if (this.encoding) formData.append("encoding", this.encoding);
if (this.coordinates === true) formData.append("coordinates", "true");
if (this.pdfInferTableStructure === true) formData.append("pdf_infer_table_structure", "true");
if (this.xmlKeepTags === true) formData.append("xml_keep_tags", "true");
if (this.skipInferTableTypes) formData.append("skip_infer_table_types", JSON.stringify(this.skipInferTableTypes));
if (this.hiResModelName) formData.append("hi_res_model_name", this.hiResModelName);
if (this.includePageBreaks) formData.append("include_page_breaks", "true");
if (this.chunkingStrategy) formData.append("chunking_strategy", this.chunkingStrategy);
if (this.multiPageSections !== void 0) formData.append("multipage_sections", this.multiPageSections ? "true" : "false");
if (this.combineUnderNChars !== void 0) formData.append("combine_under_n_chars", String(this.combineUnderNChars));
if (this.newAfterNChars !== void 0) formData.append("new_after_n_chars", String(this.newAfterNChars));
if (this.maxCharacters !== void 0) formData.append("max_characters", String(this.maxCharacters));
if (this.extractImageBlockTypes !== void 0) formData.append("extract_image_block_types", JSON.stringify(this.extractImageBlockTypes));
if (this.overlap !== void 0) formData.append("overlap", String(this.overlap));
if (this.overlapAll === true) formData.append("overlap_all", "true");
const headers = { "UNSTRUCTURED-API-KEY": this.apiKey ?? "" };
const response = await fetch(this.apiUrl, {
method: "POST",
body: formData,
headers
});
if (!response.ok) throw new Error(`Failed to partition file ${this.filePath} with error ${response.status} and message ${await response.text()}`);
const elements = await response.json();
if (!Array.isArray(elements)) throw new Error(`Expected partitioning request to return an array, but got ${elements}`);
return elements.filter((el) => typeof el.text === "string");
}
async load() {
const elements = await this._partition();
const documents = [];
for (const element of elements) {
const { metadata, text } = element;
if (typeof text === "string" && text !== "") documents.push(new _langchain_core_documents.Document({
pageContent: text,
metadata: {
...metadata,
category: element.type
}
}));
}
return documents;
}
async imports() {
try {
const { readFile } = await import("node:fs/promises");
const { basename } = await import("node:path");
return {
readFile,
basename
};
} catch (e) {
console.error(e);
throw new Error(`Failed to load fs/promises. TextLoader available only on environment 'node'. It appears you are running environment '${(0, _langchain_core_utils_env.getEnv)()}'. See https://<link to docs> for alternatives.`);
}
}
};
/**
* A document loader that loads unstructured documents from a directory
* using the UnstructuredLoader. It creates a UnstructuredLoader instance
* for each supported file type and passes it to the DirectoryLoader
* constructor.
* @example
* ```typescript
* const loader = new UnstructuredDirectoryLoader("path/to/directory", {
* apiKey: "MY_API_KEY",
* });
* const docs = await loader.load();
* ```
*/
var UnstructuredDirectoryLoader = class extends _langchain_classic_document_loaders_fs_directory.DirectoryLoader {
constructor(directoryPathOrLegacyApiUrl, optionsOrLegacyDirectoryPath, legacyOptionRecursive = true, legacyOptionUnknown = _langchain_classic_document_loaders_fs_directory.UnknownHandling.Warn) {
let directoryPath;
let options;
if (typeof optionsOrLegacyDirectoryPath === "string") {
directoryPath = optionsOrLegacyDirectoryPath;
options = {
apiUrl: directoryPathOrLegacyApiUrl,
recursive: legacyOptionRecursive,
unknown: legacyOptionUnknown
};
} else {
directoryPath = directoryPathOrLegacyApiUrl;
options = optionsOrLegacyDirectoryPath;
}
const loader = (p) => new UnstructuredLoader(p, options);
const loaders = UNSTRUCTURED_API_FILETYPES.reduce((loadersObject, filetype) => {
loadersObject[filetype] = loader;
return loadersObject;
}, {});
super(directoryPath, loaders, options.recursive, options.unknown);
}
};
//#endregion
exports.UNSTRUCTURED_API_FILETYPES = UNSTRUCTURED_API_FILETYPES;
exports.UnknownHandling = _langchain_classic_document_loaders_fs_directory.UnknownHandling;
exports.UnstructuredDirectoryLoader = UnstructuredDirectoryLoader;
exports.UnstructuredLoader = UnstructuredLoader;
Object.defineProperty(exports, "unstructured_exports", {
enumerable: true,
get: function() {
return unstructured_exports;
}
});
//# sourceMappingURL=unstructured.cjs.map