UNPKG

@langchain/community

Version:
1 lines 17.2 kB
{"version":3,"file":"unstructured.cjs","names":["BaseDocumentLoader","Document","DirectoryLoader","UnknownHandling"],"sources":["../../../src/document_loaders/fs/unstructured.ts"],"sourcesContent":["import type { basename as BasenameT } from \"node:path\";\nimport type { readFile as ReadFileT } from \"node:fs/promises\";\nimport { Document } from \"@langchain/core/documents\";\nimport { getEnv, getEnvironmentVariable } from \"@langchain/core/utils/env\";\nimport { StringWithAutocomplete } from \"@langchain/core/utils/types\";\nimport {\n DirectoryLoader,\n UnknownHandling,\n LoadersMapping,\n} from \"@langchain/classic/document_loaders/fs/directory\";\nimport { BaseDocumentLoader } from \"@langchain/core/document_loaders/base\";\n\nexport const UNSTRUCTURED_API_FILETYPES = [\n \".txt\",\n \".text\",\n \".pdf\",\n \".docx\",\n \".doc\",\n \".jpg\",\n \".jpeg\",\n \".eml\",\n \".html\",\n \".htm\",\n \".md\",\n \".pptx\",\n \".ppt\",\n \".msg\",\n \".rtf\",\n \".xlsx\",\n \".xls\",\n \".odt\",\n \".epub\",\n];\n\n/**\n * Represents an element returned by the Unstructured API. It has\n * properties for the element type, text content, and metadata.\n */\ntype Element = {\n type: string;\n text: string;\n // this is purposefully loosely typed\n metadata: {\n [key: string]: unknown;\n };\n};\n\n/**\n * Represents the available strategies for the UnstructuredLoader. It can\n * be one of \"hi_res\", \"fast\", \"ocr_only\", or \"auto\".\n */\nexport type UnstructuredLoaderStrategy =\n | \"hi_res\"\n | \"fast\"\n | \"ocr_only\"\n | \"auto\";\n\n/**\n * Represents the available hi-res models for the UnstructuredLoader. It can\n * be one of \"chipper\".\n */\nexport type HiResModelName = \"chipper\";\n\n/**\n * To enable or disable table extraction for file types other than PDF, set\n * the skipInferTableTypes property in the UnstructuredLoaderOptions object.\n * The skipInferTableTypes property is an array of file types for which table\n * extraction is disabled. For example, to disable table extraction for .docx\n * and .doc files, set the skipInferTableTypes property to [\"docx\", \"doc\"].\n * You can also disable table extraction for all file types other than PDF by\n * setting the skipInferTableTypes property to [].\n */\nexport type SkipInferTableTypes =\n | \"txt\"\n | \"text\"\n | \"pdf\"\n | \"docx\"\n | \"doc\"\n | \"jpg\"\n | \"jpeg\"\n | \"eml\"\n | \"html\"\n | \"htm\"\n | \"md\"\n | \"pptx\"\n | \"ppt\"\n | \"msg\"\n | \"rtf\"\n | \"xlsx\"\n | \"xls\"\n | \"odt\"\n | \"epub\";\n\n/**\n * Set the chunking_strategy to chunk text into larger or smaller elements. Defaults to None with optional arg of by_title\n */\nexport type ChunkingStrategy = \"None\" | \"by_title\";\n\nexport type UnstructuredLoaderOptions = {\n apiKey?: string;\n apiUrl?: string;\n strategy?: StringWithAutocomplete<UnstructuredLoaderStrategy>;\n encoding?: string;\n ocrLanguages?: Array<string>;\n coordinates?: boolean;\n pdfInferTableStructure?: boolean;\n xmlKeepTags?: boolean;\n skipInferTableTypes?: Array<StringWithAutocomplete<SkipInferTableTypes>>;\n hiResModelName?: StringWithAutocomplete<HiResModelName>;\n includePageBreaks?: boolean;\n chunkingStrategy?: StringWithAutocomplete<ChunkingStrategy>;\n multiPageSections?: boolean;\n combineUnderNChars?: number;\n newAfterNChars?: number;\n maxCharacters?: number;\n extractImageBlockTypes?: string[];\n overlap?: number;\n overlapAll?: boolean;\n};\n\nexport type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {\n recursive?: boolean;\n unknown?: UnknownHandling;\n};\n\nexport type UnstructuredMemoryLoaderOptions = {\n buffer: Buffer;\n fileName: string;\n};\n\n/**\n * A document loader that uses the Unstructured API to load unstructured\n * documents. It supports both the new syntax with options object and the\n * legacy syntax for backward compatibility. The load() method sends a\n * partitioning request to the Unstructured API and retrieves the\n * partitioned elements. It creates a Document instance for each element\n * and returns an array of Document instances.\n *\n * It accepts either a filepath or an object containing a buffer and a filename\n * as input.\n */\nexport class UnstructuredLoader extends BaseDocumentLoader {\n public filePath: string;\n\n private buffer?: Buffer;\n\n private fileName?: string;\n\n private apiUrl = \"https://api.unstructured.io/general/v0/general\";\n\n private apiKey?: string;\n\n private strategy: StringWithAutocomplete<UnstructuredLoaderStrategy> =\n \"hi_res\";\n\n private encoding?: string;\n\n private ocrLanguages: Array<string> = [];\n\n private coordinates?: boolean;\n\n private pdfInferTableStructure?: boolean;\n\n private xmlKeepTags?: boolean;\n\n private skipInferTableTypes?: Array<\n StringWithAutocomplete<SkipInferTableTypes>\n >;\n\n private hiResModelName?: StringWithAutocomplete<HiResModelName>;\n\n private includePageBreaks?: boolean;\n\n private chunkingStrategy?: StringWithAutocomplete<ChunkingStrategy>;\n\n private multiPageSections?: boolean;\n\n private combineUnderNChars?: number;\n\n private newAfterNChars?: number;\n\n private maxCharacters?: number;\n\n private extractImageBlockTypes?: string[];\n\n private overlap?: number;\n\n private overlapAll?: boolean;\n\n constructor(\n filepathOrBufferOptions: string | UnstructuredMemoryLoaderOptions,\n unstructuredOptions: UnstructuredLoaderOptions | string = {}\n ) {\n super();\n\n // Temporary shim to avoid breaking existing users\n // Remove when API keys are enforced by Unstructured and existing code will break anyway\n const isLegacySyntax = typeof unstructuredOptions === \"string\";\n const isMemorySyntax = typeof filepathOrBufferOptions === \"object\";\n\n if (isMemorySyntax) {\n this.buffer = filepathOrBufferOptions.buffer;\n this.fileName = filepathOrBufferOptions.fileName;\n } else if (isLegacySyntax) {\n this.filePath = unstructuredOptions;\n this.apiUrl = filepathOrBufferOptions;\n } else {\n this.filePath = filepathOrBufferOptions;\n }\n\n if (!isLegacySyntax) {\n const options = unstructuredOptions;\n this.apiKey =\n options.apiKey ?? getEnvironmentVariable(\"UNSTRUCTURED_API_KEY\");\n this.apiUrl =\n options.apiUrl ??\n getEnvironmentVariable(\"UNSTRUCTURED_API_URL\") ??\n this.apiUrl;\n this.strategy = options.strategy ?? this.strategy;\n this.encoding = options.encoding;\n this.ocrLanguages = options.ocrLanguages ?? this.ocrLanguages;\n this.coordinates = options.coordinates;\n this.pdfInferTableStructure = options.pdfInferTableStructure;\n this.xmlKeepTags = options.xmlKeepTags;\n this.skipInferTableTypes = options.skipInferTableTypes;\n this.hiResModelName = options.hiResModelName;\n this.includePageBreaks = options.includePageBreaks;\n this.chunkingStrategy = options.chunkingStrategy;\n this.multiPageSections = options.multiPageSections;\n this.combineUnderNChars = options.combineUnderNChars;\n this.newAfterNChars = options.newAfterNChars;\n this.maxCharacters = options.maxCharacters;\n this.extractImageBlockTypes = options.extractImageBlockTypes;\n this.overlap = options.overlap;\n this.overlapAll = options.overlapAll ?? false;\n }\n }\n\n async _partition() {\n let buffer = this.buffer;\n let fileName = this.fileName;\n\n if (!buffer) {\n const { readFile, basename } = await this.imports();\n\n buffer = await readFile(this.filePath);\n fileName = basename(this.filePath);\n\n // I'm aware this reads the file into memory first, but we have lots of work\n // to do on then consuming Documents in a streaming fashion anyway, so not\n // worried about this for now.\n }\n\n const formData = new FormData();\n formData.append(\"files\", new Blob([buffer]), fileName);\n formData.append(\"strategy\", this.strategy);\n this.ocrLanguages.forEach((language) => {\n formData.append(\"ocr_languages\", language);\n });\n if (this.encoding) {\n formData.append(\"encoding\", this.encoding);\n }\n if (this.coordinates === true) {\n formData.append(\"coordinates\", \"true\");\n }\n if (this.pdfInferTableStructure === true) {\n formData.append(\"pdf_infer_table_structure\", \"true\");\n }\n if (this.xmlKeepTags === true) {\n formData.append(\"xml_keep_tags\", \"true\");\n }\n if (this.skipInferTableTypes) {\n formData.append(\n \"skip_infer_table_types\",\n JSON.stringify(this.skipInferTableTypes)\n );\n }\n if (this.hiResModelName) {\n formData.append(\"hi_res_model_name\", this.hiResModelName);\n }\n if (this.includePageBreaks) {\n formData.append(\"include_page_breaks\", \"true\");\n }\n if (this.chunkingStrategy) {\n formData.append(\"chunking_strategy\", this.chunkingStrategy);\n }\n if (this.multiPageSections !== undefined) {\n formData.append(\n \"multipage_sections\",\n this.multiPageSections ? \"true\" : \"false\"\n );\n }\n if (this.combineUnderNChars !== undefined) {\n formData.append(\"combine_under_n_chars\", String(this.combineUnderNChars));\n }\n if (this.newAfterNChars !== undefined) {\n formData.append(\"new_after_n_chars\", String(this.newAfterNChars));\n }\n if (this.maxCharacters !== undefined) {\n formData.append(\"max_characters\", String(this.maxCharacters));\n }\n\n if (this.extractImageBlockTypes !== undefined) {\n formData.append(\n \"extract_image_block_types\",\n JSON.stringify(this.extractImageBlockTypes)\n );\n }\n\n if (this.overlap !== undefined) {\n formData.append(\"overlap\", String(this.overlap));\n }\n\n if (this.overlapAll === true) {\n formData.append(\"overlap_all\", \"true\");\n }\n\n const headers = {\n \"UNSTRUCTURED-API-KEY\": this.apiKey ?? \"\",\n };\n\n const response = await fetch(this.apiUrl, {\n method: \"POST\",\n body: formData,\n headers,\n });\n\n if (!response.ok) {\n throw new Error(\n `Failed to partition file ${this.filePath} with error ${\n response.status\n } and message ${await response.text()}`\n );\n }\n\n const elements = await response.json();\n if (!Array.isArray(elements)) {\n throw new Error(\n `Expected partitioning request to return an array, but got ${elements}`\n );\n }\n return elements.filter((el) => typeof el.text === \"string\") as Element[];\n }\n\n async load(): Promise<Document[]> {\n const elements = await this._partition();\n\n const documents: Document[] = [];\n for (const element of elements) {\n const { metadata, text } = element;\n if (typeof text === \"string\" && text !== \"\") {\n documents.push(\n new Document({\n pageContent: text,\n metadata: {\n ...metadata,\n category: element.type,\n },\n })\n );\n }\n }\n\n return documents;\n }\n\n async imports(): Promise<{\n readFile: typeof ReadFileT;\n basename: typeof BasenameT;\n }> {\n try {\n const { readFile } = await import(\"node:fs/promises\");\n const { basename } = await import(\"node:path\");\n return { readFile, basename };\n } catch (e) {\n console.error(e);\n throw new Error(\n `Failed to load fs/promises. TextLoader available only on environment 'node'. It appears you are running environment '${getEnv()}'. See https://<link to docs> for alternatives.`\n );\n }\n }\n}\n\n/**\n * A document loader that loads unstructured documents from a directory\n * using the UnstructuredLoader. It creates a UnstructuredLoader instance\n * for each supported file type and passes it to the DirectoryLoader\n * constructor.\n * @example\n * ```typescript\n * const loader = new UnstructuredDirectoryLoader(\"path/to/directory\", {\n * apiKey: \"MY_API_KEY\",\n * });\n * const docs = await loader.load();\n * ```\n */\nexport class UnstructuredDirectoryLoader extends DirectoryLoader {\n constructor(\n directoryPathOrLegacyApiUrl: string,\n optionsOrLegacyDirectoryPath: UnstructuredDirectoryLoaderOptions | string,\n legacyOptionRecursive = true,\n legacyOptionUnknown: UnknownHandling = UnknownHandling.Warn\n ) {\n let directoryPath;\n let options: UnstructuredDirectoryLoaderOptions;\n // Temporary shim to avoid breaking existing users\n // Remove when API keys are enforced by Unstructured and existing code will break anyway\n const isLegacySyntax = typeof optionsOrLegacyDirectoryPath === \"string\";\n if (isLegacySyntax) {\n directoryPath = optionsOrLegacyDirectoryPath;\n options = {\n apiUrl: directoryPathOrLegacyApiUrl,\n recursive: legacyOptionRecursive,\n unknown: legacyOptionUnknown,\n };\n } else {\n directoryPath = directoryPathOrLegacyApiUrl;\n options = optionsOrLegacyDirectoryPath;\n }\n const loader = (p: string) => new UnstructuredLoader(p, options);\n const loaders = UNSTRUCTURED_API_FILETYPES.reduce(\n (loadersObject: LoadersMapping, filetype: string) => {\n loadersObject[filetype] = loader;\n return loadersObject;\n },\n {}\n );\n super(directoryPath, loaders, options.recursive, options.unknown);\n }\n}\n\nexport { UnknownHandling };\n"],"mappings":";;;;;;;;;;;;;AAYA,MAAa,6BAA6B;CACxC;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACD;;;;;;;;;;;;AA6GD,IAAa,qBAAb,cAAwCA,sCAAAA,mBAAmB;CACzD;CAEA;CAEA;CAEA,SAAiB;CAEjB;CAEA,WACE;CAEF;CAEA,eAAsC,EAAE;CAExC;CAEA;CAEA;CAEA;CAIA;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA;CAEA,YACE,yBACA,sBAA0D,EAAE,EAC5D;AACA,SAAO;EAIP,MAAM,iBAAiB,OAAO,wBAAwB;AAGtD,MAFuB,OAAO,4BAA4B,UAEtC;AAClB,QAAK,SAAS,wBAAwB;AACtC,QAAK,WAAW,wBAAwB;aAC/B,gBAAgB;AACzB,QAAK,WAAW;AAChB,QAAK,SAAS;QAEd,MAAK,WAAW;AAGlB,MAAI,CAAC,gBAAgB;GACnB,MAAM,UAAU;AAChB,QAAK,SACH,QAAQ,WAAA,GAAA,0BAAA,wBAAiC,uBAAuB;AAClE,QAAK,SACH,QAAQ,WAAA,GAAA,0BAAA,wBACe,uBAAuB,IAC9C,KAAK;AACP,QAAK,WAAW,QAAQ,YAAY,KAAK;AACzC,QAAK,WAAW,QAAQ;AACxB,QAAK,eAAe,QAAQ,gBAAgB,KAAK;AACjD,QAAK,cAAc,QAAQ;AAC3B,QAAK,yBAAyB,QAAQ;AACtC,QAAK,cAAc,QAAQ;AAC3B,QAAK,sBAAsB,QAAQ;AACnC,QAAK,iBAAiB,QAAQ;AAC9B,QAAK,oBAAoB,QAAQ;AACjC,QAAK,mBAAmB,QAAQ;AAChC,QAAK,oBAAoB,QAAQ;AACjC,QAAK,qBAAqB,QAAQ;AAClC,QAAK,iBAAiB,QAAQ;AAC9B,QAAK,gBAAgB,QAAQ;AAC7B,QAAK,yBAAyB,QAAQ;AACtC,QAAK,UAAU,QAAQ;AACvB,QAAK,aAAa,QAAQ,cAAc;;;CAI5C,MAAM,aAAa;EACjB,IAAI,SAAS,KAAK;EAClB,IAAI,WAAW,KAAK;AAEpB,MAAI,CAAC,QAAQ;GACX,MAAM,EAAE,UAAU,aAAa,MAAM,KAAK,SAAS;AAEnD,YAAS,MAAM,SAAS,KAAK,SAAS;AACtC,cAAW,SAAS,KAAK,SAAS;;EAOpC,MAAM,WAAW,IAAI,UAAU;AAC/B,WAAS,OAAO,SAAS,IAAI,KAAK,CAAC,OAAO,CAAC,EAAE,SAAS;AACtD,WAAS,OAAO,YAAY,KAAK,SAAS;AAC1C,OAAK,aAAa,SAAS,aAAa;AACtC,YAAS,OAAO,iBAAiB,SAAS;IAC1C;AACF,MAAI,KAAK,SACP,UAAS,OAAO,YAAY,KAAK,SAAS;AAE5C,MAAI,KAAK,gBAAgB,KACvB,UAAS,OAAO,eAAe,OAAO;AAExC,MAAI,KAAK,2BAA2B,KAClC,UAAS,OAAO,6BAA6B,OAAO;AAEtD,MAAI,KAAK,gBAAgB,KACvB,UAAS,OAAO,iBAAiB,OAAO;AAE1C,MAAI,KAAK,oBACP,UAAS,OACP,0BACA,KAAK,UAAU,KAAK,oBAAoB,CACzC;AAEH,MAAI,KAAK,eACP,UAAS,OAAO,qBAAqB,KAAK,eAAe;AAE3D,MAAI,KAAK,kBACP,UAAS,OAAO,uBAAuB,OAAO;AAEhD,MAAI,KAAK,iBACP,UAAS,OAAO,qBAAqB,KAAK,iBAAiB;AAE7D,MAAI,KAAK,sBAAsB,KAAA,EAC7B,UAAS,OACP,sBACA,KAAK,oBAAoB,SAAS,QACnC;AAEH,MAAI,KAAK,uBAAuB,KAAA,EAC9B,UAAS,OAAO,yBAAyB,OAAO,KAAK,mBAAmB,CAAC;AAE3E,MAAI,KAAK,mBAAmB,KAAA,EAC1B,UAAS,OAAO,qBAAqB,OAAO,KAAK,eAAe,CAAC;AAEnE,MAAI,KAAK,kBAAkB,KAAA,EACzB,UAAS,OAAO,kBAAkB,OAAO,KAAK,cAAc,CAAC;AAG/D,MAAI,KAAK,2BAA2B,KAAA,EAClC,UAAS,OACP,6BACA,KAAK,UAAU,KAAK,uBAAuB,CAC5C;AAGH,MAAI,KAAK,YAAY,KAAA,EACnB,UAAS,OAAO,WAAW,OAAO,KAAK,QAAQ,CAAC;AAGlD,MAAI,KAAK,eAAe,KACtB,UAAS,OAAO,eAAe,OAAO;EAGxC,MAAM,UAAU,EACd,wBAAwB,KAAK,UAAU,IACxC;EAED,MAAM,WAAW,MAAM,MAAM,KAAK,QAAQ;GACxC,QAAQ;GACR,MAAM;GACN;GACD,CAAC;AAEF,MAAI,CAAC,SAAS,GACZ,OAAM,IAAI,MACR,4BAA4B,KAAK,SAAS,cACxC,SAAS,OACV,eAAe,MAAM,SAAS,MAAM,GACtC;EAGH,MAAM,WAAW,MAAM,SAAS,MAAM;AACtC,MAAI,CAAC,MAAM,QAAQ,SAAS,CAC1B,OAAM,IAAI,MACR,6DAA6D,WAC9D;AAEH,SAAO,SAAS,QAAQ,OAAO,OAAO,GAAG,SAAS,SAAS;;CAG7D,MAAM,OAA4B;EAChC,MAAM,WAAW,MAAM,KAAK,YAAY;EAExC,MAAM,YAAwB,EAAE;AAChC,OAAK,MAAM,WAAW,UAAU;GAC9B,MAAM,EAAE,UAAU,SAAS;AAC3B,OAAI,OAAO,SAAS,YAAY,SAAS,GACvC,WAAU,KACR,IAAIC,0BAAAA,SAAS;IACX,aAAa;IACb,UAAU;KACR,GAAG;KACH,UAAU,QAAQ;KACnB;IACF,CAAC,CACH;;AAIL,SAAO;;CAGT,MAAM,UAGH;AACD,MAAI;GACF,MAAM,EAAE,aAAa,MAAM,OAAO;GAClC,MAAM,EAAE,aAAa,MAAM,OAAO;AAClC,UAAO;IAAE;IAAU;IAAU;WACtB,GAAG;AACV,WAAQ,MAAM,EAAE;AAChB,SAAM,IAAI,MACR,yHAAA,GAAA,0BAAA,SAAgI,CAAC,iDAClI;;;;;;;;;;;;;;;;;AAkBP,IAAa,8BAAb,cAAiDC,iDAAAA,gBAAgB;CAC/D,YACE,6BACA,8BACA,wBAAwB,MACxB,sBAAuCC,iDAAAA,gBAAgB,MACvD;EACA,IAAI;EACJ,IAAI;AAIJ,MADuB,OAAO,iCAAiC,UAC3C;AAClB,mBAAgB;AAChB,aAAU;IACR,QAAQ;IACR,WAAW;IACX,SAAS;IACV;SACI;AACL,mBAAgB;AAChB,aAAU;;EAEZ,MAAM,UAAU,MAAc,IAAI,mBAAmB,GAAG,QAAQ;EAChE,MAAM,UAAU,2BAA2B,QACxC,eAA+B,aAAqB;AACnD,iBAAc,YAAY;AAC1B,UAAO;KAET,EAAE,CACH;AACD,QAAM,eAAe,SAAS,QAAQ,WAAW,QAAQ,QAAQ"}