UNPKG

@lobehub/chat

Version:

Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal, and extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.

174 lines (151 loc) 4.87 kB
import { ChunkingLoader } from 'src/libs/langchain';
import { Strategy } from 'unstructured-client/sdk/models/shared';

import { knowledgeEnv } from '@/config/knowledge';
import type { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas';
import { ChunkingStrategy, Unstructured } from '@/libs/unstructured';

import { ChunkingRuleParser } from './rules';
import type { ChunkingService } from './rules';

/**
 * Input for {@link ContentChunk.chunkContent}.
 */
export interface ChunkContentParams {
  /** Raw bytes of the file to split into chunks. */
  content: Uint8Array;
  /**
   * File type string. Only the segment after the last `/`, lower-cased, is used
   * to look up the chunking rules.
   */
  fileType: string;
  /** File name, forwarded unchanged to the chunking backend. */
  filename: string;
  /** NOTE(review): accepted but not read anywhere in this class — confirm whether it is still needed. */
  mode?: 'fast' | 'hi-res';
}

/**
 * Output of a chunking backend.
 */
interface ChunkResult {
  /** Chunks produced by whichever backend handled the file. */
  chunks: NewChunkItem[];
  /** Only populated by the Unstructured backend (built from `originElements`). */
  unstructuredChunks?: NewUnstructuredChunkItem[];
}

/**
 * Splits file content into chunks by trying the chunking services configured
 * for the file type in order, falling back to LangChain when none of them
 * produces a result.
 */
export class ContentChunk {
  /**
   * Element types dropped from Unstructured results. Built once here instead of
   * once per element inside the filter callbacks.
   */
  private static readonly ignoredElementTypes = new Set<string>(['PageNumber', 'Footer']);

  private unstructuredClient: Unstructured;

  private langchainClient: ChunkingLoader;

  /** Parsed `FILE_TYPE_CHUNKING_RULES`, keyed by lower-cased file type suffix. */
  private chunkingRules: Record<string, ChunkingService[]>;

  constructor() {
    this.unstructuredClient = new Unstructured();
    this.langchainClient = new ChunkingLoader();
    this.chunkingRules = ChunkingRuleParser.parse(knowledgeEnv.FILE_TYPE_CHUNKING_RULES || '');
  }

  /**
   * Resolves the ordered list of services to try for a file type.
   *
   * @param fileType - File type string; the part after the last `/` is used as the rule key.
   * @returns The configured services for that key, or `['default']` when no rule exists.
   */
  private getChunkingServices(fileType: string): ChunkingService[] {
    const ext = fileType.split('/').pop()?.toLowerCase() || '';

    return this.chunkingRules[ext] || ['default'];
  }

  /**
   * Chunks the given content with the first configured service that succeeds.
   *
   * A service that is unavailable (`unstructured` without credentials) or not
   * implemented yet (`doc2x`) is skipped. A service that throws is logged and
   * skipped unless it is the last one in the list, in which case its error is
   * rethrown. If every service is skipped, LangChain is used as the fallback.
   *
   * @param params - File bytes plus the name and type used to pick a backend.
   * @returns The chunks produced by the service that handled the file.
   * @throws Whatever the last service in the list throws, or whatever the
   *   LangChain fallback throws.
   */
  async chunkContent(params: ChunkContentParams): Promise<ChunkResult> {
    const services = this.getChunkingServices(params.fileType);
    const lastIndex = services.length - 1;

    for (const [index, service] of services.entries()) {
      try {
        switch (service) {
          case 'unstructured': {
            if (this.canUseUnstructured()) {
              return await this.chunkByUnstructured(params.filename, params.content);
            }
            break;
          }

          case 'doc2x': {
            // Future implementation
            break;
          }

          default: {
            return await this.chunkByLangChain(params.filename, params.content);
          }
        }
      } catch (error) {
        // Compare by position, not by value: a service name that appears more
        // than once in the list must not be mistaken for the final attempt.
        if (index === lastIndex) throw error;

        // Otherwise continue to next service
        console.error(`Chunking failed with service ${service}:`, error);
      }
    }

    // Fallback to langchain if no service succeeded
    return await this.chunkByLangChain(params.filename, params.content);
  }

  /**
   * @returns `true` only when both the Unstructured API key and server URL are configured.
   */
  private canUseUnstructured(): boolean {
    return !!(knowledgeEnv.UNSTRUCTURED_API_KEY && knowledgeEnv.UNSTRUCTURED_SERVER_URL);
  }

  /**
   * Partitions the file through the Unstructured client and maps both the
   * composite and the origin elements to database chunk items.
   *
   * @param filename - File name passed to the partition call.
   * @param content - Raw file bytes passed to the partition call.
   * @returns Composite elements as `chunks` and origin elements as `unstructuredChunks`.
   */
  private chunkByUnstructured = async (
    filename: string,
    content: Uint8Array,
  ): Promise<ChunkResult> => {
    const result = await this.unstructuredClient.partition({
      chunkingStrategy: ChunkingStrategy.ByPage,
      fileContent: content,
      filename,
      strategy: Strategy.Auto,
    });

    // after finish partition, we need to filter out some elements
    const documents = result.compositeElements
      .filter((e) => !ContentChunk.ignoredElementTypes.has(e.type))
      .map((item, index): NewChunkItem => {
        const {
          text_as_html,
          page_number,
          page_name,
          image_mime_type,
          image_base64,
          parent_id,
          languages,
          coordinates,
        } = item.metadata;

        return {
          id: item.element_id,
          index,
          metadata: {
            coordinates,
            image_base64,
            image_mime_type,
            languages,
            page_name,
            page_number,
            parent_id,
            text_as_html,
          },
          text: item.text,
          type: item.type,
        };
      });

    const chunks = result.originElements
      .filter((e) => !ContentChunk.ignoredElementTypes.has(e.type))
      .map((item, index): NewUnstructuredChunkItem => {
        const {
          text_as_html,
          page_number,
          page_name,
          image_mime_type,
          image_base64,
          parent_id,
          languages,
          coordinates,
        } = item.metadata;

        return {
          compositeId: item.compositeId,
          id: item.element_id,
          index,
          // Unlike composite elements, parent_id is lifted out of metadata into `parentId`.
          metadata: {
            coordinates,
            image_base64,
            image_mime_type,
            languages,
            page_name,
            page_number,
            text_as_html,
          },
          parentId: parent_id,
          text: item.text,
          type: item.type,
        };
      });

    return { chunks: documents, unstructuredChunks: chunks };
  };

  /**
   * Partitions the file through the LangChain loader and maps each returned
   * document to a chunk item of type `LangChainElement`.
   *
   * @param filename - File name passed to the loader.
   * @param content - Raw file bytes passed to the loader.
   * @returns The mapped documents as `chunks`; `unstructuredChunks` is left unset.
   */
  private chunkByLangChain = async (
    filename: string,
    content: Uint8Array,
  ): Promise<ChunkResult> => {
    const res = await this.langchainClient.partitionContent(filename, content);

    const documents = res.map((item, index) => ({
      id: item.id,
      index,
      metadata: item.metadata,
      text: item.pageContent,
      type: 'LangChainElement',
    }));

    return { chunks: documents };
  };
}