UNPKG

flexbiz-server

Version:

Flexible Server

13 lines (12 loc) 4.61 kB
const fs = require("fs");
const path = require("path");
const PDFParser = require("pdf2json");
const mammoth = require("mammoth");
const xlsx = require("xlsx");
const FileChunk = global.getModel("filechunk");
const { getPathFile } = require("./utils");
const StaticPool = require("../libs/WorkerStaticPool");

// Chunking parameters used when a file is vectorized.
const DEFAULT_CHUNK_SIZE = 1000;
const DEFAULT_CHUNK_OVERLAP = 200;

// Upper bound on embedding requests in flight at once for a single file.
// The previous implementation started one request per chunk all at once.
const EMBEDDING_CONCURRENCY = 8;

const DEFAULT_OLLAMA_BASE_URL = "http://localhost:11434";
const EMBEDDING_MODEL = "nomic-embed-text";

/**
 * Reads the raw text content of a PDF through pdf2json.
 *
 * @param {string} filePath - Path of the PDF file.
 * @returns {Promise<string>} Raw text as returned by `getRawTextContent()`.
 */
function extractPdfText(filePath) {
  return new Promise((resolve, reject) => {
    // The first constructor argument is an optional context object. The
    // original passed `this`, which is not a meaningful object in a plain
    // function call, so pass null explicitly. The second argument (1) is
    // kept as in the original.
    const pdfParser = new PDFParser(null, 1);
    pdfParser.on("pdfParser_dataError", (errData) => {
      const cause = errData && errData.parserError ? errData.parserError : errData;
      // Always reject with an Error so callers can rely on `.message`.
      reject(cause instanceof Error ? cause : new Error(String(cause)));
    });
    pdfParser.on("pdfParser_dataReady", () => {
      resolve(pdfParser.getRawTextContent());
    });
    pdfParser.loadPDF(filePath);
  });
}

/**
 * Converts every non-empty sheet of a workbook to CSV text, each sheet wrapped
 * in a header line naming the sheet and a trailing separator line.
 *
 * @param {string} filePath - Path of the .xlsx/.xls file.
 * @returns {string} Concatenated CSV text, or "" when all sheets are empty.
 */
function extractSpreadsheetText(filePath) {
  const workbook = xlsx.readFile(filePath);
  let text = "";
  for (const sheetName of workbook.SheetNames) {
    const csvData = xlsx.utils.sheet_to_csv(workbook.Sheets[sheetName]);
    if (!csvData.trim()) {
      continue;
    }
    text += `\n--- D\u1eef li\u1ec7u t\u1eeb Sheet: ${sheetName} ---\n`;
    text += csvData;
    text += "\n-----------------------------------\n";
  }
  return text;
}

/**
 * Extracts plain text from a file, choosing the extractor by file extension
 * (case-insensitive): .pdf, .docx, .xlsx/.xls, .txt/.md/.csv.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<string>} The extracted text.
 * @throws {Error} When the extension is not one of the supported formats, or
 *   when the underlying reader fails.
 */
async function extractTextFromFile(filePath) {
  const ext = path.extname(filePath).toLowerCase();
  switch (ext) {
    case ".pdf":
      return extractPdfText(filePath);
    case ".docx": {
      const result = await mammoth.extractRawText({ path: filePath });
      return result.value;
    }
    case ".xlsx":
    case ".xls":
      return extractSpreadsheetText(filePath);
    case ".txt":
    case ".md":
    case ".csv":
      // Asynchronous read so a large text file does not block the event loop.
      return fs.promises.readFile(filePath, "utf8");
    default:
      throw Error(`H\u1ec7 th\u1ed1ng ch\u01b0a h\u1ed7 tr\u1ee3 b\u00f3c t\u00e1ch n\u1ed9i dung cho \u0111\u1ecbnh d\u1ea1ng: ${ext}`);
  }
}

/**
 * Requests an embedding vector for `text` from the Ollama HTTP API
 * (`POST {OLLAMA_BASE_URL}/api/embeddings`, model "nomic-embed-text").
 * The base URL falls back to http://localhost:11434 when the environment
 * variable is unset or empty.
 *
 * @param {string} text - Text to embed.
 * @returns {Promise<number[]>} The `embedding` array from the response body.
 * @throws {Error} On network failure, a non-OK HTTP status, or a response
 *   without an `embedding` array. The error is logged (with a setup hint)
 *   through the global `Logger` before being rethrown.
 */
async function generateEmbedding(text) {
  try {
    const baseUrl = process.env.OLLAMA_BASE_URL || DEFAULT_OLLAMA_BASE_URL;
    const response = await fetch(`${baseUrl}/api/embeddings`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ model: EMBEDDING_MODEL, prompt: text }),
    });
    if (!response.ok) {
      throw Error(`L\u1ed7i t\u1eeb Ollama API: ${response.statusText}`);
    }
    const { embedding } = await response.json();
    // Fail loudly instead of letting `undefined` be stored as a vector.
    if (!Array.isArray(embedding)) {
      throw new Error("Ollama response did not contain an embedding array");
    }
    return embedding;
  } catch (error) {
    // `Logger` is not defined in this module; it is expected to be a global.
    Logger.error("[vectorService] L\u1ed7i khi g\u1ecdi Ollama Embedding:", error.message);
    Logger.warn(
      "[vectorService] C\u00e0i \u0111\u1eb7t Ollama: ",
      "\n docker run -d -v ollama_data:/root/.ollama -p 11434:11434 --name ollama_server ollama/ollama\n docker exec -it ollama_server ollama pull nomic-embed-text\n "
    );
    throw error;
  }
}

/**
 * Collapses all whitespace runs in `text` to single spaces, trims it, and cuts
 * it into windows of `chunkSize` characters that advance by
 * `chunkSize - chunkOverlap` characters.
 *
 * @param {string} text - Source text.
 * @param {number} [chunkSize=1000] - Maximum characters per chunk.
 * @param {number} [chunkOverlap=200] - Characters shared by consecutive chunks.
 * @returns {string[]} The chunks in order; empty when the text is blank.
 * @throws {RangeError} When `chunkSize` is not positive or `chunkOverlap` is
 *   not smaller than `chunkSize` (the window would never advance).
 */
function splitTextIntoChunks(text, chunkSize = DEFAULT_CHUNK_SIZE, chunkOverlap = DEFAULT_CHUNK_OVERLAP) {
  const step = chunkSize - chunkOverlap;
  // Written with negated comparisons so NaN is rejected as well.
  if (!(chunkSize > 0) || !(step > 0)) {
    throw new RangeError("chunkSize must be positive and chunkOverlap must be smaller than chunkSize");
  }

  const cleanText = text.replace(/\s+/g, " ").trim();
  const chunks = [];
  for (let start = 0; start < cleanText.length; start += step) {
    chunks.push(cleanText.substring(start, start + chunkSize));
    // Once a window reaches the end of the text, any further window would be
    // fully contained in this one, so stop instead of emitting a duplicate tail.
    if (start + chunkSize >= cleanText.length) {
      break;
    }
  }
  return chunks;
}

/**
 * Extracts the text of a stored file, splits it into overlapping chunks,
 * embeds each chunk and creates one `FileChunk` record per chunk.
 *
 * This is best-effort: every failure is logged to the console and swallowed,
 * so the returned promise always resolves with `undefined`.
 * NOTE(review): if a chunk fails part-way, records created before the failure
 * are left in place; confirm whether callers clean up before retrying.
 *
 * @param {object} fileDoc - File document. This function reads `_id`,
 *   `filename`, `department`, `docType` and `accessLevel`, and passes the
 *   document to `getPathFile` to resolve its path on disk.
 * @returns {Promise<void>}
 */
async function processAndStoreFile(fileDoc) {
  try {
    console.log(`[B\u1eaft \u0111\u1ea7u] X\u1eed l\u00fd file: ${fileDoc.filename || fileDoc._id}`);

    const filePath = getPathFile(fileDoc);
    if (!fs.existsSync(filePath)) {
      throw Error(`Kh\u00f4ng t\u00ecm th\u1ea5y file t\u1ea1i \u0111\u01b0\u1eddng d\u1eabn: ${filePath}`);
    }

    const rawText = await extractTextFromFile(filePath);
    if (!rawText || rawText.trim() === "") {
      console.warn(`[C\u1ea3nh b\u00e1o] File ${filePath} tr\u1ed1ng ho\u1eb7c kh\u00f4ng c\u00f3 text \u0111\u1ec3 tr\u00edch xu\u1ea5t.`);
      return;
    }

    const chunks = splitTextIntoChunks(rawText, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP);
    console.log(`- C\u1eaft th\u00e0nh c\u00f4ng ${chunks.length} chunks.`);

    // Embed and store in bounded batches so a large file does not open one
    // concurrent request per chunk against the embedding server.
    for (let offset = 0; offset < chunks.length; offset += EMBEDDING_CONCURRENCY) {
      const batch = chunks.slice(offset, offset + EMBEDDING_CONCURRENCY);
      await Promise.all(
        batch.map(async (content, indexInBatch) => {
          const embedding = await generateEmbedding(content);
          return FileChunk.create({
            file_id: fileDoc._id,
            chunk_index: offset + indexInBatch,
            content,
            metadata: {
              department: fileDoc.department || "general",
              docType: fileDoc.docType || "unknown",
              accessLevel: fileDoc.accessLevel || "internal",
            },
            embedding,
          });
        })
      );
    }

    console.log(`[Th\u00e0nh c\u00f4ng] \u0110\u00e3 vector h\u00f3a to\u00e0n b\u1ed9 d\u1eef li\u1ec7u c\u1ee7a file ${fileDoc._id}`);
  } catch (error) {
    console.error(`[L\u1ed7i] X\u1eed l\u00fd file ${fileDoc._id} th\u1ea5t b\u1ea1i:`, error.message);
  }
}

/**
 * Hands a file to the `workers/inputWorker.js` worker through a lazily created
 * pool that is cached on `global.createVectorContentFile`.
 *
 * @param {object} file - Sent to the worker as `processFile`, together with
 *   `configs.database` (`configs` is expected to be a global).
 * @returns {Promise<object>} Resolves with the worker response. Rejects with a
 *   plain object `{ error }` when the response is missing or carries an
 *   `error` (shape kept as-is for existing callers).
 */
const processAndStoreFileWorker = (file) => {
  const fileWorker = path.join(path.dirname(__dirname), "workers", "inputWorker.js");
  if (!global.createVectorContentFile) {
    // NOTE(review): the meaning of the (0, 1) arguments is defined by
    // WorkerStaticPool and is not visible from this file.
    global.createVectorContentFile = new StaticPool(fileWorker, 0, 1);
  }

  return new Promise((resolve, reject) => {
    global.createVectorContentFile.exec(
      { processFile: file, configs: { database: configs.database } },
      (response) => {
        // A missing response is treated as a failure too, so the promise
        // always settles instead of throwing inside the callback.
        if (!response || response.error) {
          reject({
            error: (response && response.error) || "\u0110\u00e3 c\u00f3 l\u1ed7i khi x\u1eed l\u00fd d\u1eef li\u1ec7u",
          });
          return;
        }
        resolve(response);
      }
    );
  });
};

module.exports = { processAndStoreFile, processAndStoreFileWorker, generateEmbedding, extractTextFromFile };