UNPKG

@tonytruong/chatbot-ai-lib

Version:

AI-powered healthcare automation, document parsing, OpenAI, embeddings, RAG, vector DB, Facebook OAuth.

100 lines 4.24 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.smartChunkText = smartChunkText; exports.readAndChunk = readAndChunk; exports.chunkText = chunkText; const tslib_1 = require("tslib"); const fs_1 = tslib_1.__importDefault(require("fs")); const pdf_parse_1 = tslib_1.__importDefault(require("pdf-parse")); const mammoth_1 = tslib_1.__importDefault(require("mammoth")); const xlsx_1 = tslib_1.__importDefault(require("xlsx")); const sentence_splitter_1 = tslib_1.__importDefault(require("sentence-splitter")); const logger_1 = require("../../../shared/logger"); function smartChunkText(text, chunkSize = 2000) { if (!text || typeof text !== "string") return []; let sentences = []; try { const splitResult = sentence_splitter_1.default.split(text); if (!Array.isArray(splitResult)) return []; sentences = splitResult; } catch (err) { sentences = text.split(".").map((s) => s + "."); } if (!Array.isArray(sentences) || sentences.length === 0) return []; const rawSentences = typeof sentences[0] === "string" ? sentences : sentences.map((s) => s.raw); const chunks = []; let current = ""; for (const sentence of rawSentences) { if ((current + sentence).length > chunkSize) { if (current) chunks.push(current); current = ""; } current += sentence; } if (current) chunks.push(current); return chunks; } function readAndChunk(filePath, fileName) { return tslib_1.__awaiter(this, void 0, void 0, function* () { if (!fileName || typeof fileName !== "string") { logger_1.logger.error("readAndChunk: fileName is invalid", new Error(String(fileName))); throw Object.assign(new Error("Invalid fileName: " + fileName), { status: 400, }); } if (!filePath || typeof filePath !== "string") { logger_1.logger.error("readAndChunk: filePath is invalid", new Error(String(filePath))); throw Object.assign(new Error("Invalid filePath: " + filePath), { status: 400, }); } const parts = fileName.split("."); if (!parts || parts.length < 2) { logger_1.logger.error("File name does not have an extension: " + fileName); throw Object.assign(new Error("File name does not have an extension: " + fileName), { status: 400 }); } const ext = parts.pop().toLowerCase(); let text = ""; if (ext === "pdf") { const dataBuffer = fs_1.default.readFileSync(filePath); const data = yield (0, pdf_parse_1.default)(dataBuffer); text = data.text; } else if (ext === "txt") { text = fs_1.default.readFileSync(filePath, "utf-8"); } else if (ext === "docx") { const data = fs_1.default.readFileSync(filePath); const result = yield mammoth_1.default.extractRawText({ buffer: data }); text = result.value; } else if (ext === "xlsx") { const workbook = xlsx_1.default.readFile(filePath); text = workbook.SheetNames.map((name) => xlsx_1.default.utils.sheet_to_csv(workbook.Sheets[name])).join("\n"); } else { throw new Error("Unsupported file type: " + ext); } if (!text || typeof text !== "string" || !text.trim()) { logger_1.logger.error(`No text extracted from file: ${fileName} ext: ${ext}`); throw Object.assign(new Error(`Không thể trích xuất nội dung từ file: ${fileName} (loại: ${ext}) - Có thể file PDF là scan hoặc không có text layer, hoặc file bị lỗi.`), { status: 400 }); } return smartChunkText(text); }); } function chunkText(text, chunkSize = 1500) { const chunks = []; for (let i = 0; i < text.length; i += chunkSize) { chunks.push(text.slice(i, i + chunkSize)); } return chunks; } //# sourceMappingURL=FileService.js.map