@tonytruong/chatbot-ai-lib
Version:
AI-powered healthcare automation, document parsing, OpenAI, embeddings, RAG, vector DB, Facebook OAuth.
100 lines • 4.24 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.smartChunkText = smartChunkText;
exports.readAndChunk = readAndChunk;
exports.chunkText = chunkText;
const tslib_1 = require("tslib");
const fs_1 = tslib_1.__importDefault(require("fs"));
const pdf_parse_1 = tslib_1.__importDefault(require("pdf-parse"));
const mammoth_1 = tslib_1.__importDefault(require("mammoth"));
const xlsx_1 = tslib_1.__importDefault(require("xlsx"));
const sentence_splitter_1 = tslib_1.__importDefault(require("sentence-splitter"));
const logger_1 = require("../../../shared/logger");
/**
 * Splits text into chunks of at most `chunkSize` UTF-16 code units, breaking
 * on sentence boundaries where possible.
 *
 * Sentences are obtained from `sentence-splitter`; if that call throws, the
 * text is split after each "." instead. Pieces are packed greedily into the
 * current chunk, and a new chunk is started when the next piece would
 * overflow it. Concatenating the returned chunks reproduces the pieces in
 * their original order.
 *
 * @param {string} text - Text to chunk. Non-string or empty input yields `[]`.
 * @param {number} [chunkSize=2000] - Maximum chunk length. When it is not a
 *   finite number >= 1, oversized sentences are left whole.
 * @returns {string[]} The chunks, in document order.
 */
function smartChunkText(text, chunkSize = 2000) {
    if (!text || typeof text !== "string")
        return [];
    let sentences = [];
    try {
        const splitResult = sentence_splitter_1.default.split(text);
        if (!Array.isArray(splitResult))
            return [];
        sentences = splitResult;
    }
    catch (err) {
        // Best-effort fallback: cut after every "." while keeping the text
        // byte-for-byte intact (no period is invented for a trailing fragment).
        sentences = text.match(/[^.]*\.|[^.]+$/g) || [];
    }
    // The splitter yields node objects carrying the source text in `raw`; the
    // fallback yields plain strings. Normalise per element and drop anything
    // that is not a non-empty string so "undefined" never leaks into a chunk.
    const rawSentences = sentences
        .map((s) => (typeof s === "string" ? s : s && s.raw))
        .filter((s) => typeof s === "string" && s !== "");
    if (rawSentences.length === 0)
        return [];
    // Hard-splitting needs a usable step; otherwise keep oversized sentences whole.
    const canHardSplit = Number.isFinite(chunkSize) && chunkSize >= 1;
    const step = canHardSplit ? Math.floor(chunkSize) : 0;
    const chunks = [];
    let current = "";
    for (const sentence of rawSentences) {
        if ((current + sentence).length > chunkSize) {
            if (current)
                chunks.push(current);
            current = "";
        }
        if (canHardSplit && sentence.length > chunkSize) {
            // A single sentence exceeds the limit: cut it into fixed-size pieces.
            // The final piece stays open so following sentences can join it.
            // Note: cuts are by UTF-16 code unit and may separate a surrogate pair.
            let offset = 0;
            while (sentence.length - offset > chunkSize) {
                chunks.push(sentence.slice(offset, offset + step));
                offset += step;
            }
            current = sentence.slice(offset);
        }
        else {
            current += sentence;
        }
    }
    if (current)
        chunks.push(current);
    return chunks;
}
/**
 * Reads a document from disk, extracts its plain text and returns it split
 * into chunks by `smartChunkText` (default chunk size).
 *
 * The file type is decided by the extension of `fileName`, not `filePath`.
 * Supported extensions (case-insensitive): pdf, txt, docx, xlsx. Workbooks
 * are flattened to CSV, one sheet after another, joined with newlines.
 *
 * @param {string} filePath - Location of the file to read.
 * @param {string} fileName - Name whose extension selects the parser.
 * @returns {Promise<string[]>} Text chunks in document order.
 * @throws {Error} With `status: 400` when an argument is invalid, the name
 *   has no extension, the extension is unsupported, or no text could be
 *   extracted. Errors from reading or parsing the file propagate unchanged.
 */
function readAndChunk(filePath, fileName) {
    return tslib_1.__awaiter(this, void 0, void 0, function* () {
        if (!fileName || typeof fileName !== "string") {
            logger_1.logger.error("readAndChunk: fileName is invalid", new Error(String(fileName)));
            throw Object.assign(new Error("Invalid fileName: " + fileName), {
                status: 400,
            });
        }
        if (!filePath || typeof filePath !== "string") {
            logger_1.logger.error("readAndChunk: filePath is invalid", new Error(String(filePath)));
            throw Object.assign(new Error("Invalid filePath: " + filePath), {
                status: 400,
            });
        }
        const parts = fileName.split(".");
        if (parts.length < 2) {
            logger_1.logger.error("File name does not have an extension: " + fileName);
            throw Object.assign(new Error("File name does not have an extension: " + fileName), { status: 400 });
        }
        const ext = parts.pop().toLowerCase();
        // Reject unknown types before touching the disk, and report them the
        // same way as every other validation failure (logged, status 400).
        const supported = ["pdf", "txt", "docx", "xlsx"];
        if (!supported.includes(ext)) {
            logger_1.logger.error("Unsupported file type: " + ext);
            throw Object.assign(new Error("Unsupported file type: " + ext), {
                status: 400,
            });
        }
        // One non-blocking read shared by all parsers, so the event loop is
        // not stalled while a large upload is loaded.
        const buffer = yield fs_1.default.promises.readFile(filePath);
        let text = "";
        if (ext === "pdf") {
            const data = yield (0, pdf_parse_1.default)(buffer);
            text = data.text;
        }
        else if (ext === "txt") {
            text = buffer.toString("utf-8");
        }
        else if (ext === "docx") {
            const result = yield mammoth_1.default.extractRawText({ buffer });
            text = result.value;
        }
        else {
            // ext === "xlsx": parse the in-memory workbook, then emit each sheet as CSV.
            const workbook = xlsx_1.default.read(buffer, { type: "buffer" });
            text = workbook.SheetNames.map((name) => xlsx_1.default.utils.sheet_to_csv(workbook.Sheets[name])).join("\n");
        }
        if (!text || typeof text !== "string" || !text.trim()) {
            logger_1.logger.error(`No text extracted from file: ${fileName} ext: ${ext}`);
            throw Object.assign(new Error(`Không thể trích xuất nội dung từ file: ${fileName} (loại: ${ext}) - Có thể file PDF là scan hoặc không có text layer, hoặc file bị lỗi.`), { status: 400 });
        }
        return smartChunkText(text);
    });
}
/**
 * Cuts text into consecutive fixed-size slices, ignoring sentence boundaries.
 *
 * Sizes are counted in UTF-16 code units, so a cut may land between the two
 * halves of a surrogate pair.
 *
 * @param {string} text - Text to slice. Non-string or empty input yields `[]`.
 * @param {number} [chunkSize=1500] - Slice length; must be a number >= 1.
 * @returns {string[]} Slices in order; only the last may be shorter.
 * @throws {RangeError} If `chunkSize` is not a number >= 1. A zero or
 *   negative step would never advance and the loop would not terminate.
 */
function chunkText(text, chunkSize = 1500) {
    if (!text || typeof text !== "string")
        return [];
    if (typeof chunkSize !== "number" || !(chunkSize >= 1)) {
        throw new RangeError("chunkSize must be a number >= 1, got: " + chunkSize);
    }
    const chunks = [];
    for (let i = 0; i < text.length; i += chunkSize) {
        chunks.push(text.slice(i, i + chunkSize));
    }
    return chunks;
}
//# sourceMappingURL=FileService.js.map