documind2
Version:
AI-powered document extraction.
184 lines (183 loc) • 7.7 kB
JavaScript
;
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.documind = void 0;
const os_1 = __importDefault(require("os"));
const path_1 = __importDefault(require("path"));
const fs_extra_1 = __importDefault(require("fs-extra"));
const p_limit_1 = __importDefault(require("p-limit"));
const openAI_1 = require("./openAI");
const types_1 = require("./types");
const utils_1 = require("./utils");
const utils_2 = require("./utils");
const documind = async ({ cleanup = true, concurrency = 10, filePath, llmParams = {}, maintainFormat = false, model, //= ModelOptions.gpt_4o_mini,
openaiAPIKey = "", outputDir, pagesToConvertAsImages = -1, tempDir = os_1.default.tmpdir(), }) => {
const baseUrl = process.env.BASE_URL || "https://api.openai.com/v1";
const defaultModel = model ??
(baseUrl !== "https://api.openai.com/v1"
? types_1.ModelOptions.llava // Default for custom base URL
: types_1.ModelOptions.gpt_4o_mini); // Default for OpenAI
let inputTokenCount = 0;
let outputTokenCount = 0;
let priorPage = "";
const aggregatedMarkdown = [];
const startTime = new Date();
llmParams = (0, utils_2.validateLLMParams)(llmParams);
// Validators
if (!openaiAPIKey || !openaiAPIKey.length) {
throw new Error("Missing OpenAI API key");
}
if (!filePath || !filePath.length) {
throw new Error("Missing file path");
}
// Ensure temp directory exists + create temp folder
const rand = Math.floor(1000 + Math.random() * 9000).toString();
const tempDirectory = path_1.default.join(tempDir || os_1.default.tmpdir(), `documind-file-${rand}`);
await fs_extra_1.default.ensureDir(tempDirectory);
// Download the PDF. Get file name.
const { extension, localPath } = await (0, utils_1.downloadFile)({
filePath,
tempDir: tempDirectory,
});
if (!localPath)
throw "Failed to save file to local drive";
// Sort the `pagesToConvertAsImages` array to make sure we use the right index
// for `formattedPages` as `pdf2pic` always returns images in order
if (Array.isArray(pagesToConvertAsImages)) {
pagesToConvertAsImages.sort((a, b) => a - b);
}
// Convert file to PDF if necessary
if (extension !== ".png") {
let pdfPath;
if (extension === ".pdf") {
pdfPath = localPath;
}
else {
pdfPath = await (0, utils_1.convertFileToPdf)({
extension,
localPath,
tempDir: tempDirectory,
});
}
// Convert the file to a series of images
await (0, utils_1.convertPdfToImages)({
localPath: pdfPath,
pagesToConvertAsImages,
tempDir: tempDirectory,
});
}
const endOfPath = localPath.split("/")[localPath.split("/").length - 1];
const rawFileName = endOfPath.split(".")[0];
const fileName = rawFileName
.replace(/[^\w\s]/g, "")
.replace(/\s+/g, "_")
.toLowerCase()
.substring(0, 255); // Truncate file name to 255 characters to prevent ENAMETOOLONG errors
// Get list of converted images
const files = await fs_extra_1.default.readdir(tempDirectory);
const images = files.filter((file) => file.endsWith(".png"));
if (maintainFormat) {
// Use synchronous processing
for (const image of images) {
const imagePath = path_1.default.join(tempDirectory, image);
try {
const { content, inputTokens, outputTokens } = await (0, openAI_1.getCompletion)({
apiKey: openaiAPIKey,
imagePath,
llmParams,
maintainFormat,
model: defaultModel,
priorPage,
});
const formattedMarkdown = (0, utils_1.formatMarkdown)(content);
inputTokenCount += inputTokens;
outputTokenCount += outputTokens;
// Update prior page to result from last processing step
priorPage = formattedMarkdown;
// Add all markdown results to array
aggregatedMarkdown.push(formattedMarkdown);
}
catch (error) {
console.error(`Failed to process image ${image}:`, error);
throw error;
}
}
}
else {
// Process in parallel with a limit on concurrent pages
const processPage = async (image) => {
const imagePath = path_1.default.join(tempDirectory, image);
try {
const { content, inputTokens, outputTokens } = await (0, openAI_1.getCompletion)({
apiKey: openaiAPIKey,
imagePath,
llmParams,
maintainFormat,
model: defaultModel,
priorPage,
});
const formattedMarkdown = (0, utils_1.formatMarkdown)(content);
inputTokenCount += inputTokens;
outputTokenCount += outputTokens;
// Update prior page to result from last processing step
priorPage = formattedMarkdown;
// Add all markdown results to array
return formattedMarkdown;
}
catch (error) {
console.error(`Failed to process image ${image}:`, error);
throw error;
}
};
// Function to process pages with concurrency limit
const processPagesInBatches = async (images, limit) => {
const results = [];
const promises = images.map((image, index) => limit(() => processPage(image).then((result) => {
results[index] = result;
})));
await Promise.all(promises);
return results;
};
const limit = (0, p_limit_1.default)(concurrency);
const results = await processPagesInBatches(images, limit);
const filteredResults = results.filter(utils_1.isString);
aggregatedMarkdown.push(...filteredResults);
}
// Write the aggregated markdown to a file
if (outputDir) {
const resultFilePath = path_1.default.join(outputDir, `${fileName}.md`);
await fs_extra_1.default.writeFile(resultFilePath, aggregatedMarkdown.join("\n\n"));
}
// Cleanup the downloaded PDF file
if (cleanup)
await fs_extra_1.default.remove(tempDirectory);
// Format JSON response
const endTime = new Date();
const completionTime = endTime.getTime() - startTime.getTime();
const formattedPages = aggregatedMarkdown.map((el, i) => {
let pageNumber;
// If we convert all pages, just use the array index
if (pagesToConvertAsImages === -1) {
pageNumber = i + 1;
}
// Else if we convert specific pages, use the page number from the parameter
else if (Array.isArray(pagesToConvertAsImages)) {
pageNumber = pagesToConvertAsImages[i];
}
// Else, the parameter is a number and use it for the page number
else {
pageNumber = pagesToConvertAsImages;
}
return { content: el, page: pageNumber, contentLength: el.length };
});
return {
completionTime,
fileName,
inputTokens: inputTokenCount,
outputTokens: outputTokenCount,
pages: formattedPages,
};
};
exports.documind = documind;