documind2
Version:
AI-powered document extraction.
295 lines (253 loc) • 8.28 kB
text/typescript
import { pipeline } from "node:stream/promises";
import path from "path";
import { promisify } from "util";
import axios from "axios";
import fs from "fs-extra";
import { convert } from "libreoffice-convert";
import mime from "mime-types";
import { fromPath } from "pdf2pic";
import sharp from "sharp";
import * as Tesseract from "tesseract.js";
import type { LLMParams } from "./types";
const convertAsync = promisify(convert);
const defaultLLMParams: LLMParams = {
frequencyPenalty: 0, // OpenAI defaults to 0
maxTokens: 2000,
presencePenalty: 0, // OpenAI defaults to 0
temperature: 0,
topP: 1, // OpenAI defaults to 1
};
export const validateLLMParams = (params: Partial<LLMParams>): LLMParams => {
const validKeys = Object.keys(defaultLLMParams);
for (const [key, value] of Object.entries(params)) {
if (!validKeys.includes(key)) {
throw new Error(`Invalid LLM parameter: ${key}`);
}
if (typeof value !== "number") {
throw new Error(`Value for '${key}' must be a number`);
}
}
return { ...defaultLLMParams, ...params };
};
export const encodeImageToBase64 = async (imagePath: string) => {
const imageBuffer = await fs.readFile(imagePath);
return imageBuffer.toString("base64");
};
// Strip out the ```markdown wrapper
export const formatMarkdown = (text: string) => {
let formattedMarkdown = text?.trim();
let loopCount = 0;
const maxLoops = 3;
const startsWithMarkdown = formattedMarkdown.startsWith("```markdown");
while (startsWithMarkdown && loopCount < maxLoops) {
const endsWithClosing = formattedMarkdown.endsWith("```");
if (startsWithMarkdown && endsWithClosing) {
const outermostBlockRegex = /^```markdown\n([\s\S]*?)\n```$/;
const match = outermostBlockRegex.exec(formattedMarkdown);
if (match) {
formattedMarkdown = match[1].trim();
loopCount++;
} else {
break;
}
} else {
break;
}
}
return formattedMarkdown;
};
export const isString = (value: string | null): value is string => {
return value !== null;
};
export const isValidUrl = (string: string): boolean => {
let url;
try {
url = new URL(string);
} catch (_) {
return false;
}
return url.protocol === "http:" || url.protocol === "https:";
};
// Save file to local tmp directory
export const downloadFile = async ({
filePath,
tempDir,
}: {
filePath: string;
tempDir: string;
}): Promise<{ extension: string; localPath: string }> => {
// Shorten the file name by removing URL parameters
const baseFileName = path.basename(filePath.split("?")[0]);
const localPath = path.join(tempDir, baseFileName);
let mimetype;
// Check if filePath is a URL
if (isValidUrl(filePath)) {
const writer = fs.createWriteStream(localPath);
const response = await axios({
url: filePath,
method: "GET",
responseType: "stream",
});
if (response.status !== 200) {
throw new Error(`HTTP error! Status: ${response.status}`);
}
mimetype = response.headers?.["content-type"];
await pipeline(response.data, writer);
} else {
// If filePath is a local file, copy it to the temp directory
await fs.copyFile(filePath, localPath);
}
if (!mimetype) {
mimetype = mime.lookup(localPath);
}
let extension = mime.extension(mimetype) || "";
if (!extension) {
if (mimetype === "binary/octet-stream") {
extension = ".bin";
} else {
throw new Error("File extension missing");
}
}
if (!extension.startsWith(".")) {
extension = `.${extension}`;
}
return { extension, localPath };
};
// Extract text confidence from image buffer using Tesseract
export const getTextFromImage = async (
buffer: Buffer,
): Promise<{ confidence: number }> => {
try {
// Get image and metadata
const image = sharp(buffer);
const metadata = await image.metadata();
// Crop to a 150px wide column in the center of the document.
// This section produced the highest confidence/speed tradeoffs.
const cropWidth = 150;
const cropHeight = metadata.height || 0;
const left = Math.max(0, Math.floor((metadata.width! - cropWidth) / 2));
const top = 0;
// Extract the cropped image
const croppedBuffer = await image
.extract({ left, top, width: cropWidth, height: cropHeight })
.toBuffer();
// Pass the croppedBuffer to Tesseract.recognize
// @TODO: How can we generalize this to non eng languages?
const {
data: { confidence },
} = await Tesseract.recognize(croppedBuffer, "eng");
return { confidence };
} catch (error) {
console.error("Error during OCR:", error);
return { confidence: 0 };
}
};
// Correct image orientation based on OCR confidence
// Run Tesseract on 4 different orientations of the image and compare the output
const correctImageOrientation = async (buffer: Buffer): Promise<Buffer> => {
const image = sharp(buffer);
const rotations = [0, 90, 180, 270];
const results = await Promise.all(
rotations.map(async (rotation) => {
const rotatedImageBuffer = await image
.clone()
.rotate(rotation)
.toBuffer();
const { confidence } = await getTextFromImage(rotatedImageBuffer);
return { rotation, confidence };
}),
);
// Find the rotation with the best confidence score
const bestResult = results.reduce((best, current) =>
current.confidence > best.confidence ? current : best,
);
if (bestResult.rotation !== 0) {
console.log(
`Reorienting image ${bestResult.rotation} degrees (Confidence: ${bestResult.confidence}%).`,
);
}
// Rotate the image to the best orientation
const correctedImageBuffer = await image
.rotate(bestResult.rotation)
.toBuffer();
return correctedImageBuffer;
};
// Convert each page to a png, correct orientation, and save that image to tmp
export const convertPdfToImages = async ({
localPath,
pagesToConvertAsImages,
tempDir,
}: {
localPath: string;
pagesToConvertAsImages: number | number[];
tempDir: string;
}) => {
const options = {
density: 300,
format: "png",
height: 1056,
preserveAspectRatio: true,
saveFilename: path.basename(localPath, path.extname(localPath)),
savePath: tempDir,
};
const storeAsImage = fromPath(localPath, options);
try {
const convertResults = await storeAsImage.bulk(pagesToConvertAsImages, {
responseType: "buffer",
});
await Promise.all(
convertResults.map(async (result) => {
if (!result || !result.buffer) {
throw new Error("Could not convert page to image buffer");
}
if (!result.page) throw new Error("Could not identify page data");
const paddedPageNumber = result.page.toString().padStart(5, "0");
// Correct the image orientation
const correctedBuffer = await correctImageOrientation(result.buffer);
const imagePath = path.join(
tempDir,
`${options.saveFilename}_page_${paddedPageNumber}.png`,
);
await fs.writeFile(imagePath, correctedBuffer);
}),
);
return convertResults;
} catch (err) {
console.error("Error during PDF conversion:", err);
throw err;
}
};
// Convert each page (from other formats like docx) to a png and save that image to tmp
export const convertFileToPdf = async ({
extension,
localPath,
tempDir,
}: {
extension: string;
localPath: string;
tempDir: string;
}): Promise<string> => {
const inputBuffer = await fs.readFile(localPath);
const outputFilename = path.basename(localPath, extension) + ".pdf";
const outputPath = path.join(tempDir, outputFilename);
try {
const pdfBuffer = await convertAsync(inputBuffer, ".pdf", undefined);
await fs.writeFile(outputPath, pdfBuffer);
return outputPath;
} catch (err) {
console.error(`Error converting ${extension} to .pdf:`, err);
throw err;
}
};
const camelToSnakeCase = (str: string) =>
str.replace(/[A-Z]/g, (letter: string) => `_${letter.toLowerCase()}`);
export const convertKeysToSnakeCase = (
obj: Record<string, any> | null,
): Record<string, any> => {
if (typeof obj !== "object" || obj === null) {
return obj ?? {};
}
return Object.fromEntries(
Object.entries(obj).map(([key, value]) => [camelToSnakeCase(key), value]),
);
};