@papra/lecture
Version:
A simple library to extract text from files
690 lines (673 loc) • 19.9 kB
JavaScript
import { createWorker, languages } from "tesseract.js";
import { XMLParser } from "fast-xml-parser";
import JSZip from "jszip";
import { x } from "tinyexec";
import { Buffer } from "node:buffer";
import sharp from "sharp";
import { extractImages, extractText as extractText$1, getDocumentProxy } from "unpdf";
import rtfParser from "rtf-parser";
//#region src/config.ts
// Every value of the `languages` map exported by tesseract.js.
// Used by parseConfig to validate configured OCR languages, and re-exported from this module.
const ocrLanguages = Object.values(languages);
/**
 * Validates the raw user configuration and fills in the tesseract defaults.
 *
 * @param {object} [args]
 * @param {object|null} [args.rawConfig] - User-supplied configuration. Both
 *   `undefined` and `null` are treated as an empty configuration.
 * @returns {{ config: object }} A copy of the raw configuration whose
 *   `tesseract.languages` is always a non-empty array (defaults to `["eng"]`).
 * @throws {Error} When a configured language is not part of `ocrLanguages`.
 */
function parseConfig({ rawConfig } = {}) {
  // A destructuring default only covers `undefined`; normalise `null` as well so
  // the property accesses below cannot throw.
  const safeConfig = rawConfig ?? {};
  const requestedLanguages = safeConfig.tesseract?.languages ?? [];

  const invalidLanguages = requestedLanguages.filter((language) => !ocrLanguages.includes(language));
  if (invalidLanguages.length > 0) {
    throw new Error(`Invalid languages for tesseract: ${invalidLanguages.join(", ")}. Valid languages are: ${ocrLanguages.join(", ")}`);
  }

  return {
    config: {
      ...safeConfig,
      tesseract: {
        ...safeConfig.tesseract,
        languages: requestedLanguages.length > 0 ? requestedLanguages : ["eng"],
      },
    },
  };
}
//#endregion
//#region src/extractors.models.ts
/**
 * Identity helper used to declare an extractor definition.
 *
 * @param {object} args - The extractor definition.
 * @returns {object} The very same object that was passed in.
 */
function defineTextExtractor(args) {
  const definition = args;
  return definition;
}
//#endregion
//#region src/utils/archive.ts
/**
 * Reads one entry of a zip archive as text.
 *
 * @param {object} args
 * @param {ArrayBuffer} args.arrayBuffer - Raw archive bytes handed to JSZip.
 * @param {string} args.filePath - Path of the entry inside the archive.
 * @returns {Promise<string|undefined>} The entry content, or `undefined` when
 *   the archive has no such entry.
 */
async function getFileContentFromArchive({ arrayBuffer, filePath }) {
  const archive = await JSZip.loadAsync(arrayBuffer);
  const entry = archive.file(filePath);
  if (entry === null || entry === undefined) {
    return undefined;
  }
  return await entry.async("text");
}
//#endregion
//#region src/extractors/doc.extractor.ts
/**
 * Text extractor for Word (.docx) documents.
 *
 * Reads `word/document.xml` from the archive, parses it and hands the parsed
 * tree to `extractTextFromDocx`. Returns an empty content when the archive has
 * no such entry.
 */
const docExtractorDefinition = defineTextExtractor({
  name: "doc",
  mimeTypes: ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
  extract: async ({ arrayBuffer }) => {
    const documentXml = await getFileContentFromArchive({
      arrayBuffer,
      filePath: "word/document.xml",
    });
    if (!documentXml) return { content: "" };

    const parser = new XMLParser({
      ignoreAttributes: true,
      isArray: () => false,
      textNodeName: "#text",
      // Keep tag values as strings. With the parser default, text such as
      // "2024", "007" or "true" is converted to a number/boolean, which loses
      // the original spelling and is not picked up as text downstream.
      parseTagValue: false,
    });
    return { content: extractTextFromDocx(parser.parse(documentXml)) };
  },
});
/**
 * Collects the text of every `w:p` (paragraph) node found in a parsed
 * `word/document.xml` tree.
 *
 * @param {unknown} parsed - Object tree produced by the XML parser.
 * @returns {string} Non-empty paragraphs joined by a blank line.
 */
function extractTextFromDocx(parsed) {
  const paragraphs = [];

  // Flattens every text fragment found under `root` into one single-spaced string.
  function extractTextFromNode(root) {
    const texts = [];

    function pushText(value) {
      const trimmed = String(value).trim();
      if (trimmed) texts.push(trimmed);
    }

    function traverse(node) {
      if (node === null || node === undefined) return;
      if (Array.isArray(node)) {
        node.forEach((child) => traverse(child));
        return;
      }
      if (typeof node === "object") {
        const ownText = node["#text"];
        // Explicit null check: a truthiness test would drop a text node equal to 0.
        if (ownText !== null && ownText !== undefined) pushText(ownText);
        for (const key of Object.keys(node)) {
          if (key !== "#text") traverse(node[key]);
        }
        return;
      }
      // The XML parser may hand back numbers/booleans for values such as "2024"
      // or "true"; they are document text too and must not be skipped.
      if (typeof node === "string" || typeof node === "number" || typeof node === "boolean") {
        pushText(node);
      }
    }

    traverse(root);
    return texts.join(" ").replace(/\s+/g, " ").trim();
  }

  // Walks the whole tree; each `w:p` value becomes one paragraph, and the walk
  // continues into every other key to reach paragraphs nested elsewhere.
  function findParagraphs(node) {
    if (node === null || node === undefined) return;
    if (Array.isArray(node)) {
      node.forEach((child) => findParagraphs(child));
      return;
    }
    if (typeof node !== "object") return;

    const paragraphData = node["w:p"];
    if (paragraphData !== undefined) {
      const paragraphNodes = Array.isArray(paragraphData) ? paragraphData : [paragraphData];
      for (const paragraphNode of paragraphNodes) {
        const text = extractTextFromNode(paragraphNode);
        if (text) paragraphs.push(text);
      }
    }
    for (const key of Object.keys(node)) {
      if (key !== "w:p") findParagraphs(node[key]);
    }
  }

  findParagraphs(parsed);
  return paragraphs.join("\n\n");
}
//#endregion
//#region src/utils/buffer.ts
/**
 * Wraps an `ArrayBuffer` into a Node.js `Buffer`; any other value is returned
 * untouched.
 *
 * @param {ArrayBuffer|*} maybeArrayBuffer
 * @returns {Buffer|*}
 */
function castToBuffer(maybeArrayBuffer) {
  if (maybeArrayBuffer instanceof ArrayBuffer) {
    return Buffer.from(maybeArrayBuffer);
  }
  return maybeArrayBuffer;
}
//#endregion
//#region src/utils/memoize.ts
/**
 * Returns a wrapper around `fn` that caches results per key.
 *
 * @param {Function} fn - Function whose results are cached.
 * @param {Function} [keyResolver] - Receives the call arguments and returns the
 *   cache key. Defaults to `JSON.stringify` of the argument list.
 * @returns {Function} Memoized function; `fn` runs once per distinct key.
 */
function memoize(fn, keyResolver) {
  // A Map instead of a plain object: keys like "toString" or "constructor"
  // must not resolve to inherited Object.prototype members.
  const cache = new Map();

  return (...args) => {
    const key = keyResolver ? keyResolver(...args) : JSON.stringify(args);
    if (cache.has(key)) return cache.get(key);

    const result = fn(...args);
    cache.set(key, result);
    return result;
  };
}
//#endregion
//#region src/tesseract/tesseract.usecases.ts
/**
 * Checks whether the tesseract command-line binary can be executed.
 *
 * @param {object} [args]
 * @param {string} [args.binary="tesseract"] - Binary name or path to probe.
 * @returns {Promise<boolean>} `true` only when running on Node.js and
 *   `<binary> --version` exits with code 0; any failure yields `false`.
 */
async function isTesseractCliAvailable({ binary = "tesseract" } = {}) {
  // Outside Node.js there is no way to spawn a process.
  if (typeof process === "undefined" || !process?.versions?.node) {
    return false;
  }

  try {
    const versionProbe = await x(binary, ["--version"], { throwOnError: true });
    return versionProbe.exitCode === 0;
  } catch {
    return false;
  }
}
// Memoized availability probe, cached per binary name.
// The key resolver mirrors the defaults of isTesseractCliAvailable so that a
// missing argument or an omitted `binary` shares the cache entry of "tesseract"
// instead of throwing or being cached under `undefined`.
const isTesseractCliAvailableMemoized = memoize(isTesseractCliAvailable, ({ binary = "tesseract" } = {}) => binary);
/**
 * Builds an OCR function backed by the tesseract command-line binary.
 *
 * @param {object} [args]
 * @param {string} [args.binary="tesseract"] - Binary name or path to run.
 * @param {string[]} [args.languages=["eng"]] - Languages used when a call does
 *   not provide its own.
 * @returns {Function} Async `(image, { languages }?) => string`. The image is
 *   written to the process stdin and the trimmed stdout is returned; any
 *   failure yields an empty string.
 */
function createTesseractCliExtractor({ binary = "tesseract", languages: factoryLanguages = ["eng"] } = {}) {
  const extractWithCli = async (maybeArrayBuffer, { languages: requestedLanguages = factoryLanguages } = {}) => {
    try {
      // "stdin"/"stdout" are passed as the input and output arguments; languages are "+"-joined.
      const cliArguments = ["stdin", "stdout", "-l", requestedLanguages.join("+")];
      const child = x(binary, cliArguments, { throwOnError: true });
      child.process.stdin.end(castToBuffer(maybeArrayBuffer));

      const output = await child;
      return output.stdout?.trim();
    } catch {
      return "";
    }
  };

  return extractWithCli;
}
/**
 * Builds an OCR function backed by tesseract.js.
 *
 * @param {object} [args]
 * @param {string[]} [args.languages=["eng"]] - Languages used when a call does
 *   not provide its own.
 * @returns {Function} Async `(image, { languages }?) => string`. A worker is
 *   created per call; the trimmed recognised text is returned and any failure
 *   yields an empty string.
 */
function createTesseractJsExtractor({ languages: factoryLanguages = ["eng"] } = {}) {
  return async (maybeArrayBuffer, { languages: languages$1 = factoryLanguages } = {}) => {
    try {
      const worker = await createWorker(languages$1);
      try {
        const { data: { text } } = await worker.recognize(castToBuffer(maybeArrayBuffer));
        return text?.trim();
      } finally {
        // Always release the worker, including when recognition throws;
        // otherwise every failed image leaks a worker.
        await worker.terminate();
      }
    } catch {
      return "";
    }
  };
}
/**
 * Picks the OCR backend: the tesseract CLI when it is available and not
 * overridden, otherwise tesseract.js.
 *
 * @param {object} [args]
 * @param {boolean} [args.forceJs=false] - Use tesseract.js even if the CLI exists.
 * @param {string} [args.binary] - CLI binary name or path.
 * @param {string[]} [args.languages] - OCR languages forwarded to the backend.
 * @returns {Promise<{ extract: Function, extractorType: string }>} The OCR
 *   function and either `"tesseract-cli"` or `"tesseract-js"`.
 */
async function createTesseractExtractor({ forceJs = false, binary, languages: ocrLanguageList } = {}) {
  // The availability probe runs in every case, exactly like before the restyle.
  const cliIsAvailable = await isTesseractCliAvailableMemoized({ binary });
  const shouldUseCli = cliIsAvailable && !forceJs;

  if (!shouldUseCli) {
    const jsExtract = createTesseractJsExtractor({ languages: ocrLanguageList });
    return { extract: jsExtract, extractorType: "tesseract-js" };
  }

  const cliExtract = createTesseractCliExtractor({ binary, languages: ocrLanguageList });
  return { extract: cliExtract, extractorType: "tesseract-cli" };
}
//#endregion
//#region src/extractors/img.extractor.ts
/**
 * Text extractor for PNG, JPEG, WebP and GIF images.
 * Runs OCR through whichever tesseract backend `createTesseractExtractor`
 * selects and reports that backend in `subExtractorsUsed`.
 */
const imageExtractorDefinition = defineTextExtractor({
  name: "image",
  mimeTypes: ["image/png", "image/jpeg", "image/webp", "image/gif"],
  extract: async ({ arrayBuffer, config }) => {
    const { extract: runOcr, extractorType: ocrBackend } = await createTesseractExtractor(config.tesseract);
    const recognisedText = await runOcr(arrayBuffer);
    return { content: recognisedText, subExtractorsUsed: [ocrBackend] };
  },
});
//#endregion
//#region src/extractors/odp.extractor.ts
/**
 * Text extractor for OpenDocument presentations (.odp).
 *
 * Reads `content.xml` from the archive, parses it and hands the parsed tree to
 * `extractTextFromOdp`. Returns an empty content when the archive has no such
 * entry.
 */
const odpExtractorDefinition = defineTextExtractor({
  name: "odp",
  mimeTypes: ["application/vnd.oasis.opendocument.presentation"],
  extract: async ({ arrayBuffer }) => {
    const contentXml = await getFileContentFromArchive({
      arrayBuffer,
      filePath: "content.xml",
    });
    if (!contentXml) return { content: "" };

    const parser = new XMLParser({
      ignoreAttributes: true,
      isArray: () => false,
      textNodeName: "#text",
      // Keep tag values as strings. With the parser default, text such as
      // "2024", "007" or "true" is converted to a number/boolean, which loses
      // the original spelling and is not picked up as text downstream.
      parseTagValue: false,
    });
    return { content: extractTextFromOdp(parser.parse(contentXml)) };
  },
});
/**
 * Collects the text of every `draw:page` (slide) found in a parsed ODP
 * `content.xml` tree.
 *
 * @param {unknown} parsed - Object tree produced by the XML parser.
 * @returns {string} Paragraphs of a slide are joined by a blank line; slides
 *   are joined by two blank lines. Slides without text are skipped.
 */
function extractTextFromOdp(parsed) {
  // Flattens every text fragment found under `root` into one single-spaced string.
  function extractTextFromNode(root) {
    const texts = [];

    function pushText(value) {
      const trimmed = String(value).trim();
      if (trimmed) texts.push(trimmed);
    }

    function traverse(node) {
      if (node === null || node === undefined) return;
      if (Array.isArray(node)) {
        node.forEach((child) => traverse(child));
        return;
      }
      if (typeof node === "object") {
        const ownText = node["#text"];
        // Explicit null check: a truthiness test would drop a text node equal to 0.
        if (ownText !== null && ownText !== undefined) pushText(ownText);
        for (const key of Object.keys(node)) {
          if (key !== "#text") traverse(node[key]);
        }
        return;
      }
      // The XML parser may hand back numbers/booleans for values such as "2024"
      // or "true"; they are slide text too and must not be skipped.
      if (typeof node === "string" || typeof node === "number" || typeof node === "boolean") {
        pushText(node);
      }
    }

    traverse(root);
    return texts.join(" ").replace(/\s+/g, " ").trim();
  }

  // Gathers the `text:p` paragraphs located anywhere below one slide.
  function collectSlideText(slide) {
    const paragraphs = [];

    function findParagraphs(node) {
      if (node === null || node === undefined) return;
      if (Array.isArray(node)) {
        node.forEach((child) => findParagraphs(child));
        return;
      }
      if (typeof node !== "object") return;

      const paragraphData = node["text:p"];
      if (paragraphData !== undefined) {
        const paragraphNodes = Array.isArray(paragraphData) ? paragraphData : [paragraphData];
        for (const paragraphNode of paragraphNodes) {
          const text = extractTextFromNode(paragraphNode);
          if (text) paragraphs.push(text);
        }
      }
      for (const key of Object.keys(node)) {
        if (key !== "text:p") findParagraphs(node[key]);
      }
    }

    findParagraphs(slide);
    return paragraphs.join("\n\n");
  }

  const slides = [];

  function extractSlides(node) {
    if (node === null || node === undefined) return;
    // Descend into arrays like the paragraph walkers do, so slides nested
    // under a repeated element are not silently ignored.
    if (Array.isArray(node)) {
      node.forEach((child) => extractSlides(child));
      return;
    }
    if (typeof node !== "object") return;

    const pageData = node["draw:page"];
    if (pageData !== undefined) {
      const pages = Array.isArray(pageData) ? pageData : [pageData];
      for (const page of pages) {
        const slideText = collectSlideText(page);
        if (slideText) slides.push(slideText);
      }
    }
    for (const key of Object.keys(node)) {
      if (key !== "draw:page") extractSlides(node[key]);
    }
  }

  extractSlides(parsed);
  return slides.join("\n\n\n");
}
//#endregion
//#region src/extractors/odt.extractor.ts
/**
 * Text extractor for OpenDocument text files (.odt).
 *
 * Reads `content.xml` from the archive, parses it and hands the parsed tree to
 * `extractTextFromOdt`. Returns an empty content when the archive has no such
 * entry.
 */
const odtExtractorDefinition = defineTextExtractor({
  name: "odt",
  mimeTypes: ["application/vnd.oasis.opendocument.text"],
  extract: async ({ arrayBuffer }) => {
    const contentXml = await getFileContentFromArchive({
      arrayBuffer,
      filePath: "content.xml",
    });
    if (!contentXml) return { content: "" };

    const parser = new XMLParser({
      ignoreAttributes: true,
      isArray: () => false,
      textNodeName: "#text",
      // Keep tag values as strings. With the parser default, text such as
      // "2024", "007" or "true" is converted to a number/boolean, which loses
      // the original spelling and is not picked up as text downstream.
      parseTagValue: false,
    });
    return { content: extractTextFromOdt(parser.parse(contentXml)) };
  },
});
/**
 * Collects the text of every `text:p` (paragraph) and `text:h` (heading) node
 * found in a parsed ODT `content.xml` tree.
 *
 * Within one parent object the `text:p` entries are emitted before the
 * `text:h` entries, because the parsed tree groups children by tag name.
 *
 * @param {unknown} parsed - Object tree produced by the XML parser.
 * @returns {string} Non-empty paragraphs and headings joined by a blank line.
 */
function extractTextFromOdt(parsed) {
  const paragraphs = [];

  // Flattens every text fragment found under `root` into one single-spaced string.
  function extractTextFromNode(root) {
    const texts = [];

    function pushText(value) {
      const trimmed = String(value).trim();
      if (trimmed) texts.push(trimmed);
    }

    function traverse(node) {
      if (node === null || node === undefined) return;
      if (Array.isArray(node)) {
        node.forEach((child) => traverse(child));
        return;
      }
      if (typeof node === "object") {
        const ownText = node["#text"];
        // Explicit null check: a truthiness test would drop a text node equal to 0.
        if (ownText !== null && ownText !== undefined) pushText(ownText);
        for (const key of Object.keys(node)) {
          if (key !== "#text") traverse(node[key]);
        }
        return;
      }
      // The XML parser may hand back numbers/booleans for values such as "2024"
      // or "true"; they are document text too and must not be skipped.
      if (typeof node === "string" || typeof node === "number" || typeof node === "boolean") {
        pushText(node);
      }
    }

    traverse(root);
    return texts.join(" ").replace(/\s+/g, " ").trim();
  }

  // Pushes the text of each entry stored under `tagName` of `node`, if any.
  function collectTagText(node, tagName) {
    const tagData = node[tagName];
    if (tagData === undefined) return;
    const entries = Array.isArray(tagData) ? tagData : [tagData];
    for (const entry of entries) {
      const text = extractTextFromNode(entry);
      if (text) paragraphs.push(text);
    }
  }

  function findParagraphs(node) {
    if (node === null || node === undefined) return;
    if (Array.isArray(node)) {
      node.forEach((child) => findParagraphs(child));
      return;
    }
    if (typeof node !== "object") return;

    collectTagText(node, "text:p");
    collectTagText(node, "text:h");
    for (const key of Object.keys(node)) {
      if (key !== "text:p" && key !== "text:h") findParagraphs(node[key]);
    }
  }

  findParagraphs(parsed);
  return paragraphs.join("\n\n");
}
//#endregion
//#region src/extractors/pdf.extractor.ts
// Text extractor for PDF documents.
// Strategy: read the text with unpdf first; when that text is empty after
// trimming, fall back to running OCR on every image extracted from each page.
// `logger` is optional; every logging call is guarded with `?.`.
const pdfExtractorDefinition = defineTextExtractor({
name: "pdf",
mimeTypes: ["application/pdf"],
extract: async ({ arrayBuffer, config, logger }) => {
const pdf = await getDocumentProxy(arrayBuffer);
// mergePages: true is requested so `text` is handled below as one string for the whole document.
const { text, totalPages: pageCount } = await extractText$1(pdf, { mergePages: true });
// Fast path: text was extracted, so no OCR is needed.
if (text && text.trim().length > 0) return {
content: text,
subExtractorsUsed: ["pdf-text"]
};
logger?.debug({ pageCount }, "No text found in PDF, falling back to OCR on images.");
const { extract, extractorType } = await createTesseractExtractor(config.tesseract);
// One entry per processed image, in page order then image order.
const imageTexts = [];
const startOcrTime = Date.now();
// Page numbers passed to extractImages run from 1 to pageCount inclusive.
for (let pageIndex = 1; pageIndex <= pageCount; pageIndex++) {
const images = await extractImages(pdf, pageIndex);
const imageCount = images.length;
if (imageCount === 0) {
logger?.debug({
pageIndex,
pageCount
}, "No images found on PDF page for OCR.");
continue;
}
logger?.debug({
pageIndex,
pageCount,
imageCount
}, "Extracted images from PDF page.");
// Images are processed one at a time (each step is awaited before the next).
for (const [imageIndex, image] of images.entries()) {
const startTime = Date.now();
// The extracted image is handed to sharp as raw pixel data (width/height/channels)
// and re-encoded as PNG before being given to the OCR function.
const imageBuffer = await sharp(image.data, { raw: {
width: image.width,
height: image.height,
channels: image.channels
} }).png().toBuffer();
const bufferDelay = Date.now() - startTime;
logger?.debug({
pageIndex,
pageCount,
imageIndex,
imageCount,
durationMs: bufferDelay,
imageWidth: image.width,
imageHeight: image.height,
imageSizeBytes: image.data.length
}, "Converted image to PNG buffer for OCR.");
const imageText = await extract(imageBuffer);
// OCR time only: total elapsed since startTime minus the PNG conversion time.
const ocrDelay = Date.now() - startTime - bufferDelay;
logger?.debug({
pageIndex,
pageCount,
imageIndex,
imageCount,
durationMs: ocrDelay
}, "Extracted text from image using OCR.");
// Pushed unconditionally, so an image with no recognised text still contributes an entry.
imageTexts.push(imageText);
}
}
const totalOcrDuration = Date.now() - startOcrTime;
logger?.info({
pageCount,
imagesProcessedCount: imageTexts.length,
durationMs: totalOcrDuration
}, "Completed OCR on PDF images.");
// Per-image OCR results are joined with a single newline.
return {
content: imageTexts.join("\n"),
subExtractorsUsed: [extractorType]
};
}
});
//#endregion
//#region src/extractors/pptx.extractor.ts
/**
 * Text extractor for PowerPoint (.pptx) presentations.
 *
 * Reads every `ppt/slides/slideN.xml` entry in ascending slide number, parses
 * it and hands the parsed tree to `extractTextFromSlide`. Slides without text
 * are skipped; the remaining ones are joined by two blank lines.
 */
const pptxExtractorDefinition = defineTextExtractor({
  name: "pptx",
  mimeTypes: ["application/vnd.openxmlformats-officedocument.presentationml.presentation"],
  extract: async ({ arrayBuffer }) => {
    const zip = await JSZip.loadAsync(arrayBuffer);

    // Numeric sort so that slide10 comes after slide2.
    const slideNumber = (name) => Number.parseInt(name.match(/slide(\d+)\.xml$/)?.[1] ?? "0", 10);
    const slideFiles = Object.keys(zip.files)
      .filter((name) => /^ppt\/slides\/slide\d+\.xml$/.test(name))
      .sort((a, b) => slideNumber(a) - slideNumber(b));

    // One parser for all slides; the options never change between slides.
    const parser = new XMLParser({
      ignoreAttributes: true,
      isArray: () => false,
      textNodeName: "#text",
      // Keep tag values as strings. With the parser default, text such as
      // "2024", "007" or "true" is converted to a number/boolean, which loses
      // the original spelling and is not picked up as text downstream.
      parseTagValue: false,
    });

    const slides = [];
    for (const slideFile of slideFiles) {
      const slideXml = await zip.file(slideFile)?.async("text");
      if (!slideXml) continue;
      const slideText = extractTextFromSlide(parser.parse(slideXml));
      if (slideText) slides.push(slideText);
    }
    return { content: slides.join("\n\n\n") };
  },
});
/**
 * Collects the text of every `a:p` (paragraph) node found in a parsed PPTX
 * slide tree.
 *
 * @param {unknown} parsed - Object tree produced by the XML parser.
 * @returns {string} Non-empty paragraphs joined by a blank line.
 */
function extractTextFromSlide(parsed) {
  // Flattens every text fragment found under `root` into one single-spaced string.
  function extractTextFromNode(root) {
    const texts = [];

    function pushText(value) {
      const trimmed = String(value).trim();
      if (trimmed) texts.push(trimmed);
    }

    function traverse(node) {
      if (node === null || node === undefined) return;
      if (Array.isArray(node)) {
        node.forEach((child) => traverse(child));
        return;
      }
      if (typeof node === "object") {
        const ownText = node["#text"];
        // Explicit null check: a truthiness test would drop a text node equal to 0.
        if (ownText !== null && ownText !== undefined) pushText(ownText);
        for (const key of Object.keys(node)) {
          if (key !== "#text") traverse(node[key]);
        }
        return;
      }
      // The XML parser may hand back numbers/booleans for values such as "2024"
      // or "true"; they are slide text too and must not be skipped.
      if (typeof node === "string" || typeof node === "number" || typeof node === "boolean") {
        pushText(node);
      }
    }

    traverse(root);
    return texts.join(" ").replace(/\s+/g, " ").trim();
  }

  const paragraphs = [];

  // Walks the whole tree; each `a:p` value becomes one paragraph, and the walk
  // continues into every other key to reach paragraphs nested elsewhere.
  function findParagraphs(node) {
    if (node === null || node === undefined) return;
    if (Array.isArray(node)) {
      node.forEach((child) => findParagraphs(child));
      return;
    }
    if (typeof node !== "object") return;

    const paragraphData = node["a:p"];
    if (paragraphData !== undefined) {
      const paragraphNodes = Array.isArray(paragraphData) ? paragraphData : [paragraphData];
      for (const paragraphNode of paragraphNodes) {
        const text = extractTextFromNode(paragraphNode);
        if (text) paragraphs.push(text);
      }
    }
    for (const key of Object.keys(node)) {
      if (key !== "a:p") findParagraphs(node[key]);
    }
  }

  findParagraphs(parsed);
  return paragraphs.join("\n\n");
}
//#endregion
//#region src/extractors/rtf.extractor.ts
/**
 * Text extractor for RTF documents.
 * Decodes the bytes with `TextDecoder`, parses the result with rtf-parser
 * (callback API adapted to a promise) and delegates to `extractTextFromRtf`.
 * A parser error rejects the returned promise.
 */
const rtfExtractorDefinition = defineTextExtractor({
  name: "rtf",
  mimeTypes: ["text/rtf", "application/rtf"],
  extract: async ({ arrayBuffer }) => {
    const rtfSource = new TextDecoder().decode(arrayBuffer);

    const parsedDocument = await new Promise((resolve, reject) => {
      rtfParser.string(rtfSource, (parseError, document) => {
        if (!parseError) {
          resolve(document);
        } else {
          reject(parseError);
        }
      });
    });

    return { content: extractTextFromRtf(parsedDocument) };
  },
});
/**
 * Turns a parsed RTF document into plain text.
 *
 * Each entry of `doc.content` becomes one paragraph: the string `value` of the
 * entry and of all its nested `content` items are concatenated and trimmed.
 *
 * @param {object} doc - Parsed document; only its `content` array is read.
 * @returns {string} Non-empty paragraphs joined by a blank line, or an empty
 *   string when `doc.content` is not an array.
 */
function extractTextFromRtf(doc) {
  // Depth-first collection of string values into `sink`.
  function gatherValues(item, sink) {
    if (!item) return;
    if (typeof item.value === "string") sink.push(item.value);
    if (Array.isArray(item.content)) {
      for (const child of item.content) gatherValues(child, sink);
    }
  }

  if (!(doc.content && Array.isArray(doc.content))) return "";

  const blockTexts = doc.content.map((contentBlock) => {
    const fragments = [];
    gatherValues(contentBlock, fragments);
    return fragments.join("").trim();
  });

  return blockTexts.filter((blockText) => blockText).join("\n\n");
}
//#endregion
//#region src/extractors/txt.extractor.ts
/**
 * Text extractor for plain-text formats: any `text/*` type plus a fixed list of
 * textual `application/*` types. The bytes are decoded with a default
 * `TextDecoder` and returned as-is.
 */
const txtExtractorDefinition = defineTextExtractor({
  name: "text",
  mimeTypes: [
    "text/*",
    "application/json",
    "application/xml",
    "application/javascript",
    "application/typescript",
    "application/graphql",
    "application/markdown",
    "application/yaml",
  ],
  extract: async ({ arrayBuffer }) => {
    const decoder = new TextDecoder();
    return { content: decoder.decode(arrayBuffer) };
  },
});
//#endregion
//#region src/extractors.registry.ts
// Default extractor registry, searched in order by getExtractor (first match wins).
// Order matters: the rtf extractor declares "text/rtf", which also matches the
// "text/*" wildcard of the txt extractor, so rtf must stay ahead of txt.
const extractorDefinitions = [
pdfExtractorDefinition,
rtfExtractorDefinition,
txtExtractorDefinition,
imageExtractorDefinition,
docExtractorDefinition,
pptxExtractorDefinition,
odtExtractorDefinition,
odpExtractorDefinition
];
/**
 * Finds the first extractor that declares the given MIME type, either exactly
 * or through a `type/*` wildcard.
 *
 * @param {object} args
 * @param {string} args.mimeType - MIME type; parameters (e.g. `; charset=utf-8`),
 *   surrounding whitespace and letter case are ignored. A non-string value
 *   matches nothing.
 * @param {Array<{ mimeTypes: string[] }>} [args.extractors] - Candidates, in
 *   priority order. Defaults to the built-in registry.
 * @returns {{ extractor: object|undefined }} The matching extractor, if any.
 */
function getExtractor({ mimeType, extractors = extractorDefinitions }) {
  // Only the "type/subtype" part identifies the format: drop any parameters
  // and normalise case, since MIME types are case-insensitive.
  const essence = typeof mimeType === "string" ? mimeType.split(";")[0].trim().toLowerCase() : "";
  const wildcardedMimeType = essence.replace(/\/.*/, "/*");

  const extractor = extractors.find(({ mimeTypes }) => mimeTypes.includes(essence) || mimeTypes.includes(wildcardedMimeType));
  return { extractor };
}
//#endregion
//#region src/extractors.usecases.ts
/**
 * Extracts the text of a file given its bytes and MIME type.
 *
 * @param {object} args
 * @param {ArrayBuffer} args.arrayBuffer - File content.
 * @param {string} args.mimeType - MIME type used to select the extractor.
 * @param {object} [args.config] - Raw configuration, validated by `parseConfig`.
 * @param {object} [args.logger] - Optional logger called as `(fields, message)`.
 * @returns {Promise<object>} `{ extractorName, extractorType, textContent,
 *   subExtractorsUsed }`, where `subExtractorsUsed` is always an array. When no
 *   extractor matches, the first three are `undefined`. When the extractor
 *   throws, the result carries the thrown value in `error` and no text.
 * @throws Whatever `parseConfig` throws for an invalid configuration; that
 *   validation runs before any extraction is attempted.
 */
async function extractText({ arrayBuffer, mimeType, config: rawConfig, logger }) {
  const { config } = parseConfig({ rawConfig });
  const { extractor } = getExtractor({ mimeType });

  if (!extractor) {
    logger?.warn({ mimeType }, "No extractor found");
    return {
      extractorName: void 0,
      extractorType: void 0,
      textContent: void 0,
      subExtractorsUsed: [],
    };
  }

  const extractorName = extractor.name;
  try {
    logger?.debug({ extractorName, mimeType }, "Starting extraction");
    const startTime = Date.now();
    const { content, subExtractorsUsed } = await extractor.extract({
      arrayBuffer,
      config,
      logger,
    });
    const duration = Date.now() - startTime;

    // Extractors without sub-extractors omit the field; normalise it so every
    // return path exposes an array.
    const usedSubExtractors = subExtractorsUsed ?? [];
    const extractorType = [extractorName, ...usedSubExtractors].join(":");
    logger?.info({
      extractorName,
      extractorType,
      mimeType,
      durationMs: duration,
    }, "Extraction completed");

    return {
      extractorName,
      extractorType,
      textContent: content,
      subExtractorsUsed: usedSubExtractors,
    };
  } catch (error) {
    // The failure is reported through the result; also surface it in the logs
    // when the logger offers an `error` method.
    logger?.error?.({ error, extractorName, mimeType }, "Extraction failed");
    return {
      error,
      extractorName,
      extractorType: void 0,
      textContent: void 0,
      subExtractorsUsed: [],
    };
  }
}
/**
 * Extracts the text of a `Blob`, using its bytes and its `type` as MIME type.
 *
 * @param {object} args
 * @param {Blob} args.blob - Source blob.
 * @param {...*} args.rest - Remaining options, forwarded to `extractText`;
 *   keys named `arrayBuffer` or `mimeType` take precedence over the blob's.
 * @returns {Promise<object>} The result of `extractText`.
 */
async function extractTextFromBlob({ blob, ...rest }) {
  const blobParams = {
    arrayBuffer: await blob.arrayBuffer(),
    mimeType: blob.type,
  };
  // `rest` is applied last so explicitly passed options win.
  return extractText(Object.assign(blobParams, rest));
}
/**
 * Extracts the text of a `File` by treating it as a blob.
 *
 * @param {object} args
 * @param {File} args.file - Source file, passed on as `blob`.
 * @param {...*} args.rest - Remaining options, forwarded unchanged.
 * @returns {Promise<object>} The result of `extractTextFromBlob`.
 */
async function extractTextFromFile({ file, ...rest }) {
  const blobArguments = { blob: file, ...rest };
  return extractTextFromBlob(blobArguments);
}
//#endregion
export { extractText, extractTextFromBlob, extractTextFromFile, ocrLanguages };
//# sourceMappingURL=index.js.map