
@papra/lecture


A simple library to extract text from files

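The bundle exports extractText, extractTextFromBlob, extractTextFromFile, and ocrLanguages. A minimal usage sketch follows, assuming a Node.js 19.8+ ESM project with @papra/lecture installed (fs.openAsBlob and top-level await available); the ./invoice.pdf path is a hypothetical placeholder.

import { openAsBlob } from "node:fs";
import { extractTextFromBlob } from "@papra/lecture";

// Hypothetical input file; the blob's mime type decides which extractor runs.
const blob = await openAsBlob("./invoice.pdf", { type: "application/pdf" });

const { textContent, extractorName, extractorType } = await extractTextFromBlob({ blob });

console.log(extractorName); // "pdf"
console.log(extractorType); // "pdf:pdf-text", or "pdf:tesseract-cli" / "pdf:tesseract-js" when OCR was needed
console.log(textContent);   // the extracted text, or undefined if no extractor matched the mime type

The bundled index.js source is reproduced below.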
import { createWorker, languages } from "tesseract.js";
import { XMLParser } from "fast-xml-parser";
import JSZip from "jszip";
import { x } from "tinyexec";
import { Buffer } from "node:buffer";
import sharp from "sharp";
import { extractImages, extractText as extractText$1, getDocumentProxy } from "unpdf";
import rtfParser from "rtf-parser";

//#region src/config.ts
const ocrLanguages = Object.values(languages);
function parseConfig({ rawConfig = {} } = {}) {
  const languages$1 = rawConfig.tesseract?.languages ?? [];
  const invalidLanguages = languages$1.filter((language) => !ocrLanguages.includes(language));
  if (invalidLanguages.length > 0) throw new Error(`Invalid languages for tesseract: ${invalidLanguages.join(", ")}. Valid languages are: ${ocrLanguages.join(", ")}`);
  return { config: {
    ...rawConfig ?? {},
    tesseract: {
      ...rawConfig.tesseract ?? {},
      languages: languages$1.length > 0 ? languages$1 : ["eng"]
    }
  } };
}
//#endregion

//#region src/extractors.models.ts
function defineTextExtractor(args) {
  return args;
}
//#endregion

//#region src/utils/archive.ts
async function getFileContentFromArchive({ arrayBuffer, filePath }) {
  const zip = await JSZip.loadAsync(arrayBuffer);
  const document = await zip.file(filePath)?.async("text");
  return document;
}
//#endregion

//#region src/extractors/doc.extractor.ts
const docExtractorDefinition = defineTextExtractor({
  name: "doc",
  mimeTypes: ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
  extract: async ({ arrayBuffer }) => {
    const documentXml = await getFileContentFromArchive({
      arrayBuffer,
      filePath: "word/document.xml"
    });
    if (!documentXml) return { content: "" };
    const parsed = new XMLParser({
      ignoreAttributes: true,
      isArray: () => false,
      textNodeName: "#text"
    }).parse(documentXml);
    const text = extractTextFromDocx(parsed);
    return { content: text };
  }
});
function extractTextFromDocx(parsed) {
  const paragraphs = [];
  function extractTextFromNode(obj) {
    const texts = [];
    function traverse(node) {
      if (node === null || node === void 0) return;
      if (typeof node === "string") {
        const trimmed = node.trim();
        if (trimmed) texts.push(trimmed);
        return;
      }
      if (Array.isArray(node)) {
        node.forEach(traverse);
        return;
      }
      if (typeof node === "object") {
        const nodeObj = node;
        if (nodeObj["#text"]) {
          const trimmed = String(nodeObj["#text"]).trim();
          if (trimmed) texts.push(trimmed);
        }
        Object.keys(nodeObj).forEach((key) => {
          if (key !== "#text") traverse(nodeObj[key]);
        });
      }
    }
    traverse(obj);
    return texts.join(" ").replace(/\s+/g, " ").trim();
  }
  function findParagraphs(obj) {
    if (obj === null || obj === void 0) return;
    if (Array.isArray(obj)) {
      obj.forEach(findParagraphs);
      return;
    }
    if (typeof obj === "object") {
      const nodeObj = obj;
      if (nodeObj["w:p"] !== void 0) {
        const paragraphData = nodeObj["w:p"];
        const paragraphArray = Array.isArray(paragraphData) ? paragraphData : [paragraphData];
        paragraphArray.forEach((para) => {
          const text = extractTextFromNode(para);
          if (text) paragraphs.push(text);
        });
      }
      Object.keys(nodeObj).forEach((key) => {
        if (key !== "w:p") findParagraphs(nodeObj[key]);
      });
    }
  }
  findParagraphs(parsed);
  return paragraphs.join("\n\n");
}
//#endregion

//#region src/utils/buffer.ts
function castToBuffer(maybeArrayBuffer) {
  const buffer = maybeArrayBuffer instanceof ArrayBuffer ? Buffer.from(maybeArrayBuffer) : maybeArrayBuffer;
  return buffer;
}
//#endregion

//#region src/utils/memoize.ts
function memoize(fn, keyResolver) {
  const cache = {};
  const memoizedFn = (...args) => {
    const key = keyResolver ? keyResolver(...args) : JSON.stringify(args);
    if (key in cache) return cache[key];
    const result = fn(...args);
    cache[key] = result;
    return result;
  };
  return memoizedFn;
}
//#endregion

//#region src/tesseract/tesseract.usecases.ts
async function isTesseractCliAvailable({ binary = "tesseract" } = {}) {
  const isNode = typeof process !== "undefined" && Boolean(process?.versions?.node);
  if (!isNode) return false;
  try {
    const result = await x(binary, ["--version"], { throwOnError: true });
    return result.exitCode === 0;
  } catch {
    return false;
  }
}
const isTesseractCliAvailableMemoized = memoize(isTesseractCliAvailable, ({ binary }) => binary);
function createTesseractCliExtractor({ binary = "tesseract", languages: factoryLanguages = ["eng"] } = {}) {
  return async (maybeArrayBuffer, { languages: languages$1 = factoryLanguages } = {}) => {
    try {
      const proc = x(binary, ["stdin", "stdout", "-l", languages$1.join("+")], { throwOnError: true });
      proc.process.stdin.end(castToBuffer(maybeArrayBuffer));
      const { stdout } = await proc;
      return stdout?.trim();
    } catch {
      return "";
    }
  };
}
function createTesseractJsExtractor({ languages: factoryLanguages = ["eng"] } = {}) {
  return async (maybeArrayBuffer, { languages: languages$1 = factoryLanguages } = {}) => {
    try {
      const worker = await createWorker(languages$1);
      const { data: { text } } = await worker.recognize(castToBuffer(maybeArrayBuffer));
      await worker.terminate();
      return text?.trim();
    } catch {
      return "";
    }
  };
}
async function createTesseractExtractor({ forceJs = false, binary, languages: languages$1 } = {}) {
  const isCliAvailable = await isTesseractCliAvailableMemoized({ binary });
  if (isCliAvailable && !forceJs) return {
    extract: createTesseractCliExtractor({ binary, languages: languages$1 }),
    extractorType: "tesseract-cli"
  };
  else return {
    extract: createTesseractJsExtractor({ languages: languages$1 }),
    extractorType: "tesseract-js"
  };
}
//#endregion

//#region src/extractors/img.extractor.ts
const imageExtractorDefinition = defineTextExtractor({
  name: "image",
  mimeTypes: ["image/png", "image/jpeg", "image/webp", "image/gif"],
  extract: async ({ arrayBuffer, config }) => {
    const { extract, extractorType } = await createTesseractExtractor(config.tesseract);
    const content = await extract(arrayBuffer);
    return { content, subExtractorsUsed: [extractorType] };
  }
});
//#endregion

//#region src/extractors/odp.extractor.ts
const odpExtractorDefinition = defineTextExtractor({
  name: "odp",
  mimeTypes: ["application/vnd.oasis.opendocument.presentation"],
  extract: async ({ arrayBuffer }) => {
    const contentXml = await getFileContentFromArchive({ arrayBuffer, filePath: "content.xml" });
    if (!contentXml) return { content: "" };
    const parsed = new XMLParser({
      ignoreAttributes: true,
      isArray: () => false,
      textNodeName: "#text"
    }).parse(contentXml);
    const text = extractTextFromOdp(parsed);
    return { content: text };
  }
});
function extractTextFromOdp(parsed) {
  function extractTextFromNode(obj) {
    const texts = [];
    function traverse(node) {
      if (node === null || node === void 0) return;
      if (typeof node === "string") {
        const trimmed = node.trim();
        if (trimmed) texts.push(trimmed);
        return;
      }
      if (Array.isArray(node)) {
        node.forEach(traverse);
        return;
      }
      if (typeof node === "object") {
        const nodeObj = node;
        if (nodeObj["#text"]) {
          const trimmed = String(nodeObj["#text"]).trim();
          if (trimmed) texts.push(trimmed);
        }
        Object.keys(nodeObj).forEach((key) => {
          if (key !== "#text") traverse(nodeObj[key]);
        });
      }
    }
    traverse(obj);
    return texts.join(" ").replace(/\s+/g, " ").trim();
  }
  const slides = [];
  function extractSlides(obj) {
    if (obj === null || obj === void 0) return;
    if (typeof obj === "object" && !Array.isArray(obj)) {
      const nodeObj = obj;
      if (nodeObj["draw:page"] !== void 0) {
        const pageData = nodeObj["draw:page"];
        const pageArray = Array.isArray(pageData) ? pageData : [pageData];
        pageArray.forEach((page) => {
          const slideText = extractTextFromSlide$1(page);
          if (slideText) slides.push(slideText);
        });
      }
      Object.keys(nodeObj).forEach((key) => {
        if (key !== "draw:page") extractSlides(nodeObj[key]);
      });
    }
  }
  function extractTextFromSlide$1(slide) {
    const paragraphs = [];
    function findParagraphs(obj) {
      if (obj === null || obj === void 0) return;
      if (Array.isArray(obj)) {
        obj.forEach(findParagraphs);
        return;
      }
      if (typeof obj === "object") {
        const nodeObj = obj;
        if (nodeObj["text:p"] !== void 0) {
          const paragraphData = nodeObj["text:p"];
          const paragraphArray = Array.isArray(paragraphData) ? paragraphData : [paragraphData];
          paragraphArray.forEach((para) => {
            const text = extractTextFromNode(para);
            if (text) paragraphs.push(text);
          });
        }
        Object.keys(nodeObj).forEach((key) => {
          if (key !== "text:p") findParagraphs(nodeObj[key]);
        });
      }
    }
    findParagraphs(slide);
    return paragraphs.join("\n\n");
  }
  extractSlides(parsed);
  return slides.join("\n\n\n");
}
//#endregion

//#region src/extractors/odt.extractor.ts
const odtExtractorDefinition = defineTextExtractor({
  name: "odt",
  mimeTypes: ["application/vnd.oasis.opendocument.text"],
  extract: async ({ arrayBuffer }) => {
    const contentXml = await getFileContentFromArchive({ arrayBuffer, filePath: "content.xml" });
    if (!contentXml) return { content: "" };
    const parsed = new XMLParser({
      ignoreAttributes: true,
      isArray: () => false,
      textNodeName: "#text"
    }).parse(contentXml);
    const text = extractTextFromOdt(parsed);
    return { content: text };
  }
});
function extractTextFromOdt(parsed) {
  const paragraphs = [];
  function extractTextFromNode(obj) {
    const texts = [];
    function traverse(node) {
      if (node === null || node === void 0) return;
      if (typeof node === "string") {
        const trimmed = node.trim();
        if (trimmed) texts.push(trimmed);
        return;
      }
      if (Array.isArray(node)) {
        node.forEach(traverse);
        return;
      }
      if (typeof node === "object") {
        const nodeObj = node;
        if (nodeObj["#text"]) {
          const trimmed = String(nodeObj["#text"]).trim();
          if (trimmed) texts.push(trimmed);
        }
        Object.keys(nodeObj).forEach((key) => {
          if (key !== "#text") traverse(nodeObj[key]);
        });
      }
    }
    traverse(obj);
    return texts.join(" ").replace(/\s+/g, " ").trim();
  }
  function findParagraphs(obj) {
    if (obj === null || obj === void 0) return;
    if (Array.isArray(obj)) {
      obj.forEach(findParagraphs);
      return;
    }
    if (typeof obj === "object") {
      const nodeObj = obj;
      if (nodeObj["text:p"] !== void 0) {
        const paragraphData = nodeObj["text:p"];
        const paragraphArray = Array.isArray(paragraphData) ? paragraphData : [paragraphData];
        paragraphArray.forEach((para) => {
          const text = extractTextFromNode(para);
          if (text) paragraphs.push(text);
        });
      }
      if (nodeObj["text:h"] !== void 0) {
        const headingData = nodeObj["text:h"];
        const headingArray = Array.isArray(headingData) ? headingData : [headingData];
        headingArray.forEach((heading) => {
          const text = extractTextFromNode(heading);
          if (text) paragraphs.push(text);
        });
      }
      Object.keys(nodeObj).forEach((key) => {
        if (key !== "text:p" && key !== "text:h") findParagraphs(nodeObj[key]);
      });
    }
  }
  findParagraphs(parsed);
  return paragraphs.join("\n\n");
}
//#endregion

//#region src/extractors/pdf.extractor.ts
const pdfExtractorDefinition = defineTextExtractor({
  name: "pdf",
  mimeTypes: ["application/pdf"],
  extract: async ({ arrayBuffer, config, logger }) => {
    const pdf = await getDocumentProxy(arrayBuffer);
    const { text, totalPages: pageCount } = await extractText$1(pdf, { mergePages: true });
    if (text && text.trim().length > 0) return { content: text, subExtractorsUsed: ["pdf-text"] };
    logger?.debug({ pageCount }, "No text found in PDF, falling back to OCR on images.");
    const { extract, extractorType } = await createTesseractExtractor(config.tesseract);
    const imageTexts = [];
    const startOcrTime = Date.now();
    for (let pageIndex = 1; pageIndex <= pageCount; pageIndex++) {
      const images = await extractImages(pdf, pageIndex);
      const imageCount = images.length;
      if (imageCount === 0) {
        logger?.debug({ pageIndex, pageCount }, "No images found on PDF page for OCR.");
        continue;
      }
      logger?.debug({ pageIndex, pageCount, imageCount }, "Extracted images from PDF page.");
      for (const [imageIndex, image] of images.entries()) {
        const startTime = Date.now();
        const imageBuffer = await sharp(image.data, {
          raw: {
            width: image.width,
            height: image.height,
            channels: image.channels
          }
        }).png().toBuffer();
        const bufferDelay = Date.now() - startTime;
        logger?.debug({
          pageIndex,
          pageCount,
          imageIndex,
          imageCount,
          durationMs: bufferDelay,
          imageWidth: image.width,
          imageHeight: image.height,
          imageSizeBytes: image.data.length
        }, "Converted image to PNG buffer for OCR.");
        const imageText = await extract(imageBuffer);
        const ocrDelay = Date.now() - startTime - bufferDelay;
        logger?.debug({ pageIndex, pageCount, imageIndex, imageCount, durationMs: ocrDelay }, "Extracted text from image using OCR.");
        imageTexts.push(imageText);
      }
    }
    const totalOcrDuration = Date.now() - startOcrTime;
    logger?.info({ pageCount, imagesProcessedCount: imageTexts.length, durationMs: totalOcrDuration }, "Completed OCR on PDF images.");
    return { content: imageTexts.join("\n"), subExtractorsUsed: [extractorType] };
  }
});
//#endregion

//#region src/extractors/pptx.extractor.ts
const pptxExtractorDefinition = defineTextExtractor({
  name: "pptx",
  mimeTypes: ["application/vnd.openxmlformats-officedocument.presentationml.presentation"],
  extract: async ({ arrayBuffer }) => {
    const zip = await JSZip.loadAsync(arrayBuffer);
    const slideFiles = Object.keys(zip.files).filter((name) => name.match(/^ppt\/slides\/slide\d+\.xml$/)).sort((a, b) => {
      const numA = Number.parseInt(a.match(/slide(\d+)\.xml$/)?.[1] || "0", 10);
      const numB = Number.parseInt(b.match(/slide(\d+)\.xml$/)?.[1] || "0", 10);
      return numA - numB;
    });
    const slides = [];
    for (const slideFile of slideFiles) {
      const slideXml = await zip.file(slideFile)?.async("text");
      if (!slideXml) continue;
      const parser = new XMLParser({
        ignoreAttributes: true,
        isArray: () => false,
        textNodeName: "#text"
      });
      const parsed = parser.parse(slideXml);
      const slideText = extractTextFromSlide(parsed);
      if (slideText) slides.push(slideText);
    }
    return { content: slides.join("\n\n\n") };
  }
});
function extractTextFromSlide(parsed) {
  function extractTextFromNode(obj) {
    const texts = [];
    function traverse(node) {
      if (node === null || node === void 0) return;
      if (typeof node === "string") {
        const trimmed = node.trim();
        if (trimmed) texts.push(trimmed);
        return;
      }
      if (Array.isArray(node)) {
        node.forEach(traverse);
        return;
      }
      if (typeof node === "object") {
        const nodeObj = node;
        if (nodeObj["#text"]) {
          const trimmed = String(nodeObj["#text"]).trim();
          if (trimmed) texts.push(trimmed);
        }
        Object.keys(nodeObj).forEach((key) => {
          if (key !== "#text") traverse(nodeObj[key]);
        });
      }
    }
    traverse(obj);
    return texts.join(" ").replace(/\s+/g, " ").trim();
  }
  const paragraphs = [];
  function findParagraphs(obj) {
    if (obj === null || obj === void 0) return;
    if (Array.isArray(obj)) {
      obj.forEach(findParagraphs);
      return;
    }
    if (typeof obj === "object") {
      const nodeObj = obj;
      if (nodeObj["a:p"] !== void 0) {
        const paragraphData = nodeObj["a:p"];
        const paragraphArray = Array.isArray(paragraphData) ? paragraphData : [paragraphData];
        paragraphArray.forEach((para) => {
          const text = extractTextFromNode(para);
          if (text) paragraphs.push(text);
        });
      }
      Object.keys(nodeObj).forEach((key) => {
        if (key !== "a:p") findParagraphs(nodeObj[key]);
      });
    }
  }
  findParagraphs(parsed);
  return paragraphs.join("\n\n");
}
//#endregion

//#region src/extractors/rtf.extractor.ts
const rtfExtractorDefinition = defineTextExtractor({
  name: "rtf",
  mimeTypes: ["text/rtf", "application/rtf"],
  extract: async ({ arrayBuffer }) => {
    const text = new TextDecoder().decode(arrayBuffer);
    const doc = await new Promise((resolve, reject) => {
      rtfParser.string(text, (err, parsedDoc) => {
        if (err) {
          reject(err);
          return;
        }
        resolve(parsedDoc);
      });
    });
    const extractedText = extractTextFromRtf(doc);
    return { content: extractedText };
  }
});
function extractTextFromRtf(doc) {
  function extractTextFromNode(node) {
    const texts = [];
    function traverse(item) {
      if (!item) return;
      if (typeof item.value === "string") texts.push(item.value);
      if (Array.isArray(item.content)) item.content.forEach(traverse);
    }
    traverse(node);
    return texts.join("").trim();
  }
  const paragraphs = [];
  if (doc.content && Array.isArray(doc.content)) for (const contentBlock of doc.content) {
    const text = extractTextFromNode(contentBlock);
    if (text) paragraphs.push(text);
  }
  return paragraphs.join("\n\n");
}
//#endregion

//#region src/extractors/txt.extractor.ts
const txtExtractorDefinition = defineTextExtractor({
  name: "text",
  mimeTypes: [
    "text/*",
    "application/json",
    "application/xml",
    "application/javascript",
    "application/typescript",
    "application/graphql",
    "application/markdown",
    "application/yaml"
  ],
  extract: async ({ arrayBuffer }) => {
    const text = new TextDecoder().decode(arrayBuffer);
    return { content: text };
  }
});
//#endregion

//#region src/extractors.registry.ts
const extractorDefinitions = [
  pdfExtractorDefinition,
  rtfExtractorDefinition,
  txtExtractorDefinition,
  imageExtractorDefinition,
  docExtractorDefinition,
  pptxExtractorDefinition,
  odtExtractorDefinition,
  odpExtractorDefinition
];
function getExtractor({ mimeType, extractors = extractorDefinitions }) {
  const wilcardedMimeType = mimeType.replace(/\/.*/, "/*");
  const extractor = extractors.find((extractor$1) => extractor$1.mimeTypes.includes(mimeType) || extractor$1.mimeTypes.includes(wilcardedMimeType));
  return { extractor };
}
//#endregion

//#region src/extractors.usecases.ts
async function extractText({ arrayBuffer, mimeType, config: rawConfig, logger }) {
  const { config } = parseConfig({ rawConfig });
  const { extractor } = getExtractor({ mimeType });
  if (!extractor) {
    logger?.warn({ mimeType }, "No extractor found");
    return {
      extractorName: void 0,
      extractorType: void 0,
      textContent: void 0,
      subExtractorsUsed: []
    };
  }
  const extractorName = extractor.name;
  try {
    logger?.debug({ extractorName, mimeType }, "Starting extraction");
    const startTime = Date.now();
    const { content, subExtractorsUsed } = await extractor.extract({ arrayBuffer, config, logger });
    const duration = Date.now() - startTime;
    const extractorType = [extractorName, ...subExtractorsUsed ?? []].join(":");
    logger?.info({ extractorName, extractorType, mimeType, durationMs: duration }, "Extraction completed");
    return { extractorName, extractorType, textContent: content, subExtractorsUsed };
  } catch (error) {
    return {
      error,
      extractorName,
      extractorType: void 0,
      textContent: void 0,
      subExtractorsUsed: []
    };
  }
}
async function extractTextFromBlob({ blob, ...rest }) {
  const arrayBuffer = await blob.arrayBuffer();
  const mimeType = blob.type;
  return extractText({ arrayBuffer, mimeType, ...rest });
}
async function extractTextFromFile({ file, ...rest }) {
  return extractTextFromBlob({ blob: file, ...rest });
}
//#endregion

export { extractText, extractTextFromBlob, extractTextFromFile, ocrLanguages };
//# sourceMappingURL=index.js.map
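Lower-level callers can pass raw bytes to extractText together with an explicit mime type, and set OCR languages through config.tesseract, which parseConfig validates against the exported ocrLanguages list. A sketch under the same Node.js assumptions, with a hypothetical ./scan.png input:

import { readFile } from "node:fs/promises";
import { extractText, ocrLanguages } from "@papra/lecture";

const buffer = await readFile("./scan.png"); // hypothetical scanned image
const arrayBuffer = buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);

const { textContent, extractorType } = await extractText({
  arrayBuffer,
  mimeType: "image/png",
  // Language codes must be valid tesseract languages, otherwise parseConfig throws.
  config: { tesseract: { languages: ["eng", "fra"] } },
});

console.log(extractorType); // "image:tesseract-cli" when a tesseract binary is available, otherwise "image:tesseract-js"
console.log(ocrLanguages.includes("fra")); // true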