/*
 * @papra/lecture
 * Version: (not captured)
 * A simple library to extract text from files.
 * Source listing: 125 lines (115 loc), 3.46 kB, JavaScript.
 */
;
const tesseract_js = require('tesseract.js');
const node_buffer = require('node:buffer');
const unpdf = require('unpdf');
// Every value of the `languages` object exported by tesseract.js. Used by
// parseConfig to validate configured OCR languages, and re-exported so
// consumers can see which values are accepted.
const ocrLanguages = Object.values(tesseract_js.languages);
/**
 * Validates the user-supplied configuration and fills in defaults.
 *
 * @param {object} [args]
 * @param {object} [args.rawConfig={}] - Raw configuration; only
 *   `rawConfig.tesseract.languages` is read.
 * @returns {{ config: { tesseract: { languages: string[] } } }} The resolved
 *   configuration. Falls back to `["eng"]` when no language is configured.
 * @throws {Error} When at least one configured language is not part of
 *   `ocrLanguages`.
 */
function parseConfig({ rawConfig = {} } = {}) {
  const requested = rawConfig.tesseract?.languages ?? [];
  const unsupported = requested.filter((code) => !ocrLanguages.includes(code));

  if (unsupported.length > 0) {
    const rejectedList = unsupported.join(", ");
    const acceptedList = ocrLanguages.join(", ");
    throw new Error(`Invalid languages for tesseract: ${rejectedList}. Valid languages are: ${acceptedList}`);
  }

  // An empty selection means "use the default language".
  let effective;
  if (requested.length > 0) {
    effective = requested;
  } else {
    effective = ["eng"];
  }

  const tesseract = { languages: effective };
  return { config: { tesseract } };
}
/**
 * Identity helper used to declare an extractor definition.
 *
 * It performs no validation or copying: the very same object that was passed
 * in is handed back to the caller.
 *
 * @param {object} args - The extractor definition.
 * @returns {object} The `args` object, untouched.
 */
function defineTextExtractor(args) {
  const definition = args;
  return definition;
}
/**
 * Extractor for raster images: runs OCR through tesseract.js.
 */
const imageExtractorDefinition = defineTextExtractor({
  name: "image",
  mimeTypes: [
    "image/png",
    "image/jpeg",
    "image/webp",
    "image/gif"
  ],
  /**
   * Recognises the text contained in an image.
   *
   * @param {object} args
   * @param {ArrayBuffer} args.arrayBuffer - Raw bytes of the image.
   * @param {object} args.config - Parsed configuration; `config.tesseract.languages`
   *   is forwarded to `createWorker`.
   * @returns {Promise<{ content: string }>} The recognised text.
   */
  extract: async ({ arrayBuffer, config }) => {
    const { languages } = config.tesseract;
    const buffer = node_buffer.Buffer.from(arrayBuffer);
    const worker = await tesseract_js.createWorker(languages);
    try {
      const { data: { text } } = await worker.recognize(buffer);
      return { content: text };
    } finally {
      // Terminate the worker even when recognition fails; otherwise every
      // failed OCR attempt would leave a worker behind.
      await worker.terminate();
    }
  }
});
/**
 * Extractor for PDF documents, backed by unpdf.
 */
const pdfExtractorDefinition = defineTextExtractor({
  name: "pdf",
  mimeTypes: ["application/pdf"],
  /**
   * Reads the text of a PDF document.
   *
   * @param {object} args
   * @param {ArrayBuffer} args.arrayBuffer - Raw bytes of the PDF.
   * @returns {Promise<{ content: string }>} The `text` reported by
   *   `unpdf.extractText`, called with `mergePages: true`.
   */
  async extract({ arrayBuffer }) {
    const options = { mergePages: true };
    const { text: content } = await unpdf.extractText(arrayBuffer, options);
    return { content };
  }
});
/**
 * Extractor for plain-text-like content: any `text/*` type plus a handful of
 * textual `application/*` types.
 */
const txtExtractorDefinition = defineTextExtractor({
  name: "text",
  mimeTypes: [
    "text/*",
    "application/json",
    "application/xml",
    "application/javascript",
    "application/typescript",
    "application/graphql",
    "application/markdown",
    "application/yaml"
  ],
  /**
   * Decodes the bytes with a default-constructed `TextDecoder`.
   *
   * @param {object} args
   * @param {ArrayBuffer} args.arrayBuffer - Raw bytes of the document.
   * @returns {Promise<{ content: string }>} The decoded text.
   */
  async extract({ arrayBuffer }) {
    const decoder = new TextDecoder();
    const content = decoder.decode(arrayBuffer);
    return { content };
  }
});
// Built-in extractors. This list is the default value of getExtractor's
// `extractors` parameter, which scans it with Array.prototype.find, so the
// first definition that matches a MIME type is the one that gets used.
const extractorDefinitions = [
pdfExtractorDefinition,
txtExtractorDefinition,
imageExtractorDefinition
];
/**
 * Finds the first extractor able to handle the given MIME type.
 *
 * An extractor matches when its `mimeTypes` list contains the type itself or
 * its wildcard form (`type/*`). The lookup is attempted both with the value
 * exactly as received and with its normalised essence: parameters such as
 * `; charset=utf-8` are dropped, surrounding whitespace is trimmed and the
 * value is lowercased. This lets `application/json; charset=utf-8` or
 * `Application/JSON` resolve to an extractor registered for
 * `application/json`.
 *
 * @param {object} args
 * @param {string} args.mimeType - MIME type of the content to extract.
 * @param {Array<{ mimeTypes: string[] }>} [args.extractors=extractorDefinitions] -
 *   Candidate extractors, scanned in order.
 * @returns {{ extractor: object | undefined }} The matching extractor, or
 *   `undefined` when none matches or `mimeType` is not a string.
 */
function getExtractor({
  mimeType,
  extractors = extractorDefinitions
}) {
  // A missing or non-string MIME type cannot match anything; report "no
  // extractor" instead of failing on a string method call.
  if (typeof mimeType !== "string") {
    return { extractor: undefined };
  }

  const toWildcard = (type) => type.replace(/\/.*/, "/*");
  const essence = mimeType.split(";")[0].trim().toLowerCase();

  // The raw forms are kept so that every lookup that used to succeed still does.
  const candidates = new Set([
    mimeType,
    toWildcard(mimeType),
    essence,
    toWildcard(essence)
  ]);

  const extractor = extractors.find(({ mimeTypes }) => mimeTypes.some((type) => candidates.has(type)));
  return {
    extractor
  };
}
/**
 * Extracts the text content of a document given its bytes and MIME type.
 *
 * Configuration errors raised by `parseConfig` propagate to the caller.
 * Errors raised by the extractor itself are caught and returned in the
 * `error` field of the result instead.
 *
 * @param {object} args
 * @param {ArrayBuffer} args.arrayBuffer - Raw bytes of the document.
 * @param {string} args.mimeType - MIME type used to select the extractor.
 * @param {object} [args.config] - Raw configuration, validated by `parseConfig`.
 * @returns {Promise<{ extractorName: string | undefined, textContent: string | undefined, error?: unknown }>}
 *   Both fields are `undefined` when no extractor supports the MIME type;
 *   `textContent` is `undefined` and `error` is set when extraction failed.
 */
async function extractText({ arrayBuffer, mimeType, config: rawConfig }) {
  const parsed = parseConfig({ rawConfig });
  const lookup = getExtractor({ mimeType });
  const selected = lookup.extractor;

  // Unsupported MIME type: nothing to run.
  if (!selected) {
    return { extractorName: undefined, textContent: undefined };
  }

  try {
    const { content: textContent } = await selected.extract({ arrayBuffer, config: parsed.config });
    return { extractorName: selected.name, textContent };
  } catch (error) {
    // Extraction failures are reported in the result, not thrown.
    return { error, extractorName: selected.name, textContent: undefined };
  }
}
/**
 * Extracts text from a Blob, using `blob.type` as the MIME type.
 *
 * @param {object} args
 * @param {Blob} args.blob - Source blob; its bytes are read with `arrayBuffer()`.
 * @param {object} [args.config] - Raw configuration forwarded to `extractText`.
 * @returns {Promise<object>} Whatever `extractText` resolves to.
 */
async function extractTextFromBlob({ blob, config }) {
  return extractText({
    arrayBuffer: await blob.arrayBuffer(),
    mimeType: blob.type,
    config
  });
}
/**
 * Extracts text from a File by handing it to `extractTextFromBlob` as the
 * `blob` argument.
 *
 * @param {object} args
 * @param {File} args.file - Source file.
 * @param {object} [args.config] - Raw configuration, forwarded unchanged.
 * @returns {Promise<object>} Whatever `extractTextFromBlob` resolves to.
 */
async function extractTextFromFile({ file, config }) {
  const blob = file;
  return extractTextFromBlob({ blob, config });
}
// Public API of this CommonJS module: the three extraction entry points and
// the list of accepted OCR language values.
exports.extractText = extractText;
exports.extractTextFromBlob = extractTextFromBlob;
exports.extractTextFromFile = extractTextFromFile;
exports.ocrLanguages = ocrLanguages;
//# sourceMappingURL=index.cjs.map