UNPKG

@papra/lecture

Version:

A simple library to extract text from files

1 lines 8.48 kB
{"version":3,"file":"index.cjs","sources":["../src/config.ts","../src/extractors.models.ts","../src/extractors/img.extractor.ts","../src/extractors/pdf.extractor.ts","../src/extractors/txt.extractor.ts","../src/extractors.registry.ts","../src/extractors.usecases.ts"],"sourcesContent":["import type { ExtractorConfig, PartialExtractorConfig } from './types';\nimport { languages as tesseractLanguages } from 'tesseract.js';\n\nexport const ocrLanguages = Object.values(tesseractLanguages);\n\nexport function parseConfig({ rawConfig = {} }: { rawConfig?: PartialExtractorConfig } = {}): { config: ExtractorConfig } {\n const languages = rawConfig.tesseract?.languages ?? [];\n const invalidLanguages = languages.filter(language => !ocrLanguages.includes(language));\n\n if (invalidLanguages.length > 0) {\n throw new Error(`Invalid languages for tesseract: ${invalidLanguages.join(', ')}. Valid languages are: ${ocrLanguages.join(', ')}`);\n }\n\n return {\n config: {\n tesseract: {\n languages: languages.length > 0 ? languages : ['eng'],\n },\n },\n };\n}\n","import type { ExtractorConfig } from './types';\n\nexport type ExtractorDefinition = ReturnType<typeof defineTextExtractor>;\n\nexport function defineTextExtractor(args: {\n name: string;\n mimeTypes: string[];\n extract: (args: { arrayBuffer: ArrayBuffer; config: ExtractorConfig }) => Promise<{ content: string }>;\n}) {\n return args;\n}\n","import { Buffer } from 'node:buffer';\nimport { createWorker } from 'tesseract.js';\nimport { defineTextExtractor } from '../extractors.models';\n\nexport const imageExtractorDefinition = defineTextExtractor({\n name: 'image',\n mimeTypes: [\n 'image/png',\n 'image/jpeg',\n 'image/webp',\n 'image/gif',\n ],\n extract: async ({ arrayBuffer, config }) => {\n const { languages } = config.tesseract;\n\n const buffer = Buffer.from(arrayBuffer);\n\n const worker = await createWorker(languages);\n\n const { data: { text } } = await worker.recognize(buffer);\n await worker.terminate();\n\n return { content: text };\n },\n});\n","import { extractText } from 'unpdf';\nimport { defineTextExtractor } from '../extractors.models';\n\nexport const pdfExtractorDefinition = defineTextExtractor({\n name: 'pdf',\n mimeTypes: ['application/pdf'],\n extract: async ({ arrayBuffer }) => {\n const { text } = await extractText(arrayBuffer, { mergePages: true });\n\n return { content: text };\n },\n});\n","import { defineTextExtractor } from '../extractors.models';\n\nexport const txtExtractorDefinition = defineTextExtractor({\n name: 'text',\n mimeTypes: [\n 'text/*',\n 'application/json',\n 'application/xml',\n 'application/javascript',\n 'application/typescript',\n 'application/graphql',\n 'application/markdown',\n 'application/yaml',\n ],\n extract: async ({ arrayBuffer }) => {\n const text = new TextDecoder().decode(arrayBuffer);\n\n return { content: text };\n },\n});\n","import type { ExtractorDefinition } from './extractors.models';\nimport { imageExtractorDefinition } from './extractors/img.extractor';\nimport { pdfExtractorDefinition } from './extractors/pdf.extractor';\nimport { txtExtractorDefinition } from './extractors/txt.extractor';\n\nexport const extractorDefinitions: ExtractorDefinition[] = [\n pdfExtractorDefinition,\n txtExtractorDefinition,\n imageExtractorDefinition,\n];\n\nexport function getExtractor({\n mimeType,\n extractors = extractorDefinitions,\n}: {\n mimeType: string;\n extractors?: ExtractorDefinition[];\n}) {\n const wilcardedMimeType = mimeType.replace(/\\/.*/, '/*');\n const extractor = extractors.find(extractor => extractor.mimeTypes.includes(mimeType) || extractor.mimeTypes.includes(wilcardedMimeType));\n\n return {\n extractor,\n };\n}\n","import type { PartialExtractorConfig } from './types';\nimport { parseConfig } from './config';\nimport { getExtractor } from './extractors.registry';\n\nexport async function extractText({ arrayBuffer, mimeType, config: rawConfig }: { arrayBuffer: ArrayBuffer; mimeType: string; config?: PartialExtractorConfig }): Promise<{\n extractorName: string | undefined;\n textContent: string | undefined;\n error?: Error;\n}> {\n const { config } = parseConfig({ rawConfig });\n const { extractor } = getExtractor({ mimeType });\n\n if (!extractor) {\n return {\n extractorName: undefined,\n textContent: undefined,\n };\n }\n\n try {\n const { content } = await extractor.extract({ arrayBuffer, config });\n\n return {\n extractorName: extractor.name,\n textContent: content,\n };\n } catch (error) {\n return {\n error,\n extractorName: extractor.name,\n textContent: undefined,\n };\n }\n}\n\nexport async function extractTextFromBlob({ blob, config }: { blob: Blob; config?: PartialExtractorConfig }) {\n const arrayBuffer = await blob.arrayBuffer();\n const mimeType = blob.type;\n\n return extractText({ arrayBuffer, mimeType, config });\n}\n\nexport async function extractTextFromFile({ file, config }: { file: File; config?: PartialExtractorConfig }) {\n return extractTextFromBlob({ blob: file, config });\n}\n"],"names":["tesseractLanguages","Buffer","createWorker","extractText","extractor"],"mappings":";;;;;;AAGa,MAAA,YAAA,GAAe,MAAO,CAAA,MAAA,CAAOA,sBAAkB;AAErD,SAAS,YAAY,EAAE,SAAA,GAAY,EAAG,EAAA,GAA4C,EAAiC,EAAA;AACxH,EAAA,MAAM,SAAY,GAAA,SAAA,CAAU,SAAW,EAAA,SAAA,IAAa,EAAC;AACrD,EAAM,MAAA,gBAAA,GAAmB,UAAU,MAAO,CAAA,CAAA,QAAA,KAAY,CAAC,YAAa,CAAA,QAAA,CAAS,QAAQ,CAAC,CAAA;AAEtF,EAAI,IAAA,gBAAA,CAAiB,SAAS,CAAG,EAAA;AAC/B,IAAA,MAAM,IAAI,KAAA,CAAM,CAAoC,iCAAA,EAAA,gBAAA,CAAiB,IAAK,CAAA,IAAI,CAAC,CAAA,uBAAA,EAA0B,YAAa,CAAA,IAAA,CAAK,IAAI,CAAC,CAAE,CAAA,CAAA;AAAA;AAGpI,EAAO,OAAA;AAAA,IACL,MAAQ,EAAA;AAAA,MACN,SAAW,EAAA;AAAA,QACT,WAAW,SAAU,CAAA,MAAA,GAAS,CAAI,GAAA,SAAA,GAAY,CAAC,KAAK;AAAA;AACtD;AACF,GACF;AACF;;AChBO,SAAS,oBAAoB,IAIjC,EAAA;AACD,EAAO,OAAA,IAAA;AACT;;ACNO,MAAM,2BAA2B,mBAAoB,CAAA;AAAA,EAC1D,IAAM,EAAA,OAAA;AAAA,EACN,SAAW,EAAA;AAAA,IACT,WAAA;AAAA,IACA,YAAA;AAAA,IACA,YAAA;AAAA,IACA;AAAA,GACF;AAAA,EACA,OAAS,EAAA,OAAO,EAAE,WAAA,EAAa,QAAa,KAAA;AAC1C,IAAM,MAAA,EAAE,SAAU,EAAA,GAAI,MAAO,CAAA,SAAA;AAE7B,IAAM,MAAA,MAAA,GAASC,kBAAO,CAAA,IAAA,CAAK,WAAW,CAAA;AAEtC,IAAM,MAAA,MAAA,GAAS,MAAMC,yBAAA,CAAa,SAAS,CAAA;AAE3C,IAAM,MAAA,EAAE,MAAM,EAAE,IAAA,IAAW,GAAA,MAAM,MAAO,CAAA,SAAA,CAAU,MAAM,CAAA;AACxD,IAAA,MAAM,OAAO,SAAU,EAAA;AAEvB,IAAO,OAAA,EAAE,SAAS,IAAK,EAAA;AAAA;AAE3B,CAAC,CAAA;;ACrBM,MAAM,yBAAyB,mBAAoB,CAAA;AAAA,EACxD,IAAM,EAAA,KAAA;AAAA,EACN,SAAA,EAAW,CAAC,iBAAiB,CAAA;AAAA,EAC7B,OAAS,EAAA,OAAO,EAAE,WAAA,EAAkB,KAAA;AAClC,IAAM,MAAA,EAAE,MAAS,GAAA,MAAMC,kBAAY,WAAa,EAAA,EAAE,UAAY,EAAA,IAAA,EAAM,CAAA;AAEpE,IAAO,OAAA,EAAE,SAAS,IAAK,EAAA;AAAA;AAE3B,CAAC,CAAA;;ACTM,MAAM,yBAAyB,mBAAoB,CAAA;AAAA,EACxD,IAAM,EAAA,MAAA;AAAA,EACN,SAAW,EAAA;AAAA,IACT,QAAA;AAAA,IACA,kBAAA;AAAA,IACA,iBAAA;AAAA,IACA,wBAAA;AAAA,IACA,wBAAA;AAAA,IACA,qBAAA;AAAA,IACA,sBAAA;AAAA,IACA;AAAA,GACF;AAAA,EACA,OAAS,EAAA,OAAO,EAAE,WAAA,EAAkB,KAAA;AAClC,IAAA,MAAM,IAAO,GAAA,IAAI,WAAY,EAAA,CAAE,OAAO,WAAW,CAAA;AAEjD,IAAO,OAAA,EAAE,SAAS,IAAK,EAAA;AAAA;AAE3B,CAAC,CAAA;;ACdM,MAAM,oBAA8C,GAAA;AAAA,EACzD,sBAAA;AAAA,EACA,sBAAA;AAAA,EACA;AACF,CAAA;AAEO,SAAS,YAAa,CAAA;AAAA,EAC3B,QAAA;AAAA,EACA,UAAa,GAAA;AACf,CAGG,EAAA;AACD,EAAA,MAAM,iBAAoB,GAAA,QAAA,CAAS,OAAQ,CAAA,MAAA,EAAQ,IAAI,CAAA;AACvD,EAAA,MAAM,SAAY,GAAA,UAAA,CAAW,IAAK,CAAA,CAAAC,eAAaA,UAAU,CAAA,SAAA,CAAU,QAAS,CAAA,QAAQ,CAAKA,IAAAA,UAAAA,CAAU,SAAU,CAAA,QAAA,CAAS,iBAAiB,CAAC,CAAA;AAExI,EAAO,OAAA;AAAA,IACL;AAAA,GACF;AACF;;ACpBA,eAAsB,YAAY,EAAE,WAAA,EAAa,QAAU,EAAA,MAAA,EAAQ,WAIhE,EAAA;AACD,EAAA,MAAM,EAAE,MAAO,EAAA,GAAI,WAAY,CAAA,EAAE,WAAW,CAAA;AAC5C,EAAA,MAAM,EAAE,SAAU,EAAA,GAAI,YAAa,CAAA,EAAE,UAAU,CAAA;AAE/C,EAAA,IAAI,CAAC,SAAW,EAAA;AACd,IAAO,OAAA;AAAA,MACL,aAAe,EAAA,SAAA;AAAA,MACf,WAAa,EAAA;AAAA,KACf;AAAA;AAGF,EAAI,IAAA;AACF,IAAM,MAAA,EAAE,SAAY,GAAA,MAAM,UAAU,OAAQ,CAAA,EAAE,WAAa,EAAA,MAAA,EAAQ,CAAA;AAEnE,IAAO,OAAA;AAAA,MACL,eAAe,SAAU,CAAA,IAAA;AAAA,MACzB,WAAa,EAAA;AAAA,KACf;AAAA,WACO,KAAO,EAAA;AACd,IAAO,OAAA;AAAA,MACL,KAAA;AAAA,MACA,eAAe,SAAU,CAAA,IAAA;AAAA,MACzB,WAAa,EAAA;AAAA,KACf;AAAA;AAEJ;AAEA,eAAsB,mBAAoB,CAAA,EAAE,IAAM,EAAA,MAAA,EAA2D,EAAA;AAC3G,EAAM,MAAA,WAAA,GAAc,MAAM,IAAA,CAAK,WAAY,EAAA;AAC3C,EAAA,MAAM,WAAW,IAAK,CAAA,IAAA;AAEtB,EAAA,OAAO,WAAY,CAAA,EAAE,WAAa,EAAA,QAAA,EAAU,QAAQ,CAAA;AACtD;AAEA,eAAsB,mBAAoB,CAAA,EAAE,IAAM,EAAA,MAAA,EAA2D,EAAA;AAC3G,EAAA,OAAO,mBAAoB,CAAA,EAAE,IAAM,EAAA,IAAA,EAAM,QAAQ,CAAA;AACnD;;;;;;;"}