office-text-extractor

Version:

Yet another library to extract text from MS Office and PDF files

github.com/gamemaker1/office-text-extractor

29 lines (24 loc) • 788 B

text/typescript

// source/parsers/pdf.ts
// The text extractor for PDF files.

import { PDFParse } from 'pdf-parse'

import type { TextExtractionMethod } from '../lib.js'

/**
 * Text extraction method for PDF documents, backed by `pdf-parse`.
 */
export class PdfExtractor implements TextExtractionMethod {
  /**
   * The type(s) of input acceptable to this method.
   */
  mimes = ['application/pdf']

  /**
   * Extract text from a PDF file if possible.
   *
   * @param input The raw bytes of the PDF file.
   * @returns The text extracted from the input.
   * @throws Whatever error the underlying parser raises; it is not caught
   * here, but the parser is still destroyed before the error propagates.
   */
  apply = async (input: Uint8Array): Promise<string> => {
    // Create a new parser for the given input buffer.
    const parser = new PDFParse({ data: input })

    try {
      // Run the parser with hyperlink parsing enabled and return the text.
      const result = await parser.getText({ parseHyperlinks: true })
      return result.text
    } finally {
      // Always clean up the parser, even when extraction fails, so that a
      // malformed document does not leave the parser's resources allocated.
      await parser.destroy()
    }
  }
}