UNPKG

office-text-extractor-browser

Version:

Fork of office-text-extractor with unreleased changes that include browser support

51 lines (42 loc) 1.6 kB
// source/parsers/excel.ts // The text extracter for Excel files. import { type Buffer } from 'buffer/' import Xlsx, { utils as sheetUtils } from 'xlsx' import { dump as convertToYaml } from 'js-yaml' import type { TextExtractionMethod } from '../lib.js' const parseExcelFile = Xlsx.read const convertSheetToJson = sheetUtils.sheet_to_json export class ExcelExtractor implements TextExtractionMethod { /** * The type(s) of input acceptable to this method. */ mimes = ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'] /** * Extract text from a Excel file if possible. * * @param payload The input and its type. * @returns The text extracted from the input. */ apply = async (input: Buffer): Promise<string> => { // Read the contents of the Excel file and convert them to JSON. const workbook = parseExcelFile(input, { type: 'buffer' }) // Get the names of all the sheets, loop through the sheets and return // nicely formatted text. const sheets = workbook.SheetNames let formattedText = '' for (const sheet of sheets) { // Add the sheet separator to indicate a new sheet has started. formattedText += '===\n' const sheetJson = convertSheetToJson(workbook.Sheets[sheet]) for (const row of sheetJson) { formattedText += '---\n' formattedText += convertToYaml(row) // If the column header is empty, the YAML converter replaces it with '__EMPTY'. // Replace that with just underscore + the column number instead. .replace(/__EMPTY*:/g, ':') .replace(/__EMPTY_\d?:/g, ':') } } return formattedText } }