office-text-extractor-browser
Version:
Fork of office-text-extractor with unreleased changes that include browser support
43 lines (42 loc) • 1.84 kB
JavaScript
// source/parsers/excel.ts
// The text extractor for Excel files.
import Xlsx, { utils as sheetUtils } from 'xlsx';
import { dump as convertToYaml } from 'js-yaml';
// Local aliases for the two `xlsx` entry points this module relies on:
// `read` is applied to the raw file contents and `sheet_to_json` to each
// individual worksheet.
const { read: parseExcelFile } = Xlsx;
const { sheet_to_json: convertSheetToJson } = sheetUtils;
/**
 * Text extractor for Excel (`.xlsx`) workbooks.
 *
 * Each sheet is rendered as a `===` separator line followed by its rows, and
 * each row is rendered as a `---` separator line followed by the row
 * serialised as YAML.
 */
export class ExcelExtractor {
  constructor() {
    // Columns with a blank header show up in the YAML output under the
    // placeholder keys `__EMPTY`, `__EMPTY_1`, `__EMPTY_2`, ... This pattern
    // matches such a key (with any number of digits in the suffix) only when
    // it starts a line, i.e. when it really is a top-level key, so genuine
    // headers or cell values that merely contain `__EMPTY:` are left intact.
    const placeholderKey = /^__EMPTY(?:_\d+)?:/gm;
    /**
     * The type(s) of input acceptable to this method.
     */
    this.mimes = ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'];
    /**
     * Extract text from an Excel file if possible.
     *
     * @param input The contents of the Excel file; handed to the workbook
     * reader with `type: 'buffer'`.
     * @returns The text extracted from the input; an empty string when the
     * workbook has no sheets.
     */
    this.apply = async (input) => {
      // Read the contents of the Excel file.
      const workbook = parseExcelFile(input, { type: 'buffer' });
      let formattedText = '';
      // Walk the sheets in workbook order and append nicely formatted text.
      for (const sheetName of workbook.SheetNames) {
        // Add the sheet separator to indicate a new sheet has started.
        formattedText += '===\n';
        const rows = convertSheetToJson(workbook.Sheets[sheetName]);
        for (const row of rows) {
          // Add the row separator, then the row itself as YAML with the
          // placeholder keys stripped so only `: value` remains for columns
          // that have no header.
          formattedText += '---\n';
          formattedText += convertToYaml(row).replace(placeholderKey, ':');
        }
      }
      return formattedText;
    };
  }
}