UNPKG

read-excel-file

Version:

Read `*.xlsx` files in a browser or Node.js. Parse to JSON with a strict schema.

github.com/catamphetamine/read-excel-file

catamphetamine/read-excel-file

593 lines (516 loc) • 19.6 kB

JavaScript

'use strict';

Object.defineProperty(exports, "__esModule", {
  value: true
});

// Babel helper for array destructuring: returns arrays as they are and reads
// (up to `i`) items from any other iterable. Throws for non-iterables.
var _slicedToArray = function () {
  function sliceIterator(arr, i) {
    var _arr = [];
    var _n = true;
    var _d = false;
    var _e = undefined;
    try {
      for (var _i = arr[Symbol.iterator](), _s; !(_n = (_s = _i.next()).done); _n = true) {
        _arr.push(_s.value);
        if (i && _arr.length === i) break;
      }
    } catch (err) {
      _d = true;
      _e = err;
    } finally {
      try {
        if (!_n && _i["return"]) _i["return"]();
      } finally {
        if (_d) throw _e;
      }
    }
    return _arr;
  }

  return function (arr, i) {
    if (Array.isArray(arr)) {
      return arr;
    } else if (Symbol.iterator in Object(arr)) {
      return sliceIterator(arr, i);
    } else {
      throw new TypeError("Invalid attempt to destructure non-iterable instance");
    }
  };
}();

// Babel helper: `Object.assign()` with a fallback for older runtimes.
var _extends = Object.assign || function (target) {
  for (var i = 1; i < arguments.length; i++) {
    var source = arguments[i];
    for (var key in source) {
      if (Object.prototype.hasOwnProperty.call(source, key)) {
        target[key] = source[key];
      }
    }
  }
  return target;
};

exports.default = readXlsx;
exports.dropEmptyRows = dropEmptyRows;
exports.dropEmptyColumns = dropEmptyColumns;

var _parseDate = require('./parseDate');

var _parseDate2 = _interopRequireDefault(_parseDate);

function _interopRequireDefault(obj) {
  return obj && obj.__esModule ? obj : { default: obj };
}

// XML namespaces used in the XPath selectors below.
var namespaces = {
  a: 'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
  // This one seems to be for `r:id` attributes on `<sheet>`s.
  r: 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
  // This one seems to be for `<Relationships/>` file.
  rr: 'http://schemas.openxmlformats.org/package/2006/relationships'
};

// Used for mapping "A1"-like coordinates to `{ row, column }` numeric coordinates:
// the index of a letter in this list is the 1-based number of the column.
var letters = ["", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"];

// Number format tokens which `isDateTemplate()` recognizes as date parts.
var DATE_TEMPLATE_TOKENS = ['MM', 'DD', 'YY', 'YYYY'];

/**
 * Converts the result of `xml.select()` to an array.
 * Arrays are returned as they are, other iterables are read till the end.
 * @param {Iterable} list
 * @return {Array}
 */
function toArray(list) {
  return _slicedToArray(list);
}

/**
 * Reads an (unzipped) XLSX file structure into a 2D array of cells.
 * @param {object} contents - A list of XML files inside XLSX file (which is a zipped directory). Keys are file paths (e.g. "xl/workbook.xml"), values are file contents.
 * @param {object} xml - XML parser: an object having `createDocument(content)` and `select(document, node, xpath, namespaces)` functions.
 * @param {object} [options]
 * @param {(number|string)?} options.sheet - Workbook sheet number (starting from `1`) or sheet name (`1` by default).
 * @param {string?} options.dateFormat - Date format, e.g. "MM/DD/YY". Values having this format template set will be parsed as dates.
 * @param {boolean?} options.smartDateParser - Pass `false` to turn off detecting dates by the look of a number format template.
 * @param {boolean?} options.getSheets - If `true` then a list of `{ name }` sheets is returned instead of the sheet data.
 * @param {function?} options.transformData - Is called with the data and returns the data to use instead. Is called before the empty rows and columns are dropped.
 * @param {number[]?} options.rowMap - An array that gets filled with the original indexes of the rows which weren't dropped.
 * @param {boolean?} options.properties - If `true` then `{ data, properties }` is returned instead of just `data`.
 * @return {(Array[]|object|object[])} `data` is an array of rows, each row being an array of cell values. `properties: object` is the spreadsheet properties (e.g. whether date epoch is 1904 instead of 1900).
 * @throws {Error} If the sheet wasn't found.
 */
function readXlsx(contents, xml) {
  var options = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : {};

  if (!options.sheet) {
    // The default goes last so that an explicitly passed
    // `sheet: undefined` doesn't overwrite it.
    options = _extends({}, options, { sheet: 1 });
  }

  // Some Excel editors don't want to use standard naming scheme for sheet files.
  // https://github.com/tidyverse/readxl/issues/104
  var fileNames = parseFileNames(contents['xl/_rels/workbook.xml.rels'], xml);

  // Default file path for "shared strings": "xl/sharedStrings.xml".
  // It's also used as a fallback when the relationships file doesn't list one.
  var values = parseValues(contents['xl/' + (fileNames.sharedStrings || 'sharedStrings.xml')], xml);

  // Default file path for "styles": "xl/styles.xml".
  var styles = parseStyles(contents['xl/' + fileNames.styles], xml);

  var properties = parseProperties(contents['xl/workbook.xml'], xml);

  // A feature for getting the list of sheets in an Excel file.
  // https://github.com/catamphetamine/read-excel-file/issues/14
  if (options.getSheets) {
    return properties.sheets.map(function (sheetInfo) {
      return { name: sheetInfo.name };
    });
  }

  // Find the sheet by its number or by its name.
  var sheetRelationId = void 0;
  if (typeof options.sheet === 'number') {
    var sheetByNumber = properties.sheets[options.sheet - 1];
    sheetRelationId = sheetByNumber && sheetByNumber.relationId;
  } else {
    for (var s = 0; s < properties.sheets.length; s++) {
      if (properties.sheets[s].name === options.sheet) {
        sheetRelationId = properties.sheets[s].relationId;
        break;
      }
    }
  }

  // If the sheet wasn't found then throw an error.
  // Example: "xl/worksheets/sheet1.xml".
  if (!sheetRelationId || !fileNames.sheets[sheetRelationId]) {
    throw createSheetNotFoundError(options.sheet, properties.sheets);
  }

  // Parse sheet data.
  var sheet = parseSheet(contents['xl/' + fileNames.sheets[sheetRelationId]], xml, values, styles, properties, options);

  // If the sheet is empty.
  if (sheet.cells.length === 0) {
    if (options.properties) {
      return {
        data: [],
        properties: properties
      };
    }
    return [];
  }

  var dimensions = _slicedToArray(sheet.dimensions, 2);
  var leftTop = dimensions[0];
  var rightBottom = dimensions[1];

  var colsCount = rightBottom.column - leftTop.column + 1;
  var rowsCount = rightBottom.row - leftTop.row + 1;

  // `sheet.cells` seem to not necessarily be sorted by row and column,
  // so first create a grid of `null`s and then put the cells on it.
  var data = new Array(rowsCount);
  for (var r = 0; r < rowsCount; r++) {
    var gridRow = new Array(colsCount);
    for (var c = 0; c < colsCount; c++) {
      gridRow[c] = null;
    }
    data[r] = gridRow;
  }

  for (var n = 0; n < sheet.cells.length; n++) {
    var cell = sheet.cells[n];
    data[cell.row - leftTop.row][cell.column - leftTop.column] = cell.value;
  }

  if (options.transformData) {
    data = options.transformData(data);
  }

  data = dropEmptyRows(dropEmptyColumns(data), options.rowMap);

  if (options.properties) {
    return {
      data: data,
      properties: properties
    };
  }

  return data;
}

/**
 * Finds the area occupied by the cells.
 * @param {object[]} cells - A list of `{ row, column }` cells.
 * @return {object[]} `[leftTop, rightBottom]`, both being `{ row, column }`. All coordinates are `undefined` when there are no cells.
 */
function calculateDimensions(cells) {
  var minRow = void 0;
  var maxRow = void 0;
  var minCol = void 0;
  var maxCol = void 0;
  // A single pass over the cells instead of sorting all rows and all columns.
  for (var i = 0; i < cells.length; i++) {
    var row = cells[i].row;
    var column = cells[i].column;
    if (minRow === undefined || row < minRow) {
      minRow = row;
    }
    if (maxRow === undefined || row > maxRow) {
      maxRow = row;
    }
    if (minCol === undefined || column < minCol) {
      minCol = column;
    }
    if (maxCol === undefined || column > maxCol) {
      maxCol = column;
    }
  }
  return [{ row: minRow, column: minCol }, { row: maxRow, column: maxCol }];
}

/**
 * Converts column letters to a column number: "A" -> 1, "Z" -> 26, "AA" -> 27.
 * @param {string} col
 * @return {number}
 */
function colToInt(col) {
  // `for ... of ...` would require Babel polyfill for iterating a string.
  var n = 0;
  var i = 0;
  while (i < col.length) {
    n *= 26;
    n += letters.indexOf(col[i]);
    i++;
  }
  return n;
}

/**
 * Parses "A1"-like cell coordinates.
 * @param {string} coords - Examples: "AA2091", "R988", "B1".
 * @return {number[]} `[row, column]`, both starting from `1`.
 */
function CellCoords(coords) {
  // "AA2091" is split into `["AA", "2091", ""]`.
  var parts = coords.split(/(\d+)/);
  return [
  // Row.
  parseInt(parts[1], 10),
  // Column.
  colToInt(parts[0].trim())];
}

/**
 * Tells whether a number format is a date one.
 * XLSX has no specific format for dates.
 * Sometimes a date can be heuristically detected.
 * https://github.com/catamphetamine/read-excel-file/issues/3#issuecomment-395770777
 * @param {object} numberFormat - `{ id, template }`. `id` is a string read from an XML attribute.
 * @param {object} options - `readXlsx()` options.
 * @return {boolean}
 */
function isDateNumberFormat(numberFormat, options) {
  var id = Number(numberFormat.id);
  var template = numberFormat.template;
  if (id >= 14 && id <= 22 || id >= 45 && id <= 47) {
    return true;
  }
  if (options.dateFormat && template === options.dateFormat) {
    return true;
  }
  if (options.smartDateParser !== false && template && isDateTemplate(template)) {
    return true;
  }
  return false;
}

/**
 * Parses a `<c/>` (cell) element.
 * @param {object} cellNode - The `<c/>` element.
 * @param {object} sheet - The sheet XML document.
 * @param {object} xml - XML parser.
 * @param {string[]} values - "Shared strings".
 * @param {object[]} styles - Cell styles (see `parseStyles()`).
 * @param {object} properties - Spreadsheet properties. Are passed to the date parser.
 * @param {object} options - `readXlsx()` options.
 * @return {object} `{ row, column, value }`. An empty `value` is `null`.
 * @throws {Error} If the cell refers to a non-existent "shared string".
 */
function Cell(cellNode, sheet, xml, values, styles, properties, options) {
  var coords = CellCoords(cellNode.getAttribute('r'));

  // For `xpath` the value node can be `undefined` while for native `DOMParser`
  // it's `null`. Both mean "no value" and are normalized to `undefined`.
  var valueNode = xml.select(sheet, cellNode, 'a:v', namespaces)[0];
  var value = valueNode ? valueNode.textContent : undefined;

  // http://webapp.docx4java.org/OnlineDemo/ecma376/SpreadsheetML/ST_CellType.html
  switch (cellNode.getAttribute('t')) {
    // If the cell contains formula string.
    case 'str':
      // There could be no `<v/>` element at all.
      if (value !== undefined) {
        value = value.trim();
        if (value === '') {
          value = undefined;
        }
      }
      break;

    // If the cell contains a "shared" string.
    case 's':
      // The `<v/>`alue is a key in the "shared strings" dictionary of the
      // XLSX file, so look it up in the `values` dictionary by the numeric key.
      if (value !== undefined) {
        var sharedString = values[parseInt(value, 10)];
        if (sharedString === undefined) {
          throw new Error('Shared string #' + value + ' not found in the *.xlsx file');
        }
        value = sharedString.trim();
        if (value === '') {
          value = undefined;
        }
      }
      break;

    case 'b':
      value = value === '1';
      break;

    case 'n':
    // Default type is "n".
    // http://www.datypic.com/sc/ooxml/t-ssml_CT_Cell.html
    default:
      if (value === undefined) {
        break;
      }
      value = parseFloat(value);
      // A cell without an `s` attribute uses the style #0.
      // Both the style and its number format could be missing.
      var style = styles[parseInt(cellNode.getAttribute('s') || '0', 10)];
      var numberFormat = style && style.numberFormat;
      if (numberFormat && isDateNumberFormat(numberFormat, options)) {
        value = (0, _parseDate2.default)(value, properties);
      }
      break;
  }

  // Convert empty values to `null`.
  if (value === undefined) {
    value = null;
  }

  return {
    row: coords[0],
    column: coords[1],
    value: value
  };
}

/**
 * Removes empty rows from `data`. Mutates `data` (and `rowMap`).
 * @param {Array[]} data - An array of rows.
 * @param {number[]} [rowMap] - If passed, gets filled with the original indexes of the rows which weren't dropped.
 * @param {function} [accessor] - Returns the value of a cell. A row is empty when all its values are `null`.
 * @return {Array[]} `data`
 */
function dropEmptyRows(data, rowMap) {
  var accessor = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : function (_) {
    return _;
  };

  // Fill in row map.
  if (rowMap) {
    for (var j = 0; j < data.length; j++) {
      rowMap[j] = j;
    }
  }

  // Drop empty rows.
  // Goes backwards so that removing a row doesn't shift the rows left to check.
  for (var i = data.length - 1; i >= 0; i--) {
    // Check if the row is empty.
    var cells = toArray(data[i]);
    var empty = true;
    for (var k = 0; k < cells.length; k++) {
      if (accessor(cells[k]) !== null) {
        empty = false;
        break;
      }
    }
    // Remove the empty row.
    if (empty) {
      data.splice(i, 1);
      if (rowMap) {
        rowMap.splice(i, 1);
      }
    }
  }

  return data;
}

/**
 * Removes empty columns from `data`. Mutates the rows of `data`.
 * The number of columns is taken from the first row.
 * @param {Array[]} data - An array of rows.
 * @param {function} [accessor] - Returns the value of a cell. A column is empty when all its values are `null`.
 * @return {Array[]} `data`
 */
function dropEmptyColumns(data) {
  var accessor = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : function (_) {
    return _;
  };

  // There're no columns when there're no rows
  // (for example, `transformData()` could have returned an empty array).
  if (data.length === 0) {
    return data;
  }

  // Goes backwards so that removing a column doesn't shift the columns left to check.
  for (var i = data[0].length - 1; i >= 0; i--) {
    var empty = true;
    for (var k = 0; k < data.length; k++) {
      if (accessor(data[k][i]) !== null) {
        empty = false;
        break;
      }
    }
    if (empty) {
      for (var j = 0; j < data.length; j++) {
        data[j].splice(i, 1);
      }
    }
  }

  return data;
}

/**
 * Tells whether both coordinates are numbers.
 * @param {object} [coords] - `{ row, column }`
 * @return {boolean}
 */
function areValidCoords(coords) {
  return Boolean(coords) && typeof coords.row === 'number' && !isNaN(coords.row) && typeof coords.column === 'number' && !isNaN(coords.column);
}

/**
 * Parses a sheet XML file.
 * @param {string} content - Sheet XML file contents.
 * @param {object} xml - XML parser.
 * @param {string[]} values - "Shared strings".
 * @param {object[]} styles - Cell styles.
 * @param {object} properties - Spreadsheet properties.
 * @param {object} options - `readXlsx()` options.
 * @return {object} `{ cells, dimensions }`. There're no `dimensions` when there're no cells.
 */
function parseSheet(content, xml, values, styles, properties, options) {
  var sheet = xml.createDocument(content);

  var cells = xml.select(sheet, null, '/a:worksheet/a:sheetData/a:row/a:c', namespaces).map(function (node) {
    return Cell(node, sheet, xml, values, styles, properties, options);
  });

  if (cells.length === 0) {
    return { cells: cells };
  }

  // The area which the cells actually occupy.
  var actual = calculateDimensions(cells);

  // "//a:dimension/@ref" causes "RangeError: Maximum call stack size exceeded" error.
  // That selector was in the legacy code I copy-pasted and no one knows why it was there.
  var declaredRef = xml.select(sheet, null, '/a:worksheet/a:dimension/@ref', namespaces)[0];
  if (!declaredRef) {
    return { cells: cells, dimensions: actual };
  }

  var declared = declaredRef.textContent.split(':').map(CellCoords).map(function (coords) {
    return { row: coords[0], column: coords[1] };
  });

  // When there's only a single cell on a sheet
  // there can sometimes be just "A1" for the dimensions string.
  if (declared.length === 1) {
    declared = [declared[0], declared[0]];
  }

  if (!areValidCoords(declared[0]) || !areValidCoords(declared[1])) {
    return { cells: cells, dimensions: actual };
  }

  // The declared dimensions could be out of date (not include some of the cells),
  // in which case placing a cell on the data grid would fail.
  // So the declared area is extended to include all cells.
  // This doesn't change anything when the declared dimensions are correct.
  return {
    cells: cells,
    dimensions: [{
      row: Math.min(declared[0].row, actual[0].row),
      column: Math.min(declared[0].column, actual[0].column)
    }, {
      row: Math.max(declared[1].row, actual[1].row),
      column: Math.max(declared[1].column, actual[1].column)
    }]
  };
}

/**
 * Parses "shared strings" XML file.
 * @param {string} [content] - "Shared strings" XML file contents.
 * @param {object} xml - XML parser.
 * @return {string[]} Is empty when there's no `content`.
 */
function parseValues(content, xml) {
  if (!content) {
    return [];
  }
  var strings = xml.createDocument(content);
  return xml.select(strings, null, '//a:si', namespaces).map(function (string) {
    // The text of `<rPh/>` elements is not included.
    return xml.select(strings, string, './/a:t[not(ancestor::a:rPh)]', namespaces).map(function (_) {
      return _.textContent;
    }).join('');
  });
}

/**
 * Parses styles XML file.
 * http://officeopenxml.com/SSstyles.php
 * @param {string} [content] - Styles XML file contents.
 * @param {object} xml - XML parser.
 * @return {(object[]|object)} A list of cell styles (`<cellXfs/>`), each one optionally having a `numberFormat: { id, template }`. Is an empty object when there's no `content`.
 */
function parseStyles(content, xml) {
  if (!content) {
    return {};
  }
  // https://social.msdn.microsoft.com/Forums/sqlserver/en-US/708978af-b598-45c4-a598-d3518a5a09f0/howwhen-is-cellstylexfs-vs-cellxfs-applied-to-a-cell?forum=os_binaryfile
  // https://www.office-forums.com/threads/cellxfs-cellstylexfs.2163519/
  var doc = xml.createDocument(content);

  // Custom number formats by their ids.
  // Are parsed first because both base styles and cell styles refer to them.
  var numFmts = xml.select(doc, null, '//a:styleSheet/a:numFmts/a:numFmt', namespaces).map(parseNumberFormatStyle).reduce(function (formats, format) {
    formats[format.id] = format;
    return formats;
  }, {});

  // `.map(parseCellStyle)` would pass an array index as `numFmts`.
  var baseStyles = xml.select(doc, null, '//a:styleSheet/a:cellStyleXfs/a:xf', namespaces).map(function (xf) {
    return parseCellStyle(xf, numFmts);
  });

  return xml.select(doc, null, '//a:styleSheet/a:cellXfs/a:xf', namespaces).map(function (xf) {
    if (xf.hasAttribute('xfId')) {
      // `xfId` is an XML attribute (an index in `<cellStyleXfs/>`),
      // not a property of the element.
      return _extends({}, baseStyles[xf.getAttribute('xfId')], parseCellStyle(xf, numFmts));
    }
    return parseCellStyle(xf, numFmts);
  });
}

/**
 * Parses a `<numFmt/>` element.
 * @param {object} numFmt - The `<numFmt/>` element.
 * @return {object} `{ id, template }`, both being strings.
 */
function parseNumberFormatStyle(numFmt) {
  return {
    id: numFmt.getAttribute('numFmtId'),
    template: numFmt.getAttribute('formatCode')
  };
}

/**
 * Parses an `<xf/>` element.
 * http://www.datypic.com/sc/ooxml/e-ssml_xf-2.html
 * @param {object} xf - The `<xf/>` element.
 * @param {object} [numFmts] - Custom number formats by their ids.
 * @return {object} A style. Has a `numberFormat` if the element has a `numFmtId` attribute: either a custom number format or just `{ id }`.
 */
function parseCellStyle(xf, numFmts) {
  var style = {};
  if (xf.hasAttribute('numFmtId')) {
    var numberFormatId = xf.getAttribute('numFmtId');
    // Only own properties are looked up so that an id like "constructor"
    // isn't found on the prototype.
    if (numFmts && Object.prototype.hasOwnProperty.call(numFmts, numberFormatId) && numFmts[numberFormatId]) {
      style.numberFormat = numFmts[numberFormatId];
    } else {
      style.numberFormat = { id: numberFormatId };
    }
  }
  return style;
}

/**
 * Parses `xl/workbook.xml` file.
 * I guess `xl/workbook.xml` file should always be present inside the *.xlsx archive.
 * @param {string} content - `xl/workbook.xml` file contents.
 * @param {object} xml - XML parser.
 * @return {object} `{ epoch1904, sheets }`. `epoch1904` is only set (to `true`) for the 1904 date system. `sheets` is a list of `{ id, name, relationId }`.
 */
function parseProperties(content, xml) {
  var book = xml.createDocument(content);
  // http://webapp.docx4java.org/OnlineDemo/ecma376/SpreadsheetML/workbookPr.html
  var properties = {};
  // https://support.microsoft.com/en-gb/help/214330/differences-between-the-1900-and-the-1904-date-system-in-excel
  var workbookProperties = xml.select(book, null, '//a:workbookPr', namespaces)[0];
  if (workbookProperties) {
    // An XML boolean attribute can be written as either "1" or "true".
    var date1904 = workbookProperties.getAttribute('date1904');
    if (date1904 === '1' || date1904 === 'true') {
      properties.epoch1904 = true;
    }
  }
  // Get sheets info (indexes, names, if they're available).
  // Example:
  // <sheets>
  //   <sheet
  //     xmlns:ns="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
  //     name="Sheet1"
  //     sheetId="1"
  //     ns:id="rId3"/>
  // </sheets>
  // http://www.datypic.com/sc/ooxml/e-ssml_sheet-1.html
  properties.sheets = [];
  var sheetNodes = toArray(xml.select(book, null, '//a:sheets/a:sheet', namespaces));
  for (var i = 0; i < sheetNodes.length; i++) {
    var sheet = sheetNodes[i];
    // Sheets having no name are skipped.
    if (sheet.getAttribute('name')) {
      properties.sheets.push({
        id: sheet.getAttribute('sheetId'),
        name: sheet.getAttribute('name'),
        relationId: sheet.getAttribute('r:id')
      });
    }
  }
  return properties;
}

/**
 * Returns sheet file paths.
 * Seems that the correct place to look for the
 * `sheetId` -> `filename` mapping is the
 * `xl/_rels/workbook.xml.rels` file.
 * https://github.com/tidyverse/readxl/issues/104
 * @param {string} content — `xl/_rels/workbook.xml.rels` file contents.
 * @param {object} xml
 * @return {object} `{ sheets, sharedStrings, styles }`. `sheets` are sheet file paths by relation ids. `sharedStrings` is `undefined` when the file is not listed.
 * @throws {Error} If the styles file is not listed.
 */
function parseFileNames(content, xml) {
  // Example:
  // <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  //   ...
  //   <Relationship
  //     Id="rId3"
  //     Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"
  //     Target="worksheets/sheet1.xml"/>
  // </Relationships>
  var document = xml.createDocument(content);

  var fileNames = {
    sheets: {},
    sharedStrings: undefined,
    styles: undefined
  };

  var relationships = toArray(xml.select(document, null, '/rr:Relationships/rr:Relationship', namespaces));
  for (var i = 0; i < relationships.length; i++) {
    var relationship = relationships[i];
    var filePath = relationship.getAttribute('Target');
    switch (relationship.getAttribute('Type')) {
      case 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles':
        fileNames.styles = filePath;
        break;
      case 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings':
        fileNames.sharedStrings = filePath;
        break;
      case 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet':
        fileNames.sheets[relationship.getAttribute('Id')] = filePath;
        break;
    }
  }

  if (!fileNames.styles) {
    throw new Error('"styles.xml" file not found in the *.xlsx file');
  }

  // "Shared strings" file is not required to be there: for example,
  // a spreadsheet having no strings could have no such file,
  // in which case `parseValues()` returns an empty list.

  return fileNames;
}

/**
 * Tells whether a number format template looks like a date one,
 * i.e. consists only of "MM", "DD", "YY", "YYYY" separated by non-word characters.
 * @param {string} template - Example: "MM/DD/YYYY".
 * @return {boolean}
 */
function isDateTemplate(template) {
  var tokens = template.split(/\W+/);
  for (var i = 0; i < tokens.length; i++) {
    if (DATE_TEMPLATE_TOKENS.indexOf(tokens[i]) < 0) {
      return false;
    }
  }
  return true;
}

/**
 * Creates a "sheet not found" error.
 * @param {(number|string)} sheet - The number or the name of the sheet that wasn't found.
 * @param {object[]} [sheets] - Available sheets. Are listed in the error message.
 * @return {Error}
 */
function createSheetNotFoundError(sheet, sheets) {
  var sheetsList = sheets && sheets.map(function (sheet, i) {
    return '"' + sheet.name + '" (#' + (i + 1) + ')';
  }).join(', ');
  return new Error('Sheet ' + (typeof sheet === 'number' ? '#' + sheet : '"' + sheet + '"') + ' not found in the *.xlsx file.' + (sheets ? ' Available sheets: ' + sheetsList + '.' : ''));
}
//# sourceMappingURL=readXlsx.js.map