UNPKG

sec-edgar-api

Version:

Fetch and parse SEC earnings reports and other filings. Useful for financial analysis.

431 lines (430 loc) 21.8 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
/**
 * Extracts `<table>` elements (including nested tables) from an HTML string
 * into a grid of cell objects with parsed values and header labels.
 *
 * Each extracted table has the shape
 * `{ tableIndex, parentTableIndex, childTableIndexes, positionStart,
 *    positionEnd, htmlBefore, html, rows }`
 * where `rows[r][c]` is a cell object
 * `{ attributes, html, colSpan, rowSpan, tableCellIndex, rowIndex, colIndex,
 *    isHeaderRowCell, isBodyTitleRowCell, valueParsed, headerCol, headerRowIndex }`.
 * A cell with a row/column span is the SAME object in every grid slot it covers.
 */
var HtmlTableExtractor = /** @class */ (function () {
    // Characters that may legally follow a tag name; used to tell `<th` from `<thead`.
    var TAG_NAME_BOUNDARY = /[\s\/>]/;
    var CELL_OPEN_TAGS = ['<td', '<th'];
    var CELL_CLOSE_TAGS = ['</td>', '</th>'];

    /** Returns `value` unless it is null or undefined, in which case `fallback`. */
    function valueOr(value, fallback) {
        return value !== null && value !== undefined ? value : fallback;
    }

    /** True when the row contains at least one cell that is not flagged as a header cell. */
    function hasBodyCell(row) {
        return row.some(function (col) { return !col.isHeaderRowCell; });
    }

    /** Reads a numeric span attribute (`rowspan` / `colspan`) from `[key, value]` pairs; defaults to 1. */
    function readSpan(attributePairs, name) {
        var pair = attributePairs.find(function (entry) { return entry[0] === name; });
        var raw = pair ? pair[1] : undefined;
        var digits = raw === null || raw === undefined ? undefined : raw.replace(/[^0-9]/g, '');
        return Number(digits) || 1;
    }

    /** Builds a fresh cell object from the cell's full HTML and its opening tag text. */
    function createCell(html, atts, tableCellIndex, rowIndex) {
        // Split on any whitespace so attributes separated by newlines or tabs are found too.
        var attributePairs = atts
            .toLowerCase()
            .split(/\s+/)
            .map(function (att) { return att.split('='); });
        return {
            // Drop the leading "<td " / "<th " and the trailing ">".
            attributes: atts.length > 4 ? atts.substring(4, atts.length - 1) : '',
            html: html,
            colSpan: readSpan(attributePairs, 'colspan'),
            rowSpan: readSpan(attributePairs, 'rowspan'),
            tableCellIndex: tableCellIndex,
            rowIndex: rowIndex,
            colIndex: -1,
            isHeaderRowCell: false,
            isBodyTitleRowCell: false,
            valueParsed: null,
            headerCol: null,
            headerRowIndex: null,
        };
    }

    /** Writes `cell` into every grid slot covered by its row/column span. */
    function placeCell(grid, cell) {
        var currentRow = valueOr(grid[cell.rowIndex], []);
        // Slots may already be taken by cells spanning down from earlier rows.
        var firstGap = currentRow.findIndex(function (existing) { return !existing; });
        var startCol = firstGap === -1 ? currentRow.length : firstGap;
        for (var r = cell.rowIndex; r < cell.rowIndex + cell.rowSpan; r++) {
            grid[r] = valueOr(grid[r], []);
            for (var c = startCol; c < startCol + cell.colSpan; c++) {
                if (cell.colIndex === -1) {
                    cell.colIndex = c;
                }
                grid[r][c] = cell;
            }
        }
    }

    /** Scans one table's HTML and returns its grid of cells; direct child tables are copied into the current cell unparsed. */
    function buildCellGrid(table, tablesByIndex) {
        var tableHtml = table.html;
        // Maps the relative start offset of each direct child table to its relative end offset.
        var skipIndexMap = new Map(table.childTableIndexes.map(function (childIndex) {
            var child = tablesByIndex.get(childIndex);
            var childStart = child ? valueOr(child.positionStart, 0) : 0;
            var childEnd = child ? valueOr(child.positionEnd, 0) : 0;
            return [childStart - table.positionStart, childEnd - table.positionStart];
        }));
        var grid = [];
        var isInCell = false;
        var isInCellAtts = false;
        var cellAtts = '';
        var cellHTML = '';
        var rowIndex = -1;
        var tableCellIndex = -1;
        for (var i = 0; i < tableHtml.length; i++) {
            var skipIndex = skipIndexMap.get(i);
            if (skipIndex) {
                // Nested table: keep its markup as part of the current cell and jump past it.
                cellHTML += tableHtml.substring(i, skipIndex + 1);
                i = skipIndex;
                continue;
            }
            var char = tableHtml[i];
            var prev5Chars = tableHtml.substring(i - 4, i + 1).toLowerCase();
            var next3Chars = tableHtml.substring(i, i + 3).toLowerCase();
            var charAfterTagName = tableHtml[i + 3];
            // Require a tag-name boundary so "<thead" is not mistaken for a "<th" cell.
            var isCellAttsStart = CELL_OPEN_TAGS.includes(next3Chars) &&
                (charAfterTagName === undefined || TAG_NAME_BOUNDARY.test(charAfterTagName));
            var isSelfEnclosed = isInCellAtts && tableHtml[i - 1] === '/' && char === '>';
            var isCellAttsEnd = (isInCell && char === '>') || isSelfEnclosed;
            var isCellEnd = CELL_CLOSE_TAGS.includes(prev5Chars);
            if (next3Chars === '<tr') {
                rowIndex++;
                grid[rowIndex] = valueOr(grid[rowIndex], []);
            }
            if (isCellAttsStart) {
                tableCellIndex++;
                isInCell = true;
                isInCellAtts = true;
            }
            if (isInCellAtts) {
                cellAtts += char;
            }
            if (isInCell) {
                cellHTML += char;
            }
            if (isCellAttsEnd) {
                isInCellAtts = false;
            }
            if (isCellEnd || isSelfEnclosed) {
                isInCell = false;
                isInCellAtts = false;
                placeCell(grid, createCell(cellHTML, cellAtts, tableCellIndex, rowIndex));
                cellHTML = '';
                cellAtts = '';
            }
        }
        return grid;
    }

    /** Returns the first cell at or after `colIndex` that is a different cell than the one at `colIndex`, or null. */
    function getNextCell(row, colIndex) {
        var startingCol = row[colIndex];
        var startingCellIndex = startingCol === null || startingCol === undefined ? undefined : startingCol.tableCellIndex;
        for (var i = colIndex; i < row.length; i++) {
            var candidate = row[i];
            if (!candidate) {
                continue;
            }
            if (candidate.tableCellIndex !== startingCellIndex) {
                return candidate;
            }
        }
        return null;
    }

    /** Joins the parsed values of rows `0..headerRowIndex` into one label per column index. */
    function buildHeaderLabels(rows, headerRowIndex) {
        var labels = new Map();
        for (var r = 0; r <= headerRowIndex; r++) {
            var row = rows[r];
            if (!row) {
                continue;
            }
            for (var c = 0; c < row.length; c++) {
                // Fall back to a neighbouring cell when this slot is empty.
                var col = valueOr(valueOr(row[c], row[c - 1]), row[c + 1]);
                if (!col) {
                    continue;
                }
                var current = valueOr(labels.get(c), '');
                var part = String(col.valueParsed || '');
                // Avoid repeating a value that a spanning cell already contributed.
                var label = current.endsWith(part) ? current : (current + ' ' + part).trim();
                labels.set(c, label);
            }
        }
        return labels;
    }

    /** Drops columns that are empty in every row, then rows without any parsed value. */
    function removeEmptyColumnsFromTable(table) {
        var emptyColumns = new Set();
        var firstRow = table.rows[0];
        var columnCount = firstRow ? firstRow.length : 0;
        var isBlankAt = function (c) {
            return table.rows.every(function (row) {
                var cell = row[c];
                if (cell === null || cell === undefined) {
                    return false;
                }
                return cell.valueParsed === null || cell.isHeaderRowCell;
            });
        };
        for (var c = 0; c < columnCount; c++) {
            if (isBlankAt(c)) {
                emptyColumns.add(c);
            }
        }
        var keepColumn = function (_, index) { return !emptyColumns.has(index); };
        for (var r = 0; r < table.rows.length; r++) {
            table.rows[r] = table.rows[r].filter(keepColumn);
        }
        table.rows = table.rows.filter(function (row) {
            return row.some(function (col) { return col.valueParsed !== null; });
        });
    }

    /** Labels the first populated column "[name]" when it holds strings but has no header label. */
    function addMissingNameColToTable(table) {
        var rows = table.rows;
        var bodyIndex = rows.findIndex(hasBodyCell);
        // Find the lowest column index holding a truthy parsed value in the body rows.
        var firstPopulatedColIndex = Infinity;
        for (var i = bodyIndex; i < rows.length; i++) {
            var bodyRow = rows[i];
            if (!bodyRow) {
                continue;
            }
            var populatedIndex = bodyRow.findIndex(function (col) {
                return col === null || col === undefined ? undefined : col.valueParsed;
            });
            if (populatedIndex > -1 && populatedIndex < firstPopulatedColIndex) {
                firstPopulatedColIndex = populatedIndex;
            }
            if (firstPopulatedColIndex === 0) {
                break;
            }
        }
        var shouldAddName = rows.some(function (row) {
            var firstCol = row[firstPopulatedColIndex];
            var headerCol = firstCol === null || firstCol === undefined ? undefined : firstCol.headerCol;
            // Skip if the column already has a header label, or if the table has no header row.
            if (!firstCol || headerCol || firstCol.headerRowIndex === null) {
                return false;
            }
            // A string in the first populated column is assumed to be a name.
            return typeof firstCol.valueParsed === 'string';
        });
        if (!shouldAddName) {
            return;
        }
        var isEmptyCell = function (cell) {
            return (cell === null || cell === undefined ? undefined : cell.valueParsed) === null;
        };
        for (var r = 0; r < rows.length; r++) {
            var row = rows[r];
            var col = row[firstPopulatedColIndex];
            if (!col) {
                continue;
            }
            var isEmptyRow = row.every(isEmptyCell);
            // Header rows get the label as their value; body rows get it as their header label.
            if (!isEmptyRow && col.isHeaderRowCell) {
                col.valueParsed = valueOr(col.valueParsed, '[name]');
            }
            else if (!col.isHeaderRowCell) {
                col.headerCol = valueOr(col.headerCol, '[name]');
            }
        }
    }

    function HtmlTableExtractor() {
    }

    /**
     * Finds every closed `<table>…</table>` in `html` and returns the parsed tables
     * in the order their closing tags appear (inner tables before their parents).
     *
     * @param {string} html - HTML to scan.
     * @param {object} [options] - `removeEmptyColumns` (default true), `getHeaderRowIndex`,
     *   plus the options accepted by `parseValue`.
     * @returns {Array<object>} Extracted tables; tables that are never closed are omitted.
     */
    HtmlTableExtractor.prototype.extractTables = function (html, options) {
        var tablesOpen = [];
        var tablesData = [];
        var tableIndex = -1;
        // Start of the text that lies outside any table and precedes the next top-level table.
        var outsideStart = 0;
        for (var i = 0; i < html.length; i++) {
            var isTableStart = html.substring(i, i + 6).toLowerCase() === '<table';
            var isTableEnd = html.substring(i - 7, i + 1).toLowerCase() === '</table>';
            if (isTableStart) {
                var parentTable = tablesOpen[tablesOpen.length - 1];
                tableIndex++;
                tablesOpen.push({
                    tableIndex: tableIndex,
                    parentTableIndex: parentTable ? parentTable.tableIndex : null,
                    childTableIndexes: [],
                    positionStart: i,
                    positionEnd: -1,
                    // Only top-level tables carry preceding text; nested tables get ''.
                    htmlBefore: parentTable ? '' : html.substring(outsideStart, i),
                    html: '',
                    rows: [],
                });
                if (parentTable) {
                    parentTable.childTableIndexes.push(tableIndex);
                }
            }
            if (isTableEnd && tablesOpen.length > 0) {
                var closedTable = tablesOpen.pop();
                closedTable.positionEnd = i;
                // Slice once on close instead of appending each character to every open table.
                closedTable.html = html.substring(closedTable.positionStart, i + 1);
                tablesData.push(closedTable);
                if (tablesOpen.length === 0) {
                    outsideStart = i + 1;
                }
            }
        }
        this.addTableCells(tablesData);
        this.addTableCellValues(tablesData, options);
        this.addMissingNameCol(tablesData);
        var requestedRemoval = options === null || options === undefined ? undefined : options.removeEmptyColumns;
        if (valueOr(requestedRemoval, true)) {
            this.removeEmptyColumns(tablesData);
        }
        this.mergeHeaderRows(tablesData);
        return tablesData.filter(Boolean);
    };

    /**
     * Copies the combined header label of each body column onto the last header row,
     * so that row carries the merged text of all header rows above it.
     *
     * @param {Array<object>} tables - Tables whose cells already have `headerCol` set.
     */
    HtmlTableExtractor.prototype.mergeHeaderRows = function (tables) {
        for (var t = 0; t < tables.length; t++) {
            var rows = tables[t].rows;
            var bodyRowIndex = rows.findIndex(hasBodyCell);
            var headerRowIndex = bodyRowIndex - 1;
            var bodyRow = rows[bodyRowIndex];
            var headerRow = rows[headerRowIndex];
            if (!bodyRow || headerRowIndex < 0) {
                continue;
            }
            for (var c = 0; c < bodyRow.length; c++) {
                var headerCell = headerRow[c];
                var bodyCell = bodyRow[c];
                if (!headerCell || !bodyCell) {
                    continue;
                }
                headerCell.valueParsed = valueOr(bodyCell.headerCol, headerCell.valueParsed);
            }
        }
    };

    /**
     * Removes columns whose cells are all empty (header cells count as empty) and
     * rows that contain no parsed value. Mutates and returns `tables`.
     *
     * @param {Array<object>} tables
     * @returns {Array<object>} The same array that was passed in.
     */
    HtmlTableExtractor.prototype.removeEmptyColumns = function (tables) {
        for (var t = 0; t < tables.length; t++) {
            removeEmptyColumnsFromTable(tables[t]);
        }
        return tables;
    };

    /**
     * Fills `table.rows` for every table with a grid of cell objects built from
     * the `<tr>`, `<td>` and `<th>` tags in `table.html`.
     *
     * @param {Array<object>} tables
     */
    HtmlTableExtractor.prototype.addTableCells = function (tables) {
        if (tables.length === 0) {
            return;
        }
        var tablesByIndex = new Map(tables.map(function (t) { return [t.tableIndex, t]; }));
        tablesByIndex.forEach(function (table) {
            table.rows = buildCellGrid(table, tablesByIndex);
        });
    };

    /**
     * Adds a "[name]" label to the first populated column of each table when that
     * column holds string values but no header label.
     *
     * @param {Array<object>} tables
     */
    HtmlTableExtractor.prototype.addMissingNameCol = function (tables) {
        for (var t = 0; t < tables.length; t++) {
            addMissingNameColToTable(tables[t]);
        }
    };

    /**
     * Parses every cell's HTML into `valueParsed` and sets `headerCol`,
     * `headerRowIndex`, `isHeaderRowCell` and `isBodyTitleRowCell`.
     *
     * The header row index comes from `options.getHeaderRowIndex({ rows, table })`
     * when provided; otherwise the body is taken to start at the first row that
     * follows an underlined row without being underlined itself, or that contains
     * a number which is not between 1900 and 2100.
     *
     * @param {Array<object>} tables
     * @param {object} [options]
     */
    HtmlTableExtractor.prototype.addTableCellValues = function (tables, options) {
        var self = this;
        var hasBottomBorder = function (col) {
            return col.attributes.includes('border') && col.attributes.includes('bottom');
        };
        var isNonYearNumber = function (col) {
            var value = self.parseValue(col.html, options);
            if (typeof value !== 'number') {
                return false;
            }
            var isYear = value > 1900 && value < 2100;
            return !isYear;
        };
        var getHeaderRowIndexDefault = function (data) {
            var rows = data.rows;
            var bodyIndex = rows.findIndex(function (row, r) {
                var prevRow = valueOr(rows[r - 1], []);
                var hadUnderlines = prevRow.some(hasBottomBorder);
                var hasUnderline = row.some(hasBottomBorder);
                if (hadUnderlines && !hasUnderline) {
                    return true;
                }
                return row.some(isNonYearNumber);
            });
            return bodyIndex - 1;
        };
        var customGetHeaderRowIndex = options === null || options === undefined ? undefined : options.getHeaderRowIndex;
        var getHeaderRowIndexCb = valueOr(customGetHeaderRowIndex, getHeaderRowIndexDefault);
        // Cells that span several slots are shared objects; process each only once.
        var completedCells = new Set();
        var processTable = function (table) {
            var headerRowIndex = Math.max(getHeaderRowIndexCb({ rows: table.rows, table: table }), -1);
            // Header rows are parsed before any body row, so the labels can be built once, lazily.
            var headerLabels = null;
            var getHeaderCol = function (colIndex) {
                if (headerLabels === null) {
                    headerLabels = buildHeaderLabels(table.rows, headerRowIndex);
                }
                return valueOr(headerLabels.get(colIndex), null);
            };
            for (var rowIndex = 0; rowIndex < table.rows.length; rowIndex++) {
                var row = table.rows[rowIndex];
                if (!row) {
                    continue;
                }
                var countUniqueCells = new Set(row.map(function (c) { return c.tableCellIndex; })).size;
                var firstCell = row[0];
                var firstColSpan = firstCell === null || firstCell === undefined ? undefined : firstCell.colSpan;
                // A body row made of a single cell is a title in the middle of the body.
                var isBodyTitleRow = rowIndex > headerRowIndex && countUniqueCells === 1 && firstColSpan > 0;
                var isHeaderRow = rowIndex <= headerRowIndex;
                for (var colIndex = 0; colIndex < row.length; colIndex++) {
                    var cell = row[colIndex];
                    if (completedCells.has(cell) || !cell) {
                        continue;
                    }
                    cell.headerRowIndex = headerRowIndex > -1 ? headerRowIndex : null;
                    cell.isBodyTitleRowCell = isBodyTitleRow;
                    cell.isHeaderRowCell = isHeaderRow;
                    // A closing parenthesis sometimes sits alone in the next cell; re-attach it.
                    var nextCell = getNextCell(row, colIndex);
                    var isMissingParenthesis = (nextCell === null ? undefined : nextCell.html.includes(')')) &&
                        cell.html.includes('(') &&
                        !cell.html.includes(')');
                    var rawValue = isMissingParenthesis ? cell.html.trim() + ')' : cell.html.trim();
                    var parsed = self.parseValue(rawValue, options);
                    cell.valueParsed = typeof parsed === 'string' ? parsed.replace(/\s+/g, ' ') : parsed;
                    cell.headerCol = isHeaderRow ? null : getHeaderCol(colIndex);
                    completedCells.add(cell);
                }
            }
        };
        for (var t = 0; t < tables.length; t++) {
            processTable(tables[t]);
        }
    };

    /**
     * Removes HTML tags from `str`. Elements named in `options.tagsToExclude` are
     * removed together with their content, up to the matching closing tag.
     *
     * @param {string} str
     * @param {{ tagsToExclude?: string[] }} [options]
     * @returns {string}
     */
    HtmlTableExtractor.prototype.stripHtml = function (str, options) {
        var requestedTags = (options === null || options === undefined ? {} : options).tagsToExclude;
        var tagsToExclude = requestedTags === undefined ? [] : requestedTags;
        var remaining = str;
        if (tagsToExclude.length > 0) {
            remaining = '';
            var findExcludedTagAt = function (position) {
                return tagsToExclude.find(function (tag) {
                    return str.substring(position, position + tag.length + 1).toLowerCase() === '<' + tag;
                });
            };
            for (var i = 0; i < str.length; i++) {
                var char = str[i];
                var matchedTag = char === '<' ? findExcludedTagAt(i) : undefined;
                if (!matchedTag) {
                    remaining += char;
                    continue;
                }
                var endTag = '</' + matchedTag + '>';
                var endTagIndex = str.indexOf(endTag, i);
                if (endTagIndex > -1) {
                    // Jump to the last character of the closing tag; the loop increment moves past it.
                    i = endTagIndex + endTag.length - 1;
                }
            }
        }
        // `[^>]` also matches newlines, so tags wrapped across lines are removed as well.
        return remaining.replace(/<[^>]*>/g, '');
    };

    /**
     * Converts cell HTML/text to a number, a string, or null.
     *
     * - null → null; numbers are returned unchanged.
     * - A lone dash (or dash entity) → '-'.
     * - Empty text, a lone '$' or a lone '-' after cleanup → null.
     * - Numeric text → number; negative when wrapped in parentheses or containing '-'.
     * - Numeric text containing '%' → the text itself (null if it has no letters or digits).
     * - Anything else → the cleaned text with whitespace runs collapsed.
     *
     * @param {string|number|null} str
     * @param {{ stripHtml?: boolean, tagsToExclude?: string[], stripParenthesis?: boolean }} [options]
     * @returns {string|number|null}
     */
    HtmlTableExtractor.prototype.parseValue = function (str, options) {
        if (str === null) {
            return null;
        }
        if (typeof str === 'number') {
            return str;
        }
        var settings = options === null || options === undefined ? {} : options;
        var shouldStripHtml = settings.stripHtml === undefined ? true : settings.stripHtml;
        var tagsToExclude = settings.tagsToExclude === undefined ? [] : settings.tagsToExclude;
        var stripParenthesis = settings.stripParenthesis === undefined ? false : settings.stripParenthesis;
        var stripped = shouldStripHtml ? this.stripHtml(str, { tagsToExclude: tagsToExclude }) : str;
        var text = stripped
            .replace(/&#160;|&nbsp;|\n/g, ' ')
            .replace(/&#174;|&#9744;/g, '')
            .replace(/&#8211;|&#8212;|&#x2014;|&#151;/g, '-')
            .replace(/&#8217;|&#8220;|&#8221;|&rsquo;/g, "'");
        if (stripParenthesis) {
            text = text.replace(/\(.*?\)/g, '');
        }
        // Replace remaining entities first, then collapse EVERY whitespace run.
        text = text
            .replace(/&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-fA-F]{1,6});/g, ' ')
            .replace(/\s+/g, ' ')
            .trim();
        if (str.replace(/&#8211;|&#8212;|&#x2014;/g, '-') === '-') {
            return '-';
        }
        if (text === '') {
            return null;
        }
        var colNum = text.replace(/,|\(|\)|%/g, '').trim();
        if (colNum === '-' || colNum === '$') {
            return null;
        }
        colNum = colNum.replace(/-|\$/g, '');
        // "12 (3)" style: the number before the parenthesis is the value.
        var hasNumBeforeParenthesis = /\d+\s*(?=\()/.test(text);
        if (hasNumBeforeParenthesis) {
            var firstToken = colNum.split(' ')[0];
            colNum = firstToken === null || firstToken === undefined ? undefined : firstToken.trim();
        }
        if (isNaN(Number(colNum))) {
            return text;
        }
        if (text.includes('%')) {
            return text.replace(/[^a-zA-Z\d\s:]/g, '') === '' ? null : text;
        }
        var isNegative = (text.trim().includes('(') && !hasNumBeforeParenthesis) || text.includes('-');
        return isNegative ? Number(colNum) * -1 : Number(colNum);
    };

    return HtmlTableExtractor;
}());
exports.default = HtmlTableExtractor;