sec-edgar-api
Version:
Fetch and parse SEC earnings reports and other filings. Useful for financial analysis.
431 lines (430 loc) • 21.8 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
/**
 * Extracts `<table>` elements from an HTML string and converts them into a
 * grid of cell objects with parsed values and header labels.
 *
 * Each extracted table has the shape
 * `{ tableIndex, parentTableIndex, childTableIndexes, positionStart,
 *    positionEnd, htmlBefore, html, rows }`
 * where `rows` is a 2D array of cell objects. A cell that spans several
 * rows/columns is the SAME object referenced from every grid slot it covers.
 */
var HtmlTableExtractor = /** @class */ (function () {
    // Matches one HTML tag. `[^>]` (unlike `.`) also matches newlines, so tags
    // whose attributes are spread over several lines are matched too.
    var TAG_PATTERN = /<[^>]*>/g;
    // Each pattern below accepts the literal character as well as its HTML
    // entity spelling, because the extractor works on raw, undecoded markup.
    var NBSP_OR_NEWLINE_PATTERN = /\u00a0|&nbsp;|&#160;|\n/g;
    var SYMBOL_PATTERN = /\u00ae|\u2610|&reg;|&#174;|&#9744;/g;
    var DASH_PATTERN = /\u2013|\u2014|&ndash;|&mdash;|&#8211;|&#8212;|&#x2014;|&#151;/g;
    var QUOTE_PATTERN = /\u2019|\u201c|\u201d|&rsquo;|&ldquo;|&rdquo;|&#8217;|&#8220;|&#8221;/g;
    // Any entity that survived the replacements above.
    var ENTITY_PATTERN = /&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-fA-F]{1,6});/g;

    /**
     * True when `char` can legally follow a tag name (whitespace, `/`, `>`),
     * or when the input ends right after the name. Used so that `<th` does not
     * match `<thead>` and `<b` does not match `<br>`.
     */
    function isTagNameEnd(char) {
        return char === undefined || char === '>' || char === '/' || /\s/.test(char);
    }

    /** Escapes regular-expression metacharacters in `text`. */
    function escapeRegExp(text) {
        return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    }

    /**
     * If one of `tagsToExclude` opens at `start`, returns the index just past
     * its closing tag. Returns -1 when no excluded tag opens there or when the
     * element is never closed.
     */
    function findExcludedElementEnd(str, start, tagsToExclude) {
        for (var t = 0; t < tagsToExclude.length; t++) {
            var tag = String(tagsToExclude[t]).toLowerCase();
            var opening = str.substring(start, start + tag.length + 1).toLowerCase();
            if (opening !== '<' + tag || !isTagNameEnd(str[start + tag.length + 1])) {
                continue;
            }
            // Case-insensitive so `<sup>…</SUP>` is recognised as one element.
            var closingPattern = new RegExp('</' + escapeRegExp(tag) + '>', 'gi');
            closingPattern.lastIndex = start;
            var match = closingPattern.exec(str);
            return match ? match.index + match[0].length : -1;
        }
        return -1;
    }

    /** Reads a numeric `rowspan`/`colspan` value; missing or zero yields 1. */
    function readSpan(attributePairs, name) {
        var pair = attributePairs.find(function (entry) { return entry[0] === name; });
        var digits = pair && pair[1] !== undefined ? pair[1].replace(/[^0-9]/g, '') : undefined;
        return Number(digits) || 1;
    }

    /** True when any cell of `row` carries both "border" and "bottom" in its attributes. */
    function hasBottomBorder(row) {
        return row.some(function (col) {
            return col.attributes.includes('border') && col.attributes.includes('bottom');
        });
    }

    /**
     * Builds the column-index -> header-label map from rows 0..headerRowIndex.
     * Labels of stacked header rows are joined with a space; a label is not
     * appended when the accumulated text already ends with it (which is what
     * happens when one row-spanning cell is seen in consecutive header rows).
     */
    function buildHeaderByIndex(rows, headerRowIndex) {
        var headerByIndex = new Map();
        for (var r = 0; r <= headerRowIndex; r++) {
            var headerRow = rows[r];
            if (!headerRow) {
                continue;
            }
            for (var c = 0; c < headerRow.length; c++) {
                // Fall back to a neighbouring cell when this grid slot is empty.
                var col = headerRow[c] || headerRow[c - 1] || headerRow[c + 1];
                if (!col) {
                    continue;
                }
                var current = headerByIndex.has(c) ? headerByIndex.get(c) : '';
                var label = String(col.valueParsed || '');
                var value = current.endsWith(label) ? current : (current + ' ' + label).trim();
                headerByIndex.set(c, value);
            }
        }
        return headerByIndex;
    }

    function HtmlTableExtractor() {
    }

    /**
     * Finds every closed `<table>…</table>` in `html` (nested ones included)
     * and returns them with their cells parsed.
     *
     * Tables are returned in the order in which they CLOSE, so a nested table
     * precedes its parent. Tables that are never closed are not returned.
     *
     * @param {string} html Raw HTML. A non-string or empty value yields `[]`.
     * @param {object} [options] Passed on to `addTableCellValues`/`parseValue`.
     *   `removeEmptyColumns` (default true) controls the empty-column pass.
     * @returns {Array<object>} Extracted tables.
     */
    HtmlTableExtractor.prototype.extractTables = function (html, options) {
        if (typeof html !== 'string' || html.length === 0) {
            return [];
        }
        var tablesOpen = [];
        var tablesData = [];
        var tableIndex = -1;
        // Index where the text preceding the next top-level table begins.
        var htmlBeforeStart = 0;
        for (var i = 0; i < html.length; i++) {
            var char = html[i];
            var isTableStart = char === '<' &&
                html.substring(i, i + 6).toLowerCase() === '<table' &&
                isTagNameEnd(html[i + 6]);
            if (isTableStart) {
                var parentTable = tablesOpen[tablesOpen.length - 1];
                tableIndex++;
                tablesOpen.push({
                    tableIndex: tableIndex,
                    parentTableIndex: parentTable ? parentTable.tableIndex : null,
                    childTableIndexes: [],
                    positionStart: i,
                    positionEnd: -1,
                    // Nested tables have no "before" text of their own.
                    htmlBefore: parentTable ? '' : html.substring(htmlBeforeStart, i),
                    html: '',
                    rows: [],
                });
                if (parentTable) {
                    parentTable.childTableIndexes.push(tableIndex);
                }
                continue;
            }
            var isTableEnd = char === '>' &&
                tablesOpen.length > 0 &&
                html.substring(i - 7, i + 1).toLowerCase() === '</table>';
            if (isTableEnd) {
                var tableData = tablesOpen.pop();
                tableData.positionEnd = i;
                // One slice instead of appending every character to every open table.
                tableData.html = html.substring(tableData.positionStart, i + 1);
                tablesData.push(tableData);
                if (tablesOpen.length === 0) {
                    htmlBeforeStart = i + 1;
                }
            }
        }
        this.addTableCells(tablesData);
        this.addTableCellValues(tablesData, options);
        this.addMissingNameCol(tablesData);
        var removeEmptyColumnsOption = options ? options.removeEmptyColumns : undefined;
        if (removeEmptyColumnsOption === undefined || removeEmptyColumnsOption === null || removeEmptyColumnsOption) {
            this.removeEmptyColumns(tablesData);
        }
        this.mergeHeaderRows(tablesData);
        return tablesData;
    };

    /**
     * Copies the combined header label stored on the first body row
     * (`headerCol`) into the header row directly above it, so that row's
     * `valueParsed` holds the merged label. Mutates `tables` in place.
     *
     * @param {Array<object>} tables Tables whose cells already carry values.
     */
    HtmlTableExtractor.prototype.mergeHeaderRows = function (tables) {
        for (var t = 0; t < tables.length; t++) {
            var table = tables[t];
            var bodyRowIndex = table.rows.findIndex(function (row) {
                return row.some(function (col) { return !col.isHeaderRowCell; });
            });
            var headerRowIndex = bodyRowIndex - 1;
            var bodyRow = table.rows[bodyRowIndex];
            var headerRow = table.rows[headerRowIndex];
            if (!bodyRow || headerRowIndex < 0) {
                continue;
            }
            for (var i = 0; i < bodyRow.length; i++) {
                var headerCol = headerRow[i];
                var bodyCol = bodyRow[i];
                if (!headerCol || !bodyCol) {
                    continue;
                }
                if (bodyCol.headerCol !== null && bodyCol.headerCol !== undefined) {
                    headerCol.valueParsed = bodyCol.headerCol;
                }
            }
        }
    };

    /**
     * Drops columns in which every cell is either a header-row cell or has a
     * null `valueParsed`, then drops rows left without any non-null value.
     * Only the column indexes of the first row are examined. Mutates and
     * returns `tables`.
     *
     * @param {Array<object>} tables
     * @returns {Array<object>} The same array.
     */
    HtmlTableExtractor.prototype.removeEmptyColumns = function (tables) {
        tables.forEach(function (table) {
            var firstRow = table.rows[0];
            var columnCount = firstRow ? firstRow.length : 0;
            var emptyColumns = new Set();
            var isEmptyColumn = function (c) {
                return table.rows.every(function (row) {
                    var cell = row[c];
                    // A missing cell counts as "not empty", as before.
                    return Boolean(cell) && (cell.valueParsed === null || Boolean(cell.isHeaderRowCell));
                });
            };
            for (var c = 0; c < columnCount; c++) {
                if (isEmptyColumn(c)) {
                    emptyColumns.add(c);
                }
            }
            for (var r = 0; r < table.rows.length; r++) {
                table.rows[r] = table.rows[r].filter(function (_, i) { return !emptyColumns.has(i); });
            }
            table.rows = table.rows.filter(function (row) {
                return row.some(function (col) { return col.valueParsed !== null; });
            });
        });
        return tables;
    };

    /**
     * Scans each table's HTML and fills `table.rows` with a grid of cell
     * objects, expanding `rowspan`/`colspan` so a spanning cell occupies every
     * slot it covers. The markup of nested child tables is copied verbatim
     * into the enclosing cell and is not scanned for rows or cells.
     *
     * @param {Array<object>} tables Tables produced by the scan in `extractTables`.
     */
    HtmlTableExtractor.prototype.addTableCells = function (tables) {
        if (tables.length === 0) {
            return;
        }
        var tablesByIndex = new Map(tables.map(function (t) { return [t.tableIndex, t]; }));
        tablesByIndex.forEach(function (table) {
            var tableHtml = table.html;
            // Relative start offset of each child table -> relative end offset.
            var skipIndexMap = new Map();
            table.childTableIndexes.forEach(function (childIndex) {
                var child = tablesByIndex.get(childIndex);
                if (child) {
                    skipIndexMap.set(child.positionStart - table.positionStart, child.positionEnd - table.positionStart);
                }
            });
            var grid = [];
            var isInCell = false;
            var isInCellAtts = false;
            var cellAtts = '';
            var cellHTML = '';
            var rowIndex = -1;
            var tableCellIndex = -1;
            var createCell = function (html, atts) {
                // Attributes may be separated by any whitespace, not only a space.
                var attributePairs = atts
                    .toLowerCase()
                    .split(/\s+/)
                    .map(function (att) { return att.split('='); });
                var rowSpan = readSpan(attributePairs, 'rowspan');
                var colSpan = readSpan(attributePairs, 'colspan');
                var cell = {
                    // Drops the leading "<td " / "<th " and the trailing ">".
                    attributes: atts.length > 4 ? atts.substring(4, atts.length - 1) : '',
                    html: html,
                    colSpan: colSpan,
                    rowSpan: rowSpan,
                    tableCellIndex: tableCellIndex,
                    rowIndex: rowIndex,
                    colIndex: -1,
                    isHeaderRowCell: false,
                    isBodyTitleRowCell: false,
                    valueParsed: null,
                    headerCol: null,
                    headerRowIndex: null,
                };
                var curRow = grid[rowIndex] || [];
                // First free slot: earlier row-spanning cells may already occupy some.
                var nextEmptyCellIndex = curRow.findIndex(function (slot) { return !slot; });
                var idxStart = nextEmptyCellIndex === -1 ? curRow.length : nextEmptyCellIndex;
                for (var r = rowIndex; r < rowIndex + rowSpan; r++) {
                    grid[r] = grid[r] || [];
                    for (var c = idxStart; c < idxStart + colSpan; c++) {
                        cell.colIndex = cell.colIndex > -1 ? cell.colIndex : c;
                        grid[r][c] = cell;
                    }
                }
            };
            for (var i = 0; i < tableHtml.length; i++) {
                var skipIndex = skipIndexMap.get(i);
                if (skipIndex) {
                    cellHTML += tableHtml.substring(i, skipIndex + 1);
                    i = skipIndex;
                    continue;
                }
                var char = tableHtml[i];
                var tagPrefix = char === '<' ? tableHtml.substring(i, i + 3).toLowerCase() : '';
                // Require a real tag boundary so `<thead>` is not read as `<th`.
                var hasTagNameEnd = tagPrefix !== '' && isTagNameEnd(tableHtml[i + 3]);
                var isCellAttsStart = hasTagNameEnd && (tagPrefix === '<td' || tagPrefix === '<th');
                var isRowStart = hasTagNameEnd && tagPrefix === '<tr';
                var isSelfEnclosed = isInCellAtts && tableHtml[i - 1] === '/' && char === '>';
                var isCellAttsEnd = (isInCell && char === '>') || isSelfEnclosed;
                var closingTag = char === '>' ? tableHtml.substring(i - 4, i + 1).toLowerCase() : '';
                var isCellEnd = closingTag === '</td>' || closingTag === '</th>';
                if (isRowStart) {
                    rowIndex++;
                    grid[rowIndex] = grid[rowIndex] || [];
                }
                if (isCellAttsStart) {
                    tableCellIndex++;
                    isInCell = true;
                    isInCellAtts = true;
                }
                if (isInCellAtts) {
                    cellAtts += char;
                }
                if (isInCell) {
                    cellHTML += char;
                }
                if (isCellAttsEnd) {
                    isInCellAtts = false;
                }
                if (isCellEnd || isSelfEnclosed) {
                    isInCell = false;
                    isInCellAtts = false;
                    createCell(cellHTML, cellAtts);
                    cellHTML = '';
                    cellAtts = '';
                }
            }
            table.rows = grid;
        });
    };

    /**
     * When the first populated body column holds string values but has no
     * header label, labels it `[name]`: header-row cells get it as
     * `valueParsed`, body cells as `headerCol`. Existing values are kept.
     * Tables without a detected header row are left alone.
     *
     * @param {Array<object>} tables Tables whose cells already carry values.
     */
    HtmlTableExtractor.prototype.addMissingNameCol = function (tables) {
        tables.forEach(function (table) {
            var bodyIndex = table.rows.findIndex(function (row) {
                return row.some(function (col) { return !col.isHeaderRowCell; });
            });
            // Smallest column index that holds a truthy value in any body row.
            var firstPopulatedColIndex = Infinity;
            for (var i = bodyIndex; i < table.rows.length; i++) {
                var bodyRow = table.rows[i];
                if (!bodyRow) {
                    continue;
                }
                var populatedIndex = bodyRow.findIndex(function (col) { return col && col.valueParsed; });
                if (populatedIndex > -1 && populatedIndex < firstPopulatedColIndex) {
                    firstPopulatedColIndex = populatedIndex;
                }
                if (firstPopulatedColIndex === 0) {
                    break;
                }
            }
            var shouldAddName = table.rows.some(function (row) {
                var firstCol = row[firstPopulatedColIndex];
                // Skip when the column already has a header, or there is no header row.
                if (!firstCol || firstCol.headerCol || firstCol.headerRowIndex === null) {
                    return false;
                }
                // A string in the first column is assumed to be a row name.
                return typeof firstCol.valueParsed === 'string';
            });
            if (!shouldAddName) {
                return;
            }
            table.rows.forEach(function (row) {
                var col = row[firstPopulatedColIndex];
                if (!col) {
                    return;
                }
                var isEmptyRow = row.every(function (c) { return Boolean(c) && c.valueParsed === null; });
                if (!isEmptyRow && col.isHeaderRowCell) {
                    if (col.valueParsed === null || col.valueParsed === undefined) {
                        col.valueParsed = '[name]';
                    }
                }
                else if (!col.isHeaderRowCell) {
                    if (col.headerCol === null || col.headerCol === undefined) {
                        col.headerCol = '[name]';
                    }
                }
            });
        });
    };

    /**
     * Parses every cell's HTML into `valueParsed` and sets `headerCol`,
     * `headerRowIndex`, `isHeaderRowCell` and `isBodyTitleRowCell`.
     *
     * The header row index comes from `options.getHeaderRowIndex({ rows, table })`
     * when supplied. Otherwise the body is taken to start at the first row that
     * either follows a bottom-bordered row without being bottom-bordered itself,
     * or contains a number outside the open range (1900, 2100), which is treated
     * as a year. The header is the row just above it.
     *
     * @param {Array<object>} tables Tables whose `rows` grid has been built.
     * @param {object} [options] Also forwarded to `parseValue`.
     */
    HtmlTableExtractor.prototype.addTableCellValues = function (tables, options) {
        var self = this;
        var getHeaderRowIndexDefault = function (data) {
            var rows = data.rows;
            var bodyIndex = rows.findIndex(function (row, r) {
                var prevRow = rows[r - 1] || [];
                if (hasBottomBorder(prevRow) && !hasBottomBorder(row)) {
                    return true;
                }
                return row.some(function (col) {
                    var valueParsed = self.parseValue(col.html, options);
                    var isNumber = typeof valueParsed === 'number';
                    var isYear = isNumber && valueParsed > 1900 && valueParsed < 2100;
                    return isNumber && !isYear;
                });
            });
            return bodyIndex - 1;
        };
        var customCallback = options ? options.getHeaderRowIndex : undefined;
        var getHeaderRowIndexCb = customCallback !== null && customCallback !== undefined
            ? customCallback
            : getHeaderRowIndexDefault;
        var getHeaderRowIndex = function (data) {
            return Math.max(getHeaderRowIndexCb(data), -1);
        };
        // Next cell in the row that is a different cell from the one at `colIndex`.
        var getNextCell = function (row, colIndex) {
            var startingCol = row[colIndex];
            var startingCellIndex = startingCol ? startingCol.tableCellIndex : undefined;
            for (var i = colIndex; i < row.length; i++) {
                if (!row[i]) {
                    continue;
                }
                if (row[i].tableCellIndex !== startingCellIndex) {
                    return row[i];
                }
            }
            return null;
        };
        // Spanning cells appear in several grid slots; process each object once.
        var completedCells = new Set();
        tables.forEach(function (table) {
            var headerRowIndex = getHeaderRowIndex({ rows: table.rows, table: table });
            // Built once, on the first body cell. By then every header row has
            // been parsed, because rows are visited top to bottom.
            var headerByIndex = null;
            var getHeaderCol = function (c) {
                if (headerByIndex === null) {
                    headerByIndex = buildHeaderByIndex(table.rows, headerRowIndex);
                }
                var header = headerByIndex.get(c);
                return header === undefined ? null : header;
            };
            for (var rowIndex = 0; rowIndex < table.rows.length; rowIndex++) {
                var row = table.rows[rowIndex];
                if (!row) {
                    continue;
                }
                var countUniqueCells = new Set(row.map(function (c) { return c.tableCellIndex; })).size;
                var isHeaderRow = rowIndex <= headerRowIndex;
                // A single cell occupying a whole body row is a section title.
                var isBodyTitleRow = rowIndex > headerRowIndex &&
                    countUniqueCells === 1 &&
                    Boolean(row[0]) &&
                    row[0].colSpan > 0;
                for (var colIndex = 0; colIndex < row.length; colIndex++) {
                    var cell = row[colIndex];
                    if (!cell || completedCells.has(cell)) {
                        continue;
                    }
                    cell.headerRowIndex = headerRowIndex > -1 ? headerRowIndex : null;
                    cell.isBodyTitleRowCell = isBodyTitleRow;
                    cell.isHeaderRowCell = isHeaderRow;
                    // A negative such as "(123" may have its ")" in the next cell.
                    var nextCell = getNextCell(row, colIndex);
                    var isMissingParenthesis = Boolean(nextCell) &&
                        nextCell.html.includes(')') &&
                        cell.html.includes('(') &&
                        !cell.html.includes(')');
                    var rawValue = isMissingParenthesis ? cell.html.trim() + ')' : cell.html.trim();
                    var colValue = self.parseValue(rawValue, options);
                    cell.valueParsed = typeof colValue === 'string' ? colValue.replace(/\s+/g, ' ') : colValue;
                    cell.headerCol = isHeaderRow ? null : getHeaderCol(colIndex);
                    completedCells.add(cell);
                }
            }
        });
    };

    /**
     * Removes HTML tags from `str`, keeping the text between them.
     *
     * @param {string} str Markup to strip.
     * @param {object} [options]
     * @param {string[]} [options.tagsToExclude] Tag names whose whole element
     *   (opening tag, content, closing tag) is removed. Matching ignores case.
     *   An excluded tag without a closing tag is stripped like any other tag
     *   and its content is kept.
     * @returns {string} Text without tags.
     */
    HtmlTableExtractor.prototype.stripHtml = function (str, options) {
        var tagsToExclude = (options && options.tagsToExclude) || [];
        var remaining = str;
        if (tagsToExclude.length > 0) {
            remaining = '';
            var i = 0;
            while (i < str.length) {
                var excludedEnd = str[i] === '<' ? findExcludedElementEnd(str, i, tagsToExclude) : -1;
                if (excludedEnd === -1) {
                    remaining += str[i];
                    i++;
                }
                else {
                    i = excludedEnd;
                }
            }
        }
        return remaining.replace(TAG_PATTERN, '');
    };

    /**
     * Converts a cell's HTML/text into a number, a string, or null.
     *
     * - `null`/`undefined` -> null; a number is returned unchanged.
     * - A value consisting solely of a dash -> the string '-'.
     * - Empty text, or just '-' / '$' after cleanup -> null.
     * - Numeric text -> number. It is negated when the text contains '-' or a
     *   parenthesis that does not follow a number, e.g. "(1,234)" -> -1234.
     * - Numeric text containing '%' -> the cleaned text itself.
     * - Anything else -> the cleaned text.
     *
     * @param {string|number|null|undefined} str
     * @param {object} [options]
     * @param {boolean} [options.stripHtml=true] Strip tags before parsing.
     * @param {string[]} [options.tagsToExclude=[]] See `stripHtml`.
     * @param {boolean} [options.stripParenthesis=false] Drop "(...)" groups first.
     * @returns {number|string|null}
     */
    HtmlTableExtractor.prototype.parseValue = function (str, options) {
        if (str === null || str === undefined) {
            return null;
        }
        if (typeof str === 'number') {
            return str;
        }
        var opts = options || {};
        var shouldStripHtml = opts.stripHtml === undefined ? true : opts.stripHtml;
        var tagsToExclude = opts.tagsToExclude === undefined ? [] : opts.tagsToExclude;
        var stripParenthesis = opts.stripParenthesis === undefined ? false : opts.stripParenthesis;
        var stripped = shouldStripHtml ? this.stripHtml(str, { tagsToExclude: tagsToExclude }) : str;
        var text = stripped
            .replace(NBSP_OR_NEWLINE_PATTERN, ' ')
            .replace(SYMBOL_PATTERN, '')
            .replace(DASH_PATTERN, '-')
            .replace(QUOTE_PATTERN, "'");
        if (stripParenthesis) {
            text = text.replace(/\(.*?\)/g, '');
        }
        text = text
            .replace(/\s+/g, ' ')
            .replace(ENTITY_PATTERN, ' ')
            .trim();
        if (str.replace(DASH_PATTERN, '-') === '-') {
            return '-';
        }
        if (text === '') {
            return null;
        }
        var colNum = text.replace(/,|\(|\)|%/g, '').trim();
        if (colNum === '-' || colNum === '$') {
            return null;
        }
        colNum = colNum.replace(/-|\$/g, '');
        // "12 (note)" style: the parenthesis is a remark, not a negative sign.
        var hasNumBeforeParenthesis = /\d+\s*(?=\()/.test(text);
        if (hasNumBeforeParenthesis) {
            colNum = colNum.split(' ')[0].trim();
        }
        var numeric = Number(colNum);
        if (Number.isNaN(numeric)) {
            return text;
        }
        if (text.includes('%')) {
            return text.replace(/[^a-zA-Z\d\s:]/g, '') === '' ? null : text;
        }
        var isNegative = (text.includes('(') && !hasNumBeforeParenthesis) || text.includes('-');
        return isNegative ? numeric * -1 : numeric;
    };

    return HtmlTableExtractor;
}());
exports.default = HtmlTableExtractor;