// sec-edgar-api
// Version:
// Fetch and parse SEC earnings reports and other filings. Useful for financial analysis.
// 309 lines (308 loc) • 14.9 kB
// JavaScript
"use strict";
/**
 * Shallow-merges the own enumerable properties of every source argument into
 * the first argument and returns that first argument.
 *
 * Reuses an `__assign` already present on `this` when there is one. Otherwise
 * the first call rebinds `__assign` to `Object.assign` (or to a manual
 * fallback when `Object.assign` is unavailable) and forwards the call to it.
 */
var __assign = (this && this.__assign) || function () {
    __assign = Object.assign || function (target) {
        var total = arguments.length;
        for (var argIndex = 1; argIndex < total; argIndex++) {
            var source = arguments[argIndex];
            for (var key in source) {
                // Copy own properties only; inherited ones are skipped.
                if (Object.prototype.hasOwnProperty.call(source, key)) {
                    target[key] = source[key];
                }
            }
        }
        return target;
    };
    return __assign.apply(this, arguments);
};
Object.defineProperty(exports, "__esModule", { value: true });
var ColNode_1 = require("./XMLNode/ColNode");
var DocumentNode_1 = require("./XMLNode/DocumentNode");
var HRNode_1 = require("./XMLNode/HRNode");
var NonTableNode_1 = require("./XMLNode/NonTableNode");
var RowNode_1 = require("./XMLNode/RowNode");
var TableNode_1 = require("./XMLNode/TableNode");
/**
 * Character-by-character markup walker with helpers that collect table cell
 * text and build a document node tree.
 *
 * The walker does not validate that tags are balanced; it only tracks a
 * dot-separated, lower-cased path of the currently open tags.
 *
 * @deprecated use XMLParser
 */
var XMLParserLegacy = /** @class */ (function () {
    // Tags treated as having no close tag: onCloseTag fires right after
    // onOpenTag and the current path is not descended into.
    var SELF_ENCLOSING_TAGS = new Set([
        'filename',
        'description',
        'br',
        'meta',
        'link',
        'img',
        'input',
        'hr',
        'area',
        'base',
        'col',
        'command',
        'embed',
        'keygen',
        'param',
        'source',
        'track',
        'wbr',
    ]);
    // Characters that terminate a tag name inside an open tag.
    var TAG_SPACE_CHARS = new Set(['\n', '\r', '\t', ' ']);
    // Characters replaced by a single space when collecting table cell text.
    var TEXT_SPACE_CHARS = new Set(['\n', '\r', '\t']);
    // Upper bound on the number of characters scanned inside one open tag.
    var MAX_TAG_LENGTH = 1000000;

    /** Returns `value`, or `fallback` when `value` is null or undefined. */
    function valueOr(value, fallback) {
        return value === null || value === void 0 ? fallback : value;
    }

    function XMLParserLegacy() {
    }

    /**
     * Walks `xml` one character at a time and reports open tags, close tags
     * and text characters through the optional callbacks.
     *
     * Every callback receives `{ char, index, path, pathOccurrenceCount,
     * attributesStr }`. For tag callbacks `char` / `index` refer to the `<`
     * that starts the tag. `path` is the dot-separated, lower-cased chain of
     * open tag names. `attributesStr` accumulates across consecutive open tags
     * and is cleared only when a close tag is seen.
     *
     * `<?...`, `<!...` sequences are not treated as tags; their characters are
     * reported through `onCharacter` once the first open tag has been seen.
     *
     * A tag that is still unterminated when the input ends is treated as if it
     * ended at the end of the input.
     *
     * @param {Object} params
     * @param {string} params.xml markup to walk
     * @param {Function} [params.onOpenTag] called for each open tag
     * @param {Function} [params.onCloseTag] called for each close tag, and
     *   right after onOpenTag for self-enclosing tags
     * @param {Function} [params.onCharacter] called for each character outside
     *   of tags, after the first open tag
     * @returns {string[]} the path of every open tag, in document order
     * @throws {Error} 'too many iterations' when one open tag spans more than
     *   1,000,000 characters
     */
    XMLParserLegacy.prototype.iterateXML = function (params) {
        var xml = params.xml;
        var onCharacter = params.onCharacter;
        var onCloseTag = params.onCloseTag;
        var onOpenTag = params.onOpenTag;
        var pathOccurrenceCountMap = new Map();
        var curPath = '';
        var curAttributes = '';
        var didStart = false;
        var pathsArr = [];
        for (var i = 0; i < xml.length; i++) {
            var char = xml[i];
            var next = xml[i + 1];
            var isOpenTag = char === '<' && next !== '/' && next !== '?' && next !== '!';
            var isCloseTag = char === '<' && next === '/';
            var data = {
                char: char,
                index: i,
                path: curPath,
                pathOccurrenceCount: valueOr(pathOccurrenceCountMap.get(curPath), 0),
                attributesStr: curAttributes,
            };
            if (isOpenTag) {
                var curTag = '';
                var didEndTagName = false;
                var scanned = 0;
                didStart = true;
                i++;
                // Bounded by xml.length so a truncated tag cannot run past the input.
                while (i < xml.length && xml[i] !== '>') {
                    var tagChar = xml[i];
                    didEndTagName = didEndTagName || TAG_SPACE_CHARS.has(tagChar) || tagChar === '/';
                    if (!didEndTagName) {
                        curTag += tagChar.toLowerCase();
                    }
                    else if (tagChar !== '/') {
                        curAttributes += tagChar;
                    }
                    i++;
                    scanned++;
                    if (scanned > MAX_TAG_LENGTH) {
                        throw new Error('too many iterations');
                    }
                }
                var pathNew = (curPath + (curPath.length > 0 ? '.' : '') + curTag).toLowerCase();
                var occurrence = valueOr(pathOccurrenceCountMap.get(pathNew), 0) + 1;
                pathOccurrenceCountMap.set(pathNew, occurrence);
                data.path = pathNew;
                data.pathOccurrenceCount = occurrence;
                data.attributesStr = curAttributes;
                pathsArr.push(pathNew);
                if (onOpenTag !== null && onOpenTag !== void 0) {
                    onOpenTag(data);
                }
                if (SELF_ENCLOSING_TAGS.has(curTag)) {
                    if (onCloseTag !== null && onCloseTag !== void 0) {
                        onCloseTag(data);
                    }
                }
                else {
                    curPath = pathNew;
                }
            }
            else if (isCloseTag) {
                // Bounded by xml.length so a truncated close tag cannot loop forever.
                while (i < xml.length && xml[i] !== '>') {
                    i++;
                }
                if (onCloseTag !== null && onCloseTag !== void 0) {
                    onCloseTag(data);
                }
                // Pop the last path segment. A root-level path has no '.', and
                // slicing with -1 would only chop its last character, so clear it.
                var lastDot = curPath.lastIndexOf('.');
                curPath = lastDot === -1 ? '' : curPath.slice(0, lastDot);
                curAttributes = '';
            }
            else if (didStart) {
                if (onCharacter !== null && onCharacter !== void 0) {
                    onCharacter(data);
                }
            }
        }
        return pathsArr;
    };

    /**
     * Returns text in each table cell mapped by `${table}.${row}.${col}`
     *
     * The three counters are 1-based and advance on table / row / cell open
     * tags. With `parentPath`, only tables directly under that path (with
     * rows optionally inside tbody / thead / tfoot) are counted; without it,
     * any path ending in `table`, `tr`, `td` or `th` is counted.
     *
     * Newline, carriage return and tab characters are stored as spaces, and a
     * trailing space is appended to non-empty text whenever a tag closes.
     *
     * @param {Object} params
     * @param {string} params.xml markup to walk
     * @param {string} [params.parentPath] path of the element containing the tables
     * @param {boolean} [params.trimSpaces=true] collapse runs of spaces into one
     * @param {Function} [params.onOpenTag] forwarded iterateXML callback; the
     *   data additionally carries `textMap`
     * @param {Function} [params.onCloseTag] same as above, for close tags
     * @param {Function} [params.onCharacter] same as above, for characters
     * @returns {Map<string, string>} cell text keyed by `${table}.${row}.${col}`
     */
    XMLParserLegacy.prototype.getTableTextMap = function (params) {
        var xml = params.xml;
        var parentPath = params.parentPath;
        var onCharacter = params.onCharacter;
        var onCloseTag = params.onCloseTag;
        var onOpenTag = params.onOpenTag;
        var trimSpaces = params.trimSpaces === void 0 ? true : params.trimSpaces;
        var tablePath = "".concat(parentPath, ".table");
        var rowPaths = new Set();
        var colPaths = new Set();
        ['.tbody', '.thead', '.tfoot', ''].forEach(function (section) {
            var rowPath = tablePath + section + '.tr';
            rowPaths.add(rowPath);
            colPaths.add(rowPath + '.td');
            colPaths.add(rowPath + '.th');
        });
        var table = 0;
        var row = 0;
        var col = 0;
        var textByColKey = new Map();
        var currentKey = function () {
            return "".concat(table, ".").concat(row, ".").concat(col);
        };
        var withTextMap = function (data) {
            return __assign(__assign({}, data), { textMap: textByColKey });
        };
        this.iterateXML({
            xml: xml,
            onOpenTag: function (data) {
                var colKey = currentKey();
                var textCur = valueOr(textByColKey.get(colKey), '');
                var pathLower = data.path.toLowerCase();
                // Text gathered before the first cell of a row is dropped when blank.
                if (textCur.trim().length === 0 && col === 0) {
                    textByColKey.delete(colKey);
                }
                var isTable = parentPath ? pathLower === tablePath : pathLower.endsWith('table');
                var isRow = parentPath ? rowPaths.has(pathLower) : pathLower.endsWith('tr');
                var isCol = parentPath
                    ? colPaths.has(pathLower)
                    : pathLower.endsWith('td') || pathLower.endsWith('th');
                if (isTable) {
                    table++;
                    col = 0;
                    row = 0;
                }
                else if (isRow) {
                    row++;
                    col = 0;
                }
                else if (isCol) {
                    col++;
                }
                if (onOpenTag !== null && onOpenTag !== void 0) {
                    onOpenTag(withTextMap(data));
                }
            },
            onCharacter: function (data) {
                var char = TEXT_SPACE_CHARS.has(data.char) ? ' ' : data.char;
                var colKey = currentKey();
                var textCur = valueOr(textByColKey.get(colKey), '');
                var isRepeatedSpace = trimSpaces && char === ' ' && textCur.endsWith(' ');
                if (!isRepeatedSpace) {
                    textByColKey.set(colKey, "".concat(textCur).concat(char));
                }
                if (onCharacter !== null && onCharacter !== void 0) {
                    onCharacter(withTextMap(data));
                }
            },
            onCloseTag: function (data) {
                var colKey = currentKey();
                var textCur = valueOr(textByColKey.get(colKey), '');
                if (textCur.trim().length === 0 && col === 0) {
                    textByColKey.delete(colKey);
                }
                else if (!textCur.endsWith(' ')) {
                    // Keep text of adjacent inline elements from running together.
                    textByColKey.set(colKey, "".concat(textCur, " "));
                }
                if (onCloseTag !== null && onCloseTag !== void 0) {
                    onCloseTag(withTextMap(data));
                }
            },
        });
        return textByColKey;
    };

    /**
     * Builds a DocumentNode from `xml`.
     *
     * Top-level children are HRNode (an `hr` outside a table), TableNode and
     * NonTableNode instances. `tr` tags become RowNode and `td` / `th` tags
     * become ColNode instances that are linked to their previous sibling and
     * to the cell above them. Characters are appended to the text of whichever
     * node was created last.
     *
     * Text introduced by a bold tag (`b`, `strong`, or a font-weight of
     * bold/700/800/900 in the attribute string) is wrapped in `{{` and `}}`.
     * Open tags inside a nested table are ignored; their characters are still
     * appended to the current node.
     *
     * @param {Object} params
     * @param {string} params.xml markup to convert
     * @returns {DocumentNode} document node whose text is set to `xml`
     */
    XMLParserLegacy.prototype.getDocumentNode = function (params) {
        var xml = params.xml;
        var rowsArr = [];
        var colsArr = [];
        var documentNode = new DocumentNode_1.DocumentNode();
        var curNode = null;
        var prevRowCols = [];
        var curRowCols = [];
        var isBold = false;
        var boldPath = null;
        // Registers a cell in the current row once per spanned column and
        // links it to the cell at the same column index in the previous row.
        var pushColToRow = function (col) {
            var colIndex = curRowCols.length;
            col.setIndex(colIndex);
            var colSpan = col.getColSpan();
            Array.from({ length: colSpan }).forEach(function () { return curRowCols.push(col); });
            var topSibling = valueOr(prevRowCols[colIndex], null);
            if (topSibling !== null) {
                topSibling.addBottomSibling(col);
            }
        };
        var hasCurNode = function () {
            return curNode !== null && curNode !== void 0;
        };
        this.iterateXML({
            xml: xml,
            onCloseTag: function () {
                var nodePath = hasCurNode() ? curNode.getPath() : void 0;
                if (nodePath === boldPath) {
                    // Close the bold marker opened for this node.
                    if (hasCurNode()) {
                        curNode.setText("".concat(valueOr(curNode.getText(), ''), "}}"));
                    }
                    boldPath = null;
                }
            },
            onCharacter: function (data) {
                if (hasCurNode()) {
                    curNode.setText(valueOr(curNode.getText(), '') + data.char);
                }
            },
            onOpenTag: function (data) {
                var path = data.path;
                var attributesStr = data.attributesStr;
                var segments = path.split('.');
                // skip nested tables
                var tableDepth = segments.filter(function (segment) { return segment === 'table'; }).length;
                if (tableDepth > 1) {
                    return;
                }
                var tag = segments[segments.length - 1];
                var isInTable = path.includes('table');
                var topLevelNodes = documentNode.getChildren();
                var prevTopLevelNode = topLevelNodes[topLevelNodes.length - 1];
                var wasHorizontalLine = prevTopLevelNode instanceof HRNode_1.HRNode;
                var wasNonTableNode = prevTopLevelNode instanceof NonTableNode_1.NonTableNode;
                var wasBold = isBold;
                var attributesLower = attributesStr.toLowerCase().replace(/\s/g, '');
                isBold =
                    tag === 'b' ||
                        tag === 'strong' ||
                        attributesLower.includes('font-weight:bold') ||
                        attributesLower.includes('font-weight:700') ||
                        attributesLower.includes('font-weight:800') ||
                        attributesLower.includes('font-weight:900');
                if (!isInTable) {
                    // Leaving a table: cell-above links must not cross tables.
                    prevRowCols = [];
                    curRowCols = [];
                }
                if (tag === 'hr' && !isInTable) {
                    var hr = new HRNode_1.HRNode({ attributesStr: attributesStr, path: path });
                    hr.setPreviousSibling(valueOr(prevTopLevelNode, null));
                    topLevelNodes.push(hr);
                    curNode = hr;
                }
                else if (tag === 'table') {
                    var table = new TableNode_1.TableNode({ attributesStr: attributesStr, path: path });
                    table.setPreviousSibling(valueOr(prevTopLevelNode, null));
                    topLevelNodes.push(table);
                    curNode = table;
                }
                else if (tag === 'tr') {
                    var row = new RowNode_1.RowNode({ attributesStr: attributesStr, path: path });
                    var prevRow = rowsArr[rowsArr.length - 1];
                    row.setParent(prevTopLevelNode);
                    var prevRowParent = prevRow === null || prevRow === void 0 ? void 0 : prevRow.getParent();
                    // Only rows of the same parent are siblings.
                    row.setPreviousSibling(prevRowParent === row.getParent() ? prevRow : null);
                    rowsArr.push(row);
                    prevRowCols = curRowCols;
                    curRowCols = [];
                    curNode = row;
                }
                else if (tag === 'td' || tag === 'th') {
                    var col = new ColNode_1.ColNode({ attributesStr: attributesStr, path: path });
                    var prevCol = colsArr[colsArr.length - 1];
                    col.setParent(rowsArr[rowsArr.length - 1]);
                    var prevColParent = prevCol === null || prevCol === void 0 ? void 0 : prevCol.getParent();
                    // Only cells of the same row are siblings.
                    col.setPreviousSibling(prevColParent === col.getParent() ? prevCol : null);
                    colsArr.push(col);
                    pushColToRow(col);
                    curNode = col;
                }
                else if ((!isInTable && !wasNonTableNode) || (wasHorizontalLine && tag !== 'hr')) {
                    var node = new NonTableNode_1.NonTableNode({ attributesStr: attributesStr, path: path });
                    node.setPreviousSibling(valueOr(prevTopLevelNode, null));
                    topLevelNodes.push(node);
                    curNode = node;
                }
                else if (curNode && !curNode.getText().endsWith('\n')) {
                    // Any other tag inside the current node starts a new line of text.
                    curNode.setText("".concat(curNode.getText().trim(), "\n"));
                }
                if (isBold && !wasBold && hasCurNode() && !curNode.getText().endsWith('{{')) {
                    curNode.setText("".concat(curNode.getText().trim(), "{{"));
                }
                if (isBold) {
                    // Remember which node holds the open marker so onCloseTag can close it.
                    boldPath = hasCurNode() ? valueOr(curNode.getPath(), null) : null;
                }
            },
        });
        documentNode.setText(xml);
        return documentNode;
    };
    return XMLParserLegacy;
}());
// Expose XMLParserLegacy as this module's default export (CommonJS `exports.default`).
exports.default = XMLParserLegacy;