UNPKG

sec-edgar-parser

Version:

A tool for scraping SEC EDGAR archive documents found at https://www.sec.gov/Archives/edgar/data

291 lines (290 loc) 11.2 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.toCamelCase = toCamelCase;
exports.parseYamlLikeString = parseYamlLikeString;
exports.badXmlToObj = badXmlToObj;
exports.trimDocument = trimDocument;
exports.getObjectFromUrl = getObjectFromUrl;
exports.getObjectFromString = getObjectFromString;
const tslib_1 = require("tslib");
const yaml = tslib_1.__importStar(require("yaml"));
const { XMLParser } = require("fast-xml-parser");

/** Matches the `__<line number>` suffix that badYamlToObj appends to every key. */
const NUMBERED_SUFFIX = /__\d+$/;

/** Keys whose value is always wrapped in an array, even when only one entry is present. */
const REPEATABLE_KEYS = [
    "filedBy",
    "serialCompany",
    "subjectCompany",
    "reportingOwner",
    "issuer",
    "filedFor",
    "filer",
    "references429",
    "itemInformation",
    "groupMembers",
    "absAssetClass",
    "absSubAssetClass",
    "formerCompany",
];

/**
 * Converts a string to camelCase.
 *
 * Words are separated by runs of whitespace and/or hyphens. The first word is
 * lowercased; every following word is capitalized and the rest lowercased.
 *
 * @param {string} str - The string to convert.
 * @returns {string} The camelCase form of `str`.
 */
function toCamelCase(str) {
    return str
        .split(/[\s-]+/)
        // A leading or trailing separator yields empty segments; dropping them
        // keeps the first real word at index 0 so it stays lowercase.
        .filter((word) => word !== "")
        .map((word, index) => index === 0
            ? word.toLowerCase()
            : word.charAt(0).toUpperCase() + word.slice(1).toLowerCase())
        .join("");
}

/**
 * Counts the leading space characters of a line.
 *
 * @param {string} line
 * @returns {number}
 */
const spaceDepth = (line) => line.match(/^ */)[0].length;

/**
 * Removes up to `N` leading spaces from `str`.
 *
 * @param {string} str
 * @param {number} N - Maximum number of leading spaces to strip.
 * @returns {string}
 */
const removeNSpaces = (str, N) => str.slice(Math.min(spaceDepth(str), N));

/**
 * Replaces each leading tab of `str` with two spaces.
 *
 * @param {string} str
 * @returns {string}
 */
const tabsToSpaces = (str) => str.replace(/^\t+/, (tabs) => " ".repeat(2 * tabs.length));

/**
 * Rewrites the indentation-based `KEY: value` header text into YAML and parses it.
 *
 * Every key gets a `__<line index>` suffix so that repeated keys stay distinct
 * in the YAML mapping, and every non-empty value is single-quoted so it is
 * read as a string.
 *
 * @param {string} text - The `KEY: value` portion of the header.
 * @returns {*} The value returned by `yaml.parse`, with keys camelCased.
 */
function badYamlToObj(text) {
    const lines = text.split("\n").map(tabsToSpaces);
    const depthOfFirstLine = spaceDepth(lines[0]);
    const normalizedLines = lines.map((line, i) => {
        const cleaned = removeNSpaces(line, depthOfFirstLine);
        // Split on the FIRST colon only, so values that contain colons are kept whole.
        const colonAt = cleaned.indexOf(":");
        const key = colonAt === -1 ? cleaned : cleaned.slice(0, colonAt);
        if (!key.trim()) {
            return cleaned;
        }
        // A line without any colon is kept as a key with an empty value.
        const rawVal = colonAt === -1 ? "" : cleaned.slice(colonAt + 1).trim();
        // Single quotes are doubled because the value is emitted inside '...'.
        const cleanVal = rawVal === "" ? "" : ` '${rawVal.replace(/'/g, "''")}'`;
        return `${key}__${i}:${cleanVal}`;
    });
    const obj = yaml.parse(`${normalizedLines.join("\n")}\n`);
    return camelizeKeys(obj);
}

/**
 * Wraps the value of every key listed in REPEATABLE_KEYS in an array when it
 * is not one already, at every nesting level. Mutates and returns `obj`.
 *
 * @param {*} obj
 * @returns {*} The same `obj`.
 */
function normalizeKnownKeysAsAppropriateDataTypes(obj) {
    function recurse(node) {
        for (const key in node) {
            const value = node[key];
            if (Array.isArray(value)) {
                node[key] = value.map((item) => (typeof item === "object" ? recurse(item) : item));
            }
            else if (typeof value === "object" && value !== null) {
                node[key] = recurse(value);
            }
            if (REPEATABLE_KEYS.includes(key) && !Array.isArray(node[key])) {
                node[key] = [node[key]];
            }
        }
        return node;
    }
    return recurse(obj);
}

/**
 * Strips the `__<number>` suffix from keys, recursively, merging keys that
 * collide after stripping into an array (in order of appearance).
 * Mutates and returns `obj`; non-object values are returned unchanged.
 *
 * @param {*} obj - The value to flatten.
 * @returns {*} The same `obj`, with numbered keys removed.
 */
function recursivelyFlattenDuplicateKeysWithNumbers(obj) {
    if (typeof obj !== "object" || obj === null) {
        return obj;
    }
    if (Array.isArray(obj)) {
        for (let i = 0; i < obj.length; i++) {
            obj[i] = recursivelyFlattenDuplicateKeysWithNumbers(obj[i]);
        }
        return obj;
    }
    // Iterate over a snapshot of the keys because the loop adds and deletes keys.
    for (const key of Object.keys(obj)) {
        const value = recursivelyFlattenDuplicateKeysWithNumbers(obj[key]);
        if (!NUMBERED_SUFFIX.test(key)) {
            obj[key] = value;
            continue;
        }
        const baseKey = key.replace(NUMBERED_SUFFIX, "");
        delete obj[key];
        if (obj[baseKey] === undefined) {
            obj[baseKey] = value;
        }
        else {
            if (!Array.isArray(obj[baseKey])) {
                obj[baseKey] = [obj[baseKey]];
            }
            obj[baseKey].push(value);
        }
    }
    return obj;
}

/**
 * Parses the `KEY: value` portion of an SEC header into an object.
 *
 * Runs three passes: build an object with line-numbered keys, collapse the
 * numbered keys (merging duplicates into arrays), then force the keys in
 * REPEATABLE_KEYS to be arrays.
 *
 * @param {string} text - The header text to parse.
 * @returns {*} The parsed header (synchronously; no Promise is involved).
 */
function parseYamlLikeString(text) {
    let obj = badYamlToObj(text); // first pass: just create a valid object
    obj = recursivelyFlattenDuplicateKeysWithNumbers(obj); // second pass: clean up the keys
    obj = normalizeKnownKeysAsAppropriateDataTypes(obj);
    return obj;
}

/**
 * Repairs the tag-based part of the header and parses it with XMLParser.
 *
 * Line by line (after trimming):
 *  - `</TAG>` and tag-only `<TAG>` lines are passed through;
 *  - `<TAG>value` lines that do not end with `>` get a matching `</TAG>` appended;
 *  - `KEY: value` lines become `<KEY>value</KEY>`, with spaces in the key
 *    replaced by hyphens;
 *  - any other line is passed through as content.
 *
 * @param {string} xmlString - Text wrapped in a `<SEC-HEADER>` element.
 * @returns {*} The `secHeader` property of the parsed, camelCased document
 *   (`undefined` when the document has no such element).
 */
function badXmlToObj(xmlString) {
    let correctedXML = "";
    for (const line of xmlString.split("\n")) {
        const currentLine = line.trim();
        if (currentLine.startsWith("</")) {
            correctedXML += `${currentLine}\n`;
        }
        else if (currentLine.startsWith("<")) {
            const tagName = currentLine.split("<")[1].split(">")[0];
            // A "<" line without any ">" has no value segment; treat it as empty.
            const tagValue = (currentLine.split(">")[1] || "").trim();
            if (tagValue && !currentLine.endsWith(">")) {
                correctedXML += `${currentLine}</${tagName}>\n`;
            }
            else {
                correctedXML += `${currentLine}\n`;
            }
        }
        else if (currentLine.includes(":")) {
            // Split on the FIRST colon only, so values that contain colons are kept whole.
            const colonAt = currentLine.indexOf(":");
            const key = currentLine.slice(0, colonAt).trim();
            const value = currentLine.slice(colonAt + 1).trim();
            const expectedKey = key.replace(/ /g, "-");
            correctedXML += `<${expectedKey}>${value}</${expectedKey}>`;
        }
        else {
            correctedXML += `${currentLine}\n`;
        }
    }
    const xmlParser = new XMLParser();
    const obj = camelizeKeys(xmlParser.parse(correctedXML));
    return obj.secHeader;
}

/**
 * Returns a deep copy of `obj` in which every own enumerable key has been
 * passed through toCamelCase. Arrays are mapped element by element;
 * non-object values are returned unchanged.
 *
 * @param {*} obj
 * @returns {*}
 */
function camelizeKeys(obj) {
    if (typeof obj !== "object" || obj === null) {
        return obj;
    }
    if (Array.isArray(obj)) {
        return obj.map((item) => camelizeKeys(item));
    }
    const newObj = {};
    for (const key of Object.keys(obj)) {
        newObj[toCamelCase(key)] = camelizeKeys(obj[key]);
    }
    return newObj;
}

/**
 * Returns the index of the first line at or after `start` that satisfies
 * `predicate`, or -1 when there is none.
 */
const findLineIndexFrom = (lines, start, predicate) => {
    for (let i = start; i < lines.length; i++) {
        if (predicate(lines[i])) {
            return i;
        }
    }
    return -1;
};

/**
 * Splits a document into its `KEY: value` header section and its tag-based
 * header section.
 *
 * The `KEY: value` section runs from the first line containing
 * "ACCESSION NUMBER:" (or line 0 when absent) up to the next line that starts
 * with "<". The tag-based section runs from that line through the
 * `</SEC-HEADER>` line and is prefixed with an opening `<SEC-HEADER>` line.
 *
 * @param {string} file - Full text of the document.
 * @returns {{ yamlLikeStructure: string, xmlLikeStructure: string }}
 */
function trimDocument(file) {
    const fileLines = file.split("\n");
    const accessionIndex = findLineIndexFrom(fileLines, 0, (line) => line.trim().includes("ACCESSION NUMBER:"));
    const startOfYamlContent = accessionIndex === -1 ? 0 : accessionIndex;
    let endOfYamlLikeContent = findLineIndexFrom(fileLines, startOfYamlContent, (line) => line.trim().startsWith("<"));
    if (endOfYamlLikeContent === -1) {
        // When the header was found but no tag line follows it, the header runs
        // to the end of the text instead of being dropped.
        endOfYamlLikeContent = accessionIndex === -1 ? 0 : fileLines.length;
    }
    const closingIndex = findLineIndexFrom(fileLines, endOfYamlLikeContent, (line) => line.trim() === "</SEC-HEADER>");
    const endOfXMLindex = closingIndex === -1 ? 0 : closingIndex;
    const yamlLikeStructure = fileLines
        .slice(startOfYamlContent, endOfYamlLikeContent)
        .join("\n");
    // The wrapper tag sits on its own line: badXmlToObj works line by line and
    // would otherwise read the first tag line as the value of <SEC-HEADER>.
    const xmlLikeStructure = `<SEC-HEADER>\n${fileLines
        .slice(endOfYamlLikeContent, endOfXMLindex + 1)
        .join("\n")}`;
    return { yamlLikeStructure, xmlLikeStructure };
}

/**
 * Downloads `url` and resolves to the response body as text.
 *
 * @param {string} url - Address of the document to fetch.
 * @param {string} userAgent - Value sent as the `user-agent` request header.
 * @returns {Promise<string>}
 * @throws {Error} When the response status is 403 or any other non-2xx status.
 */
function callTheSEC(url, userAgent) {
    return tslib_1.__awaiter(this, void 0, void 0, function* () {
        const fileResponse = yield fetch(url, {
            headers: { "user-agent": userAgent },
        });
        if (fileResponse.status === 403) {
            throw new Error("FORBIDDEN. SEC rejected the request. Make sure you are using a user agent that identifies yourself.");
        }
        if (!fileResponse.ok) {
            // Do not hand an error page to the parser as if it were a filing.
            throw new Error(`Request to ${url} failed with status ${fileResponse.status}.`);
        }
        return fileResponse.text();
    });
}

/**
 * Fetches the document at `url` and parses its header.
 *
 * @param {string} url - Address of the document to fetch.
 * @param {string} [userAgent=""] - Value sent as the `user-agent` request header.
 * @returns {Promise<object>} See getObjectFromString.
 */
function getObjectFromUrl(url_1) {
    return tslib_1.__awaiter(this, arguments, void 0, function* (url, userAgent = "") {
        const doc = yield callTheSEC(url, userAgent);
        return getObjectFromString(doc);
    });
}

/**
 * Parses the header of a document that is already in memory.
 *
 * The result merges, in increasing precedence: `acceptanceDatetime` (the
 * digits following `<ACCEPTANCE-DATETIME>`, or "" when absent), the fields of
 * the `KEY: value` section, and the fields of the tag-based section.
 *
 * @param {string} text - Full text of the document.
 * @returns {Promise<object>}
 */
function getObjectFromString(text) {
    return tslib_1.__awaiter(this, void 0, void 0, function* () {
        const { yamlLikeStructure, xmlLikeStructure } = trimDocument(text);
        const xmlObj = badXmlToObj(xmlLikeStructure);
        const ymlObj = parseYamlLikeString(yamlLikeStructure);
        const acceptanceDatetimeMatch = text.match(/<ACCEPTANCE-DATETIME>(\d+)/);
        const acceptanceDatetime = acceptanceDatetimeMatch
            ? acceptanceDatetimeMatch[1]
            : "";
        // The tag section can parse to a plain string; Object.assign would then
        // copy its characters in as "0", "1", ... keys, so only merge objects.
        const xmlFields = typeof xmlObj === "object" && xmlObj !== null ? xmlObj : {};
        return Object.assign(Object.assign({ acceptanceDatetime }, ymlObj), xmlFields);
    });
}