UNPKG

sec-edgar-parser

Version:

A tool for scraping SEC EDGAR archive documents found at https://www.sec.gov/Archives/edgar/data

309 lines (308 loc) 12.2 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.getObjectFromString = exports.getObjectFromUrl = exports.trimDocument = exports.badXmlToObj = exports.parseYamlLikeString = exports.toCamelCase = void 0; const tslib_1 = require("tslib"); const yaml = tslib_1.__importStar(require("yaml")); const { XMLParser } = require("fast-xml-parser"); // Function to convert a string to camelCase function toCamelCase(str) { // Split the string into words based on spaces and hyphens const words = str.split(/[\s-]+/); // Map over the words array and convert each word segment to camelCase const camelCaseString = words .map((word, index) => { // Capitalize the first letter for all word segments except the first one if (index === 0) { return word.toLowerCase(); // Convert the first word segment to lowercase } // Capitalize the first letter and convert the rest to lowercase for subsequent segments return word.charAt(0).toUpperCase() + word.slice(1).toLowerCase(); }) .join(""); // Join the words together to form the camelCase string // Return the camelCase string return camelCaseString; } exports.toCamelCase = toCamelCase; const spaceDepth = (line) => { var _a; return (((_a = line.match(/^ */)) === null || _a === void 0 ? 
void 0 : _a[0]) || "").length; }; const replaceCharAtIndex = (str, index, newChar) => { return str.substring(0, index) + newChar + str.substring(index + 1); }; const removeNSpaces = (str, N) => { let copy = str; for (let index = 0; index < N; index++) { const char = copy[index]; if (char === " ") { copy = replaceCharAtIndex(copy, index, ""); } } return copy; }; const tabsToSpaces = (str) => { const leadingTabs = str.match(/^\t+/); if (!leadingTabs) return str; const numSpaces = 2 * leadingTabs[0].length; const spaces = " ".repeat(numSpaces); return spaces + str.replace(/^\t+/, ""); }; function badYamlToObj(text) { const lines = text.split("\n").map(tabsToSpaces); const depthOfFirstLine = spaceDepth(lines[0]); let normalizedYaml = ""; let i = 0; for (const line of lines) { let cleaned = removeNSpaces(line, depthOfFirstLine); const colonIndex = line.indexOf(":"); const key = colonIndex === -1 ? line : line.slice(0, colonIndex); const rawVal = colonIndex === -1 ? "" : line.slice(colonIndex + 1); if (key.trim()) { let cleanVal = (rawVal || "").trim().replace(/'/g, "''"); if (cleanVal.trim() === "") { cleanVal = ""; } else { cleanVal = ` '${cleanVal}'`; } cleaned = `${key}__${i}:${cleanVal}`; } normalizedYaml += `${cleaned}\n`; i++; } const obj = yaml.parse(normalizedYaml); return camelizeKeys(obj); } /** * normalize keys to known types * @param obj * @returns */ function normalizeKnownKeysAsAppropriateDataTypes(obj) { // Recursive function to iterate over all keys and child keys function recurse(obj) { for (const key in obj) { if (Array.isArray(obj[key])) { // If the value is an array, recursively call the function for each element obj[key] = obj[key].map((item) => { if (typeof item === "object") { return recurse(item); } return item; }); } else if (typeof obj[key] === "object" && obj[key] !== null) { // If the value is an object, recursively call the function obj[key] = recurse(obj[key]); } if (repeatableValues.includes(key) && !Array.isArray(obj[key])) { obj[key] = 
[obj[key]]; } } return obj; } const repeatableValues = [ "filedBy", "serialCompany", "subjectCompany", "reportingOwner", "issuer", "filedFor", "filer", "references429", "itemInformation", "groupMembers", "absAssetClass", "absSubAssetClass", "formerCompany", ]; return recurse(obj); } /** * Removes numbered keys from the provided object recursively. * @param {object} obj - The object from which numbered keys should be removed. * @returns {object} - The modified object with numbered keys removed. */ function recursivelyFlattenDuplicateKeysWithNumbers(obj) { let newKey = ""; // Variable to store the modified key without the numbered suffix for (const key in obj) { if (key.includes("__")) { // Check if the key contains '__' newKey = key.replace(/__\d+$/, ""); // Remove the numbered suffix from the key if (obj[newKey] !== undefined) { // Check if the object has the new key if (!Array.isArray(obj[newKey])) { // Check if the value of the new key is not an array obj[newKey] = [obj[newKey]]; // Convert the value to an array } obj[newKey].push(obj[key]); // Push the value of the original key to the array delete obj[key]; // Delete the original key-value pair } else { obj[newKey] = obj[key]; // Set the value of the new key to the value of the original key delete obj[key]; // Delete the original key-value pair } } if (typeof obj[newKey] === "object" && obj[newKey] !== null) { // Check if the value of the new key is an object recursivelyFlattenDuplicateKeysWithNumbers(obj[newKey]); // Recursively traverse nested objects } if (Array.isArray(obj[newKey])) { // Check if the value of the new key is an array obj[newKey] = obj[newKey].map(recursivelyFlattenDuplicateKeysWithNumbers); // Recursively traverse nested arrays } } return obj; // Return the modified object } /** * Parses the SEC header string to extract relevant information. * @param {string} text - The SEC header string to parse. * @returns {Promise<object>} - A promise that resolves to the parsed SEC header object. 
* @private */ function parseYamlLikeString(text) { let obj = badYamlToObj(text); // first pass, just create an valid object obj = recursivelyFlattenDuplicateKeysWithNumbers(obj); // second pass, clean up the object. obj = normalizeKnownKeysAsAppropriateDataTypes(obj); return obj; } exports.parseYamlLikeString = parseYamlLikeString; function badXmlToObj(xmlString) { // Split the XML string into lines const lines = xmlString.split("\n"); // Stack to keep track of open tags const stack = []; // Corrected XML string let correctedXML = ""; for (const line of lines) { // Remove leading and trailing whitespace let currentLine = line.trim(); if (currentLine.startsWith("</")) { // Close tag currentLine = `${" ".repeat(stack.length * 2)}${currentLine}\n`; stack.pop(); } else if (currentLine.startsWith("<")) { // Open tag const tagName = currentLine.split("<")[1].split(">")[0]; const tagValue = currentLine.split(">")[1].trim(); if (tagValue && !currentLine.endsWith(">")) { currentLine = `${" ".repeat(stack.length * 2)}${currentLine}</${tagName}>\n`; stack.pop(); } else { currentLine = `${" ".repeat(stack.length * 2)}${currentLine}\n`; } } else if (currentLine.includes(":")) { const [key, value] = currentLine.split(":").map((x) => x.trim()); const expectedKey = key.replace(/ /g, "-"); currentLine = `<${expectedKey}>${value}</${expectedKey}>`; } else { // Content within tag currentLine = `${" ".repeat(stack.length * 2)}${currentLine}\n`; } correctedXML += currentLine; } // Add missing closing tags for open tags with values for (const openTag of stack.reverse()) { correctedXML += `${" ".repeat(stack.length * 2)}</${openTag}>\n`; } const xmlParser = new XMLParser(); let obj = xmlParser.parse(correctedXML); obj = camelizeKeys(obj); return obj.secHeader; } exports.badXmlToObj = badXmlToObj; function camelizeKeys(obj) { if (typeof obj !== "object" || obj === null) { return obj; } if (Array.isArray(obj)) { return obj.map((item) => camelizeKeys(item)); } const newObj = {}; for (const 
key in obj) { if (Object.prototype.hasOwnProperty.call(obj, key)) { const camelCaseKey = toCamelCase(key); newObj[camelCaseKey] = camelizeKeys(obj[key]); } } return newObj; } function normalizeInlineHeaderTag(line) { const trimmed = line.trim(); const match = trimmed.match(/^<([A-Z0-9-]+)>([^<]*)$/); if (!match) return line; const [, tag, value] = match; if (tag === "SEC-HEADER" || tag === "/SEC-HEADER" || tag === "SEC-DOCUMENT") { return line; } const cleanValue = value.trim(); return `${tag}: ${cleanValue}`; } function trimDocument(file) { const fileLines = file.split("\n"); const startOfHeaderIndex = fileLines.findIndex((line) => line.trim().startsWith("<SEC-HEADER>")); const endOfHeaderIndex = fileLines.findIndex((line, idx) => idx > startOfHeaderIndex && line.trim() === "</SEC-HEADER>"); let startOfYamlContent = 0; for (let i = 0; i < fileLines.length; i++) { if (fileLines[i].trim().includes("ACCESSION NUMBER:")) { startOfYamlContent = i; break; } } let endOfYamlLikeContent = -1; for (let i = startOfYamlContent; i < fileLines.length; i++) { const trimmed = fileLines[i].trim(); // first tag-only line (e.g., <SERIES-...>) marks start of XML-ish area if (/^<[^>]+>$/.test(trimmed)) { endOfYamlLikeContent = i; break; } } if (endOfYamlLikeContent === -1) { endOfYamlLikeContent = endOfHeaderIndex !== -1 ? endOfHeaderIndex : fileLines.length; } const yamlLikeStructure = fileLines .slice(startOfYamlContent, endOfYamlLikeContent) .map(normalizeInlineHeaderTag) .join("\n"); const xmlEnd = endOfHeaderIndex !== -1 ? 
endOfHeaderIndex : fileLines.length; const xmlLikeStructure = `<SEC-HEADER> ${fileLines.slice(endOfYamlLikeContent, xmlEnd + 1).join("\n")}`; return { yamlLikeStructure, xmlLikeStructure }; } exports.trimDocument = trimDocument; function callTheSEC(url, userAgent) { return tslib_1.__awaiter(this, void 0, void 0, function* () { const fileResponse = yield fetch(url, { headers: { "user-agent": userAgent }, }); if (fileResponse.status === 403) { throw new Error("FORBIDDEN. SEC rejected the request. Make sure you are using a user agent that identifies yourself."); } return fileResponse.text(); }); } function getObjectFromUrl(url_1) { return tslib_1.__awaiter(this, arguments, void 0, function* (url, userAgent = "") { const doc = yield callTheSEC(url, userAgent); return getObjectFromString(doc); }); } exports.getObjectFromUrl = getObjectFromUrl; function getObjectFromString(text) { return tslib_1.__awaiter(this, void 0, void 0, function* () { const { yamlLikeStructure, xmlLikeStructure } = trimDocument(text); const xmlObj = badXmlToObj(xmlLikeStructure); const ymlObj = parseYamlLikeString(yamlLikeStructure); const acceptanceDatetimeMatch = text.match(/<ACCEPTANCE-DATETIME>(\d+)/); const acceptanceDatetime = acceptanceDatetimeMatch ? acceptanceDatetimeMatch[1] : ""; return Object.assign(Object.assign({ acceptanceDatetime }, ymlObj), xmlObj); }); } exports.getObjectFromString = getObjectFromString;