UNPKG

sec-edgar-toolkit

Version:

Open source toolkit to facilitate working with the SEC EDGAR database

276 lines 12.5 kB
"use strict"; /** * SEC Filing Item Extractor * * Extracts individual items from SEC filings (10-K, 10-Q, 8-K, etc.) * based on standard item numbering and structure. */ Object.defineProperty(exports, "__esModule", { value: true }); exports.ItemExtractor = exports.FormType = void 0; const errors_1 = require("../exceptions/errors"); var FormType; (function (FormType) { FormType["FORM_10K"] = "10-K"; FormType["FORM_10Q"] = "10-Q"; FormType["FORM_8K"] = "8-K"; FormType["FORM_20F"] = "20-F"; FormType["FORM_40F"] = "40-F"; })(FormType || (exports.FormType = FormType = {})); class ItemExtractor { constructor() { this.formItems = new Map([ [FormType.FORM_10K, ItemExtractor.FORM_10K_ITEMS], [FormType.FORM_10Q, ItemExtractor.FORM_10Q_ITEMS], [FormType.FORM_8K, ItemExtractor.FORM_8K_ITEMS], ]); } /** * Extract all items from a filing * @param content The filing content (HTML or text) * @param formType The type of form (e.g., "10-K", "10-Q", "8-K") * @returns Dictionary mapping item numbers to their content */ extractItems(content, formType) { // Convert string form type to enum const parsedFormType = typeof formType === 'string' ? this.parseFormType(formType) : formType; if (!this.formItems.has(parsedFormType)) { throw new errors_1.InvalidFormTypeError(formType.toString(), ['10-K', '10-Q', '8-K', '20-F', '40-F']); } // Clean content const cleanContent = this.cleanContent(content); // Extract table of contents if available const tocItems = this.extractTableOfContents(cleanContent); // Extract items const items = this.extractItemsFromContent(cleanContent, parsedFormType, tocItems); // Post-process and validate return this.postProcessItems(items, parsedFormType); } /** * Extract specific items from a filing * @param content The filing content * @param formType The type of form * @param itemNumbers List of item numbers to extract * @returns Dictionary with only the requested items */ extractSpecificItems(content, formType, itemNumbers) { const allItems = this.extractItems(content, formType); const result = {}; for (const itemNum of itemNumbers) { if (itemNum in allItems) { result[itemNum] = allItems[itemNum]; } } return result; } /** * Get item definitions for a specific form type * @param formType The form type * @returns List of item definitions */ getItemDefinitions(formType) { const parsedFormType = typeof formType === 'string' ? this.parseFormType(formType) : formType; return this.formItems.get(parsedFormType) || []; } parseFormType(formTypeStr) { const upperType = formTypeStr.toUpperCase(); if (upperType.includes("10-K") || upperType.includes("10K")) { return FormType.FORM_10K; } else if (upperType.includes("10-Q") || upperType.includes("10Q")) { return FormType.FORM_10Q; } else if (upperType.includes("8-K") || upperType.includes("8K")) { return FormType.FORM_8K; } else if (upperType.includes("20-F") || upperType.includes("20F")) { return FormType.FORM_20F; } else if (upperType.includes("40-F") || upperType.includes("40F")) { return FormType.FORM_40F; } else { throw new errors_1.InvalidFormTypeError(formTypeStr, ['10-K', '10-Q', '8-K', '20-F', '40-F']); } } cleanContent(content) { // Remove HTML tags but preserve structure let cleaned = content.replace(/<[^>]+>/g, ' '); // Normalize whitespace cleaned = cleaned.replace(/\s+/g, ' '); // Preserve line breaks for item boundaries cleaned = cleaned.replace(/(Item\s+\d+[A-Z]?\.)/gi, '\n\n$1'); return cleaned.trim(); } extractTableOfContents(content) { const tocItems = []; // Look for table of contents section const tocMatch = content.match(/TABLE\s+OF\s+CONTENTS(.*?)(?:Item\s+1\.|PART\s+I\s)/is); if (tocMatch) { const tocContent = tocMatch[1]; // Extract item references from TOC const itemPattern = /Item\s+(\d+[A-Z]?)\.\s*([^\n\r.]+)/gi; let match; while ((match = itemPattern.exec(tocContent)) !== null) { const itemNum = match[1].toUpperCase(); tocItems.push([itemNum, match.index]); } } return tocItems; } extractItemsFromContent(content, formType, tocItems) { const items = new Map(); const itemDefinitions = this.formItems.get(formType) || []; for (const itemDef of itemDefinitions) { // Build patterns for each item const patterns = [ `Item\\s+${this.escapeRegex(itemDef.number)}\\.\\s*${this.escapeRegex(itemDef.title)}`, `Item\\s+${this.escapeRegex(itemDef.number)}\\.\\s*(?=[A-Z])`, `Item\\s+${this.escapeRegex(itemDef.number)}(?:\\.|:|\\s)`, ]; // Add alias patterns if (itemDef.aliases) { for (const alias of itemDef.aliases) { patterns.push(this.escapeRegex(alias)); } } // Try each pattern for (const pattern of patterns) { const regex = new RegExp(pattern, 'gi'); const matches = Array.from(content.matchAll(regex)); if (matches.length > 0) { // Use the first match after TOC (if TOC exists) let match = matches[0]; if (matches.length > 1 && tocItems.length > 0) { // Skip matches that appear in TOC for (const m of matches.slice(1)) { if (!this.isInToc(m.index, tocItems)) { match = m; break; } } } const startPos = match.index; // Find the end position (start of next item) const endPos = this.findItemEnd(content, startPos, itemDefinitions); // Extract content const itemContent = content.substring(startPos, endPos).trim(); items.set(itemDef.number, { itemNumber: itemDef.number, title: itemDef.title, content: itemContent, startPosition: startPos, endPosition: endPos, }); break; } } } return items; } isInToc(position, tocItems) { if (tocItems.length === 0) return false; // Rough heuristic: if position is before the last TOC item + buffer const lastTocPos = Math.max(...tocItems.map(item => item[1])); return position < lastTocPos + 500; } findItemEnd(content, startPos, _itemDefinitions) { // Look for the next item const nextItemPattern = /Item\s+\d+[A-Z]?[.:]\s*[A-Z]/i; const remainingContent = content.substring(startPos + 10); const match = remainingContent.match(nextItemPattern); if (match && match.index !== undefined) { return startPos + 10 + match.index; } else { // No next item found, return end of content return content.length; } } postProcessItems(items, _formType) { const processed = {}; for (const [itemNum, extractedItem] of items) { // Clean up the content let content = extractedItem.content; // Remove excessive whitespace content = content.replace(/\n\s*\n\s*\n/g, '\n\n'); // Ensure we have some content if (content.trim().length > 50) { processed[itemNum] = content; } else { // Try to handle empty or placeholder items if (content.toLowerCase().includes("none") || content.toLowerCase().includes("not applicable")) { processed[itemNum] = content; } else { processed[itemNum] = ""; } } } return processed; } escapeRegex(str) { return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); } } exports.ItemExtractor = ItemExtractor; // 10-K Item definitions ItemExtractor.FORM_10K_ITEMS = [ { number: "1", title: "Business" }, { number: "1A", title: "Risk Factors" }, { number: "1B", title: "Unresolved Staff Comments" }, { number: "1C", title: "Cybersecurity", required: false }, { number: "2", title: "Properties" }, { number: "3", title: "Legal Proceedings" }, { number: "4", title: "Mine Safety Disclosures", required: false }, { number: "5", title: "Market for Registrant's Common Equity" }, { number: "6", title: "Reserved", required: false }, { number: "7", title: "Management's Discussion and Analysis", aliases: ["MD&A"] }, { number: "7A", title: "Quantitative and Qualitative Disclosures About Market Risk" }, { number: "8", title: "Financial Statements and Supplementary Data" }, { number: "9", title: "Changes in and Disagreements with Accountants" }, { number: "9A", title: "Controls and Procedures" }, { number: "9B", title: "Other Information" }, { number: "9C", title: "Disclosure Regarding Foreign Jurisdictions", required: false }, { number: "10", title: "Directors, Executive Officers and Corporate Governance" }, { number: "11", title: "Executive Compensation" }, { number: "12", title: "Security Ownership" }, { number: "13", title: "Certain Relationships and Related Transactions" }, { number: "14", title: "Principal Accountant Fees and Services" }, { number: "15", title: "Exhibits and Financial Statement Schedules" }, ]; // 10-Q Item definitions ItemExtractor.FORM_10Q_ITEMS = [ { number: "1", title: "Financial Statements" }, { number: "2", title: "Management's Discussion and Analysis", aliases: ["MD&A"] }, { number: "3", title: "Quantitative and Qualitative Disclosures About Market Risk" }, { number: "4", title: "Controls and Procedures" }, { number: "1", title: "Legal Proceedings", aliases: ["Part II, Item 1"] }, { number: "1A", title: "Risk Factors", aliases: ["Part II, Item 1A"] }, { number: "2", title: "Unregistered Sales of Equity Securities", aliases: ["Part II, Item 2"] }, { number: "3", title: "Defaults Upon Senior Securities", aliases: ["Part II, Item 3"] }, { number: "4", title: "Mine Safety Disclosures", aliases: ["Part II, Item 4"], required: false }, { number: "5", title: "Other Information", aliases: ["Part II, Item 5"] }, { number: "6", title: "Exhibits", aliases: ["Part II, Item 6"] }, ]; // 8-K Item definitions ItemExtractor.FORM_8K_ITEMS = [ { number: "1.01", title: "Entry into a Material Definitive Agreement" }, { number: "1.02", title: "Termination of a Material Definitive Agreement" }, { number: "2.01", title: "Completion of Acquisition or Disposition of Assets" }, { number: "2.02", title: "Results of Operations and Financial Condition" }, { number: "2.03", title: "Creation of a Direct Financial Obligation" }, { number: "3.01", title: "Notice of Delisting or Failure to Satisfy" }, { number: "3.02", title: "Unregistered Sales of Equity Securities" }, { number: "4.01", title: "Changes in Registrant's Certifying Accountant" }, { number: "4.02", title: "Non-Reliance on Previously Issued Financial Statements" }, { number: "5.01", title: "Changes in Control of Registrant" }, { number: "5.02", title: "Departure of Directors or Certain Officers" }, { number: "5.03", title: "Amendments to Articles of Incorporation or Bylaws" }, { number: "7.01", title: "Regulation FD Disclosure" }, { number: "8.01", title: "Other Events" }, { number: "9.01", title: "Financial Statements and Exhibits" }, ]; //# sourceMappingURL=item-extractor.js.map