UNPKG

@pratiksha90/financial-data-extractors

Version:

Utilities for extracting financial data from various economic calendar websites

588 lines (587 loc) 24.5 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.TableHandler = void 0;

// Upper bounds applied to span attributes so that a hostile or broken page
// cannot make the grid builder allocate an unbounded number of cells.
// The values mirror the limits the HTML Standard places on these attributes.
const MAX_COLSPAN = 1000;
const MAX_ROWSPAN = 65534;

/**
 * The TableHandler class processes HTML tables and extracts structured data.
 *
 * A "cell" in the structures below is an object of the shape
 * `{ value, rawText, rowspan?, colspan? }`; a "row" is an object mapping a
 * header name to a cell, optionally carrying a `_rowMetadata` entry.
 */
class TableHandler {
    /**
     * Extract all tables from a document.
     *
     * Real `<table>` elements are processed first, followed by grid-like
     * structures found by {@link TableHandler#extractPseudoTables}.
     *
     * @param document The document to extract tables from
     * @returns Array of table data objects (`{ headers, rows, metadata }`)
     */
    extractAllTables(document) {
        // Check if this might be an economic calendar
        const isEconomicCalendar = this.checkIfEconomicCalendar(document);
        // Get all tables on the page
        const tables = document.querySelectorAll('table');
        const results = [];
        tables.forEach((table, index) => {
            try {
                const tableData = this.extractTableData(table, `table-${index}`, document.URL);
                // Add table type metadata for economic calendars
                if (isEconomicCalendar) {
                    tableData.metadata.tableType = 'economic_calendar';
                }
                results.push(tableData);
            }
            catch (error) {
                // Deliberate best effort: a table that cannot be processed is
                // skipped so the remaining tables are still returned.
            }
        });
        // Also look for grid-like structures that might be tables
        const pseudoTables = this.extractPseudoTables(document);
        return [...results, ...pseudoTables];
    }

    /**
     * Check if the document appears to be an economic calendar.
     *
     * The document qualifies when its URL contains "calendar" together with
     * one of "economic", "forex", "trading" or "finance", or when at least
     * three of the known calendar terms occur in its column headers.
     *
     * @param document The document to inspect
     * @returns true when the document looks like an economic calendar
     */
    checkIfEconomicCalendar(document) {
        // A missing URL is treated as an empty one instead of throwing
        const url = String(document.URL || '').toLowerCase();
        const urlTopics = ['economic', 'forex', 'trading', 'finance'];
        if (url.includes('calendar') && urlTopics.some(topic => url.includes(topic))) {
            return true;
        }
        // Check for common economic calendar headers
        const headers = document.querySelectorAll('th, [role="columnheader"]');
        const headerTexts = Array.from(headers).map(h => (h.textContent || '').trim().toLowerCase());
        const economicTerms = [
            'date', 'time', 'event', 'actual', 'forecast', 'previous',
            'impact', 'country', 'currency', 'consensus', 'period'
        ];
        // Count how many economic terms are found in headers
        const economicHeaderCount = economicTerms
            .filter(term => headerTexts.some(headerText => headerText.includes(term)))
            .length;
        // If we have 3+ economic terms in headers, likely an economic calendar
        return economicHeaderCount >= 3;
    }

    /**
     * Extract data from a specific table element.
     *
     * @param table The table element
     * @param tableIdentifier Identifier supplied by the caller (currently not
     *   used by this method; kept for interface compatibility)
     * @param url URL recorded in the resulting metadata
     * @returns `{ headers, rows, metadata }`
     */
    extractTableData(table, tableIdentifier, url) {
        const headerRows = table.querySelectorAll('thead tr');
        let headers = [];
        if (headerRows.length > 0) {
            // Use the last row in thead as the main headers
            const headerCells = headerRows[headerRows.length - 1].querySelectorAll('th, td');
            headers = Array.from(headerCells).map(cell => this.cleanText(cell.textContent));
        }
        else {
            // If no thead, try to use the first row as headers
            // NOTE(review): when that first row consists only of <td> cells,
            // createTableGrid does not treat it as a header row, so it also
            // appears as the first data row - confirm whether that is intended.
            const firstRow = table.querySelector('tr');
            if (firstRow) {
                const headerCells = firstRow.querySelectorAll('th, td');
                headers = Array.from(headerCells).map(cell => this.cleanText(cell.textContent));
            }
        }
        // If no headers found or all headers are empty, create generic ones
        if (headers.length === 0 || headers.every(h => h === '')) {
            const maxColumns = this.getMaxColumns(table);
            headers = Array.from({ length: maxColumns }, (_, i) => `Column ${i + 1}`);
        }
        // Make headers unique so they can be used as row object keys
        headers = this.makeHeadersUnique(headers);
        // Create 2D grid to handle rowspan/colspan
        const grid = this.createTableGrid(table, headers.length);
        // Convert grid to row objects
        const rows = this.convertGridToRows(grid, headers);
        const metadata = {
            tableName: table.getAttribute('name') || table.getAttribute('aria-label') || undefined,
            tableId: table.id || undefined,
            tableClass: table.className || undefined,
            url: url,
            timestamp: new Date().toISOString(),
            rowCount: rows.length,
            columnCount: headers.length
        };
        return { headers, rows, metadata };
    }

    /**
     * Make headers unique by adding a " (n)" suffix to duplicates.
     *
     * Empty headers are named "Column". Counting is done with a Map so that
     * headers such as "constructor" or "toString" are not confused with
     * inherited object properties, and every generated name is checked against
     * the names already emitted so a suffix can never collide with a header
     * that literally reads e.g. "A (2)".
     *
     * @param headers Array of header strings
     * @returns Array of the same length containing distinct header strings
     */
    makeHeadersUnique(headers) {
        const occurrences = new Map();
        const taken = new Set();
        return headers.map(header => {
            const base = header || 'Column';
            let count = (occurrences.get(base) || 0) + 1;
            let candidate = count === 1 ? base : `${base} (${count})`;
            // Advance the counter until the generated name is unused
            while (taken.has(candidate)) {
                count++;
                candidate = `${base} (${count})`;
            }
            occurrences.set(base, count);
            taken.add(candidate);
            return candidate;
        });
    }

    /**
     * Parse a rowspan/colspan attribute value (internal helper).
     *
     * Missing, non-numeric, zero or negative values yield 1 so that the cell
     * still occupies its own grid position; values above `max` are clamped.
     *
     * @param attributeValue Raw attribute value (string or null)
     * @param max Largest span to return
     * @returns Integer in the range [1, max]
     */
    parseSpanAttribute(attributeValue, max) {
        const parsed = parseInt(attributeValue || '1', 10);
        if (!Number.isFinite(parsed) || parsed < 1) {
            return 1;
        }
        return Math.min(parsed, max);
    }

    /**
     * Create a 2D grid representation of the table to handle rowspan/colspan.
     *
     * Header rows (rows inside `<thead>` or containing a `<th>`) are skipped.
     * Grid row indices are relative to the first processed row, so skipped
     * header rows may leave unset entries in the grid.
     *
     * @param table The table element
     * @param columnCount Number of headers (currently not used by this method;
     *   kept for interface compatibility)
     * @returns A pair `[grid, rowMetadataMap]` where `grid` is an array of
     *   arrays of cells/placeholders and `rowMetadataMap` maps a grid row index
     *   to `{ dataUrl, dataId, dataCountry }` read from the row's data-* attributes
     */
    createTableGrid(table, columnCount) {
        const rows = table.querySelectorAll('tbody tr, tr');
        const isHeaderRow = Array.from(rows).map(row => {
            const parent = row.parentElement;
            const insideThead = parent != null && parent.tagName.toLowerCase() === 'thead';
            return insideThead || row.querySelector('th') !== null;
        });
        const grid = [];
        const rowMetadataMap = new Map();
        // Skip the first row if it's a header row and there is no thead
        const startRow = table.querySelector('thead') ? 0 : (isHeaderRow[0] ? 1 : 0);
        for (let rowIndex = startRow; rowIndex < rows.length; rowIndex++) {
            if (isHeaderRow[rowIndex]) {
                continue;
            }
            const row = rows[rowIndex];
            const gridRowIndex = rowIndex - startRow;
            if (!grid[gridRowIndex]) {
                grid[gridRowIndex] = [];
            }
            // Row metadata is recorded independently of grid-row creation: the
            // grid row may already exist because a cell above spans into it.
            const dataUrl = row.getAttribute('data-url');
            const dataId = row.getAttribute('data-id');
            const dataCountry = row.getAttribute('data-country');
            if (dataUrl || dataId || dataCountry) {
                rowMetadataMap.set(gridRowIndex, { dataUrl, dataId, dataCountry });
            }
            const cells = row.querySelectorAll('td, th');
            // A span cannot usefully extend past the last row of the table
            const remainingRows = rows.length - rowIndex;
            let gridColIndex = 0;
            for (let cellIndex = 0; cellIndex < cells.length; cellIndex++) {
                const cell = cells[cellIndex];
                // Skip occupied grid cells (from previous rowspan/colspan)
                while (grid[gridRowIndex][gridColIndex] !== undefined) {
                    gridColIndex++;
                }
                const rowSpan = this.parseSpanAttribute(cell.getAttribute('rowspan'), MAX_ROWSPAN);
                const colSpan = this.parseSpanAttribute(cell.getAttribute('colspan'), MAX_COLSPAN);
                const rawText = cell.textContent || '';
                const cellData = {
                    value: this.convertCellValue(rawText),
                    rawText: rawText.trim(),
                    rowspan: rowSpan > 1 ? rowSpan : undefined,
                    colspan: colSpan > 1 ? colSpan : undefined
                };
                // Fill the grid: the origin position gets the cell itself, every
                // other covered position gets a placeholder pointing back to it.
                const coveredRows = Math.min(rowSpan, remainingRows);
                for (let rs = 0; rs < coveredRows; rs++) {
                    const targetRowIndex = gridRowIndex + rs;
                    if (!grid[targetRowIndex]) {
                        grid[targetRowIndex] = [];
                    }
                    for (let cs = 0; cs < colSpan; cs++) {
                        if (rs === 0 && cs === 0) {
                            grid[targetRowIndex][gridColIndex] = cellData;
                        }
                        else {
                            grid[targetRowIndex][gridColIndex + cs] = {
                                value: null,
                                rawText: '',
                                spannedBy: { row: gridRowIndex, col: gridColIndex }
                            };
                        }
                    }
                }
                // Move to the next column position
                gridColIndex += colSpan;
            }
        }
        return [grid, rowMetadataMap];
    }

    /**
     * Convert the grid to row objects using headers.
     *
     * Rows without any non-blank text are dropped. Positions covered by a
     * rowspan/colspan receive a copy of the spanning cell's value and text.
     *
     * @param gridData The `[grid, rowMetadataMap]` pair from createTableGrid
     * @param headers Header names, one per column
     * @returns Array of row objects keyed by header
     */
    convertGridToRows(gridData, headers) {
        const [grid, rowMetadataMap] = gridData;
        const rows = [];
        for (let rowIndex = 0; rowIndex < grid.length; rowIndex++) {
            const gridRow = grid[rowIndex];
            // Skip spacer rows and rows that were never populated
            const hasContent = gridRow &&
                gridRow.some(cell => cell && cell.rawText && cell.rawText.trim() !== '');
            if (!hasContent) {
                continue;
            }
            const rowData = {};
            headers.forEach((header, colIndex) => {
                const cell = gridRow[colIndex];
                if (!cell) {
                    // Use an empty cell where the grid has nothing
                    rowData[header] = { value: null, rawText: '' };
                }
                else if (cell.spannedBy) {
                    // Placeholder: resolve to the cell that spans this position
                    const { row, col } = cell.spannedBy;
                    const originalCell = grid[row] && grid[row][col];
                    rowData[header] = originalCell
                        ? { value: originalCell.value, rawText: originalCell.rawText }
                        : { value: null, rawText: '' };
                }
                else {
                    rowData[header] = cell;
                }
            });
            // Transfer row metadata if available
            if (rowMetadataMap.has(rowIndex)) {
                rowData._rowMetadata = rowMetadataMap.get(rowIndex);
            }
            rows.push(rowData);
        }
        return rows;
    }

    /**
     * Helper method to find the maximum number of columns in any row,
     * counting each cell as many times as its (sanitized) colspan.
     *
     * @param table The table element
     * @returns The largest column count found, or 0 for a table without rows
     */
    getMaxColumns(table) {
        const rows = table.querySelectorAll('tr');
        let maxColumns = 0;
        rows.forEach(row => {
            let colCount = 0;
            row.querySelectorAll('th, td').forEach(cell => {
                colCount += this.parseSpanAttribute(cell.getAttribute('colspan'), MAX_COLSPAN);
            });
            maxColumns = Math.max(maxColumns, colCount);
        });
        return maxColumns;
    }

    /**
     * Clean and normalize text from a cell: whitespace runs are collapsed to a
     * single space and the result is trimmed.
     *
     * @param text Text to clean (may be null/undefined/empty)
     * @returns The normalized text, or '' for falsy input
     */
    cleanText(text) {
        if (!text) {
            return '';
        }
        return text.replace(/\s+/g, ' ').trim();
    }

    /**
     * Convert cell value to appropriate type.
     *
     * - empty text, "n/a", "na", "none" and "-" become null
     * - number-like text (optional sign, "$", thousands commas, optional "%")
     *   becomes a number; percentages are divided by 100
     * - "true"/"yes"/"y" and "false"/"no"/"n" (any case) become booleans
     * - anything else is returned as the trimmed string
     *
     * @param text Raw cell text
     * @returns null, number, boolean or string
     */
    convertCellValue(text) {
        const trimmedText = String(text == null ? '' : text).trim();
        // Check for empty values
        if (!trimmedText || /^(n\/a|na|none|-)$/i.test(trimmedText)) {
            return null;
        }
        // Try to extract a clean numeric value if the text appears to be a number
        if (/^[+-]?\$?\s*[\d,]+\.?\d*%?$/.test(trimmedText)) {
            // Remove currency symbols, commas, and spaces
            const cleanNumber = trimmedText.replace(/[$,\s]+/g, '');
            const isPercentage = cleanNumber.endsWith('%');
            const parsed = parseFloat(isPercentage ? cleanNumber.slice(0, -1) : cleanNumber);
            // Text such as "," passes the pattern but holds no digits; keep it
            // as a string instead of returning NaN.
            if (!Number.isNaN(parsed)) {
                return isPercentage ? parsed / 100 : parsed;
            }
        }
        // Check for boolean values
        if (/^(true|yes|y)$/i.test(trimmedText)) {
            return true;
        }
        if (/^(false|no|n)$/i.test(trimmedText)) {
            return false;
        }
        // Return as string for non-numeric values
        return trimmedText;
    }

    /**
     * Try to find and extract tables from non-standard or pseudo-table
     * structures (elements with role="table" or a class containing
     * "table"/"grid").
     *
     * @param document The document to search
     * @returns Array of table data objects with `tableType: 'pseudo_table'`
     */
    extractPseudoTables(document) {
        const results = [];
        const divTables = document.querySelectorAll('[role="table"], [class*="table"], [class*="grid"]');
        divTables.forEach((container, index) => {
            try {
                // Skip if it's a real table (those are processed separately)
                if (container.tagName === 'TABLE') {
                    return;
                }
                // Look for header row elements
                const headerRow = container.querySelector('[role="rowheader"], [class*="header"], [class*="heading"]');
                if (!headerRow) {
                    return;
                }
                const headerCells = headerRow.querySelectorAll('[role="columnheader"], [class*="cell"], [class*="column"]');
                if (headerCells.length === 0) {
                    return;
                }
                // Headers become object keys, so they are made unique exactly as
                // for real tables; otherwise duplicate or empty headers would
                // overwrite each other's cells.
                const headers = this.makeHeadersUnique(Array.from(headerCells).map(cell => this.cleanText(cell.textContent)));
                // Look for data rows
                const dataRows = container.querySelectorAll('[role="row"]:not([role="rowheader"]), [class*="row"]:not([class*="header"])');
                if (dataRows.length === 0) {
                    return;
                }
                const rows = [];
                dataRows.forEach(row => {
                    const cells = row.querySelectorAll('[role="cell"], [class*="cell"]');
                    if (cells.length === 0) {
                        return;
                    }
                    const rowData = {};
                    cells.forEach((cell, cellIndex) => {
                        // Cells beyond the known headers get a generic name
                        const header = cellIndex < headers.length
                            ? headers[cellIndex]
                            : `Column ${cellIndex + 1}`;
                        const rawText = cell.textContent || '';
                        rowData[header] = {
                            value: this.convertCellValue(rawText),
                            rawText: rawText.trim()
                        };
                    });
                    rows.push(rowData);
                });
                if (rows.length > 0) {
                    results.push({
                        headers,
                        rows,
                        metadata: {
                            tableName: 'Grid Table ' + (index + 1),
                            tableId: container.id || undefined,
                            tableClass: container.className || undefined,
                            url: document.URL,
                            timestamp: new Date().toISOString(),
                            rowCount: rows.length,
                            columnCount: headers.length,
                            tableType: 'pseudo_table'
                        }
                    });
                }
            }
            catch (error) {
                // Deliberate best effort: skip structures that can't be processed
            }
        });
        return results;
    }

    /**
     * Process tables for economic calendar format.
     *
     * Tables whose headers contain at least three of the core calendar terms
     * are converted into event lists. The date that applies to an event is the
     * most recent date seen, either in the row's own date column or in a
     * preceding date-only separator row.
     *
     * @param tables Array of table data objects
     * @returns `{ tables }` unchanged when no calendar table is found,
     *   otherwise `{ calendar }` built from the first calendar table
     */
    processEconomicCalendarData(tables) {
        const economicTerms = ['date', 'time', 'event', 'actual', 'forecast', 'previous'];
        // Find tables that look like economic calendars
        const calendarTables = tables.filter(table => {
            const headers = table.headers.map(h => h.toLowerCase());
            return economicTerms.filter(term => headers.some(h => h.includes(term))).length >= 3;
        });
        if (calendarTables.length === 0) {
            return { tables };
        }
        const processedCalendars = calendarTables.map(table => {
            // Map column headers to standard names
            const headerMap = this.mapCalendarHeaders(table.headers);
            const events = [];
            let currentDate = null;
            for (const row of table.rows) {
                // Skip empty rows
                if (Object.values(row).every(cell => !cell || !cell.rawText)) {
                    continue;
                }
                const rowDate = this.extractDateFromRow(row, headerMap);
                if (rowDate) {
                    currentDate = rowDate;
                    // Only pure separator rows are skipped. A row that carries a
                    // date next to event data must still produce an event,
                    // otherwise tables with a per-row date yield no events.
                    if (this.isDateOnlyRow(row)) {
                        continue;
                    }
                }
                const eventData = this.extractEventData(row, headerMap, currentDate);
                if (eventData) {
                    events.push(eventData);
                }
            }
            return {
                metadata: {
                    title: "Economic Calendar",
                    source: table.metadata.url,
                    extractedAt: new Date().toISOString()
                },
                events: events
            };
        });
        return { calendar: processedCalendars[0] };
    }

    /**
     * Tell whether a row is a date separator (internal helper): it has at
     * least one non-empty cell and every non-empty cell text is a date string.
     * This also covers a date cell repeated across columns by a colspan.
     *
     * @param row Row object keyed by header
     * @returns true when the row contains nothing but date text
     */
    isDateOnlyRow(row) {
        const texts = Object.values(row)
            .map(cell => (cell && cell.rawText) || '')
            .filter(text => text !== '');
        return texts.length > 0 && texts.every(text => this.isDateString(text));
    }

    /**
     * Map table headers to standard economic calendar fields.
     *
     * When several headers match the same field, the last one wins.
     *
     * @param headers Array of header strings
     * @returns Object mapping a standard field name (date, time, event,
     *   country, actual, forecast, previous, revised, impact) to the header
     */
    mapCalendarHeaders(headers) {
        const headerMap = {};
        headers.forEach(header => {
            const lowerHeader = header.toLowerCase();
            // Map date/time headers (a header matching both counts as date)
            if (lowerHeader.includes('date')) {
                headerMap['date'] = header;
            }
            else if (lowerHeader.includes('time')) {
                headerMap['time'] = header;
            }
            // Map event headers
            if (lowerHeader.includes('event') ||
                lowerHeader.includes('indicator') ||
                lowerHeader.includes('description')) {
                headerMap['event'] = header;
            }
            // Map country/currency headers
            if (lowerHeader.includes('country') || lowerHeader.includes('currency')) {
                headerMap['country'] = header;
            }
            // Map data value headers
            if (lowerHeader.includes('actual') || lowerHeader === 'act') {
                headerMap['actual'] = header;
            }
            if (lowerHeader.includes('forecast') ||
                lowerHeader.includes('consensus') ||
                lowerHeader === 'cons' ||
                lowerHeader === 'fcst') {
                headerMap['forecast'] = header;
            }
            if (lowerHeader.includes('previous') || lowerHeader === 'prev') {
                headerMap['previous'] = header;
            }
            if (lowerHeader.includes('revised')) {
                headerMap['revised'] = header;
            }
            // Map impact/importance headers
            if (lowerHeader.includes('impact') || lowerHeader.includes('importance')) {
                headerMap['impact'] = header;
            }
        });
        return headerMap;
    }

    /**
     * Extract a date from a row if present.
     *
     * The dedicated date column is checked first; otherwise a row whose only
     * non-empty cell is a date string is accepted.
     *
     * @param row Row object keyed by header
     * @param headerMap Mapping produced by mapCalendarHeaders
     * @returns The date text, or null when the row holds no date
     */
    extractDateFromRow(row, headerMap) {
        // Check for a date in a dedicated date column
        const dateCell = headerMap['date'] ? row[headerMap['date']] : undefined;
        if (dateCell && dateCell.rawText && this.isDateString(dateCell.rawText)) {
            return dateCell.rawText;
        }
        // Check for a row that contains only a date (separator row)
        const nonEmptyKeys = Object.keys(row).filter(key => row[key] && row[key].rawText);
        if (nonEmptyKeys.length === 1) {
            const cellText = row[nonEmptyKeys[0]].rawText;
            if (this.isDateString(cellText)) {
                return cellText;
            }
        }
        return null;
    }

    /**
     * Check if a string looks like a date.
     *
     * @param text Text to test
     * @returns true when the whole text matches one of the known date patterns
     */
    isDateString(text) {
        if (!text) {
            return false;
        }
        const datePatterns = [
            // DD/MM/YYYY or MM/DD/YYYY
            /^\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4}$/,
            // YYYY/MM/DD
            /^\d{4}[-\/]\d{1,2}[-\/]\d{1,2}$/,
            // Month DD, YYYY
            /^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2}(st|nd|rd|th)?(,? \d{4})?$/i,
            // Weekday Month DD, YYYY
            /^(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),? [A-Za-z]+ \d{1,2}(st|nd|rd|th)?(,? \d{4})?$/i,
            // Special day names
            /^(Today|Tomorrow|Yesterday)$/i
        ];
        return datePatterns.some(pattern => pattern.test(text));
    }

    /**
     * Extract event data from a row.
     *
     * @param row Row object keyed by header
     * @param headerMap Mapping produced by mapCalendarHeaders
     * @param currentDate Date text that applies to this row, or null
     * @returns Standardized event object (keys such as Date, Report, Time,
     *   Country, Actual, Consensus, Previous, Revised, Impact, Period), or
     *   null when the row has no event description
     */
    extractEventData(row, headerMap, currentDate) {
        // Skip rows with no event description
        const eventCell = headerMap['event'] ? row[headerMap['event']] : undefined;
        if (!eventCell || !eventCell.rawText) {
            return null;
        }
        const event = {};
        if (currentDate) {
            event['Date'] = currentDate;
        }
        // Standard field name -> key used in the event object
        const fieldMappings = [
            { source: 'event', target: 'Report' },
            { source: 'time', target: 'Time' },
            { source: 'country', target: 'Country' },
            { source: 'actual', target: 'Actual' },
            { source: 'forecast', target: 'Consensus' },
            { source: 'previous', target: 'Previous' },
            { source: 'revised', target: 'Revised' },
            { source: 'impact', target: 'Impact' }
        ];
        fieldMappings.forEach(({ source, target }) => {
            const header = headerMap[source];
            if (!header) {
                return;
            }
            const cell = row[header];
            if (cell != null && cell.value !== undefined) {
                event[target] = cell.value;
            }
        });
        // Extract period if not present but in the event text
        if (!event['Period'] && event['Report']) {
            const periodMatch = String(event['Report']).match(/\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}\b|\b(Q[1-4])[ -]\d{4}\b|\b\d{4}[ -]Q[1-4]\b|\b\d{1,2}\/\d{1,2}\/\d{2,4}\b/i);
            if (periodMatch) {
                event['Period'] = periodMatch[0];
            }
        }
        return event;
    }

    /**
     * Convert table data to a simplified format suitable for JSON export:
     * every cell object is replaced by its `value`; entries that are not cell
     * objects are copied as they are. Headers and metadata are preserved.
     *
     * @param tableData Table data object (`{ headers, rows, metadata }`)
     * @returns Table data object with plain values in its rows
     */
    simplifyTableData(tableData) {
        const simpleRows = tableData.rows.map(row => {
            const simpleRow = {};
            Object.entries(row).forEach(([key, cellData]) => {
                const isCell = cellData && typeof cellData === 'object' && 'value' in cellData;
                simpleRow[key] = isCell ? cellData.value : cellData;
            });
            return simpleRow;
        });
        return {
            headers: tableData.headers,
            rows: simpleRows,
            metadata: tableData.metadata
        };
    }

    /**
     * Convert all tables to a simplified format.
     *
     * @param tables Array of table data objects
     * @returns Array of simplified table data objects
     */
    simplifyAllTables(tables) {
        return tables.map(table => this.simplifyTableData(table));
    }
}
exports.TableHandler = TableHandler;