@pratiksha90/financial-data-extractors
Version:
Utilities for extracting financial data from various economic calendar websites
588 lines (587 loc) • 24.5 kB
JavaScript
;
// Flag the CommonJS exports object with `__esModule: true` (the marker
// transpilers conventionally emit for modules authored with ES-module syntax).
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the named export up front as `undefined`; the real value is
// assigned once the class has been defined.
exports.TableHandler = void 0;
/**
 * The TableHandler class processes HTML tables and extracts structured data.
 *
 * It works against any DOM-like `document` (anything exposing
 * `querySelectorAll`, `querySelector`, `getAttribute`, `textContent`, ...).
 *
 * Data shapes used throughout:
 *  - cell:  `{ value, rawText, rowspan?, colspan? }` where `value` is the
 *           type-converted content and `rawText` the trimmed original text.
 *  - row:   object keyed by header name whose values are cells, plus an
 *           optional `_rowMetadata` entry.
 *  - table: `{ headers: string[], rows: row[], metadata: object }`.
 */
class TableHandler {
    /**
     * Extract all tables from a document.
     *
     * Real `<table>` elements are processed first, followed by "pseudo
     * tables" (div/grid structures that look like tables).
     *
     * @param {Document} document The document to extract tables from
     * @returns {Array<Object>} Array of table data objects
     */
    extractAllTables(document) {
        // Decide once, for the whole page, whether it looks like an economic calendar
        const isEconomicCalendar = this.checkIfEconomicCalendar(document);
        const results = [];
        document.querySelectorAll('table').forEach((table, index) => {
            try {
                const tableData = this.extractTableData(table, `table-${index}`, document.URL);
                // Add table type metadata for economic calendars
                if (isEconomicCalendar) {
                    tableData.metadata.tableType = 'economic_calendar';
                }
                results.push(tableData);
            }
            catch (error) {
                // Deliberate best-effort: one malformed table must not prevent
                // the remaining tables on the page from being extracted.
            }
        });
        // Also look for grid-like structures that might be tables
        const pseudoTables = this.extractPseudoTables(document);
        return [...results, ...pseudoTables];
    }
    /**
     * Check if the document appears to be an economic calendar.
     *
     * A document qualifies when its URL contains "calendar" together with a
     * finance-related keyword, or when at least three well-known calendar
     * terms appear in its column headers.
     *
     * @param {Document} document The document to inspect
     * @returns {boolean} True when the page looks like an economic calendar
     */
    checkIfEconomicCalendar(document) {
        // A missing URL (e.g. a detached fragment) must not abort the check
        const url = String(document.URL || '').toLowerCase();
        const urlTopics = ['economic', 'forex', 'trading', 'finance'];
        if (url.includes('calendar') && urlTopics.some(topic => url.includes(topic))) {
            return true;
        }
        // Check for common economic calendar headers
        const headerTexts = Array.from(document.querySelectorAll('th, [role="columnheader"]'))
            .map(header => (header.textContent || '').trim().toLowerCase());
        const economicTerms = [
            'date', 'time', 'event', 'actual', 'forecast', 'previous',
            'impact', 'country', 'currency', 'consensus', 'period'
        ];
        // Count how many distinct economic terms are found in headers
        const matchedTerms = economicTerms.filter(term => headerTexts.some(headerText => headerText.includes(term)));
        // 3+ economic terms in headers: likely an economic calendar
        return matchedTerms.length >= 3;
    }
    /**
     * Extract data from a specific table element.
     *
     * Headers come from the last row of `<thead>` when present, otherwise
     * from the first `<tr>`; when no usable header text exists, generic
     * "Column N" names are generated.
     *
     * @param {Element} table The table element
     * @param {string} tableIdentifier Caller-supplied identifier (currently unused)
     * @param {string} url URL recorded in the table metadata
     * @returns {{headers: string[], rows: Object[], metadata: Object}} Table data
     */
    extractTableData(table, tableIdentifier, url) {
        let headers = [];
        const headerRows = table.querySelectorAll('thead tr');
        // Prefer the last thead row; fall back to the table's first row
        const headerSource = headerRows.length > 0
            ? headerRows[headerRows.length - 1]
            : table.querySelector('tr');
        if (headerSource) {
            headers = Array.from(headerSource.querySelectorAll('th, td'))
                .map(cell => this.cleanText(cell.textContent));
        }
        // If no headers found or all headers are empty, create generic ones
        if (headers.length === 0 || headers.every(h => h === '')) {
            const maxColumns = this.getMaxColumns(table);
            headers = Array.from({ length: maxColumns }, (_, i) => `Column ${i + 1}`);
        }
        // Headers become object keys, so they must be unique
        headers = this.makeHeadersUnique(headers);
        // Create 2D grid to handle rowspan/colspan, then convert it to row objects
        const grid = this.createTableGrid(table, headers.length);
        const rows = this.convertGridToRows(grid, headers);
        const metadata = {
            tableName: table.getAttribute('name') || table.getAttribute('aria-label') || undefined,
            tableId: table.id || undefined,
            tableClass: table.className || undefined,
            url: url,
            timestamp: new Date().toISOString(),
            rowCount: rows.length,
            columnCount: headers.length
        };
        return { headers, rows, metadata };
    }
    /**
     * Make headers unique by adding a " (n)" suffix to duplicates.
     *
     * Empty headers are named "Column". The result is guaranteed to contain
     * no duplicates, even when an input header already looks like a
     * generated name (e.g. ["A", "A", "A (2)"]).
     *
     * @param {string[]} headers Raw header names
     * @returns {string[]} Header names with duplicates disambiguated
     */
    makeHeadersUnique(headers) {
        // Set/Map instead of a plain object so that headers such as
        // "constructor" or "toString" do not collide with Object.prototype.
        const usedNames = new Set();
        const nextSuffix = new Map();
        return headers.map(header => {
            const baseName = header || 'Column';
            let candidate = baseName;
            if (usedNames.has(candidate)) {
                let suffix = nextSuffix.get(baseName) || 2;
                do {
                    candidate = `${baseName} (${suffix})`;
                    suffix++;
                } while (usedNames.has(candidate));
                nextSuffix.set(baseName, suffix);
            }
            usedNames.add(candidate);
            return candidate;
        });
    }
    /**
     * Read a rowspan/colspan attribute as a safe positive integer.
     *
     * Missing, non-numeric, zero or negative values yield 1 so that a
     * malformed attribute can never drop a cell or corrupt column indices.
     * (`rowspan="0"` is therefore approximated as a span of 1.)
     *
     * @param {Element} cell The cell element
     * @param {string} attributeName Either "rowspan" or "colspan"
     * @param {number} [maxSpan=Infinity] Upper bound applied to the result
     * @returns {number} Integer span in the range [1, maxSpan]
     */
    parseSpanAttribute(cell, attributeName, maxSpan = Infinity) {
        const parsed = parseInt(cell.getAttribute(attributeName) || '1', 10);
        if (!Number.isFinite(parsed) || parsed < 1) {
            return 1;
        }
        return Math.min(parsed, Math.max(1, maxSpan));
    }
    /**
     * Create a 2D grid representation of the table to handle rowspan/colspan.
     *
     * Header rows (rows inside `<thead>` or containing a `<th>`) are skipped,
     * which can leave holes in the returned grid array. Positions covered by
     * a span hold placeholders `{ value: null, rawText: '', spannedBy }`
     * pointing at the originating cell.
     *
     * @param {Element} table The table element
     * @param {number} columnCount Number of header columns (currently unused)
     * @returns {[Array<Array<Object>>, Map<number, Object>]} The grid and a
     *   map from grid row index to `{ dataUrl, dataId, dataCountry }`
     */
    createTableGrid(table, columnCount) {
        // HTML caps colspan at 1000; mirror that to bound placeholder creation
        const MAX_COLSPAN = 1000;
        const rows = Array.from(table.querySelectorAll('tr'));
        const isHeaderRow = rows.map(row => {
            const parent = row.parentElement;
            const inThead = Boolean(parent) && parent.tagName.toLowerCase() === 'thead';
            return inThead || row.querySelector('th') !== null;
        });
        const grid = [];
        const rowMetadataMap = new Map();
        // Without a thead, a leading header row shifts grid indices up by one
        const startRow = table.querySelector('thead') ? 0 : isHeaderRow[0] ? 1 : 0;
        for (let rowIndex = startRow; rowIndex < rows.length; rowIndex++) {
            // Skip header rows
            if (isHeaderRow[rowIndex]) {
                continue;
            }
            const row = rows[rowIndex];
            const gridRowIndex = rowIndex - startRow;
            // The row may already exist if a rowspan from above reached it
            if (!grid[gridRowIndex]) {
                grid[gridRowIndex] = [];
            }
            const gridRow = grid[gridRowIndex];
            // Row metadata is read for every data row, including rows that
            // were pre-created by a rowspan from a previous row.
            const dataUrl = row.getAttribute('data-url');
            const dataId = row.getAttribute('data-id');
            const dataCountry = row.getAttribute('data-country');
            if (dataUrl || dataId || dataCountry) {
                rowMetadataMap.set(gridRowIndex, { dataUrl, dataId, dataCountry });
            }
            // A cell cannot span past the last row of the table
            const remainingRows = rows.length - rowIndex;
            const cells = row.querySelectorAll('td, th');
            let gridColIndex = 0;
            for (let cellIndex = 0; cellIndex < cells.length; cellIndex++) {
                const cell = cells[cellIndex];
                // Skip occupied grid cells (from previous rowspan/colspan)
                while (gridRow[gridColIndex] !== undefined) {
                    gridColIndex++;
                }
                const rowSpan = this.parseSpanAttribute(cell, 'rowspan', remainingRows);
                const colSpan = this.parseSpanAttribute(cell, 'colspan', MAX_COLSPAN);
                const rawText = cell.textContent || '';
                const cellData = {
                    value: this.convertCellValue(rawText),
                    rawText: rawText.trim(),
                    rowspan: rowSpan > 1 ? rowSpan : undefined,
                    colspan: colSpan > 1 ? colSpan : undefined
                };
                // Fill every grid position covered by this cell
                for (let rs = 0; rs < rowSpan; rs++) {
                    const targetRowIndex = gridRowIndex + rs;
                    // Create new rows if needed for rowspan
                    if (!grid[targetRowIndex]) {
                        grid[targetRowIndex] = [];
                    }
                    for (let cs = 0; cs < colSpan; cs++) {
                        const isOrigin = rs === 0 && cs === 0;
                        grid[targetRowIndex][gridColIndex + cs] = isOrigin
                            ? cellData
                            : {
                                value: null,
                                rawText: '',
                                spannedBy: { row: gridRowIndex, col: gridColIndex }
                            };
                    }
                }
                // Move to the next column position
                gridColIndex += colSpan;
            }
        }
        return [grid, rowMetadataMap];
    }
    /**
     * Convert the grid to row objects using headers.
     *
     * Rows without any text content (spacers, skipped header rows, rows made
     * only of span placeholders) are omitted.
     *
     * @param {[Array<Array<Object>>, Map<number, Object>]} gridData Result of createTableGrid
     * @param {string[]} headers Unique header names, one per column
     * @returns {Object[]} Row objects keyed by header name
     */
    convertGridToRows(gridData, headers) {
        const [grid, rowMetadataMap] = gridData;
        const rows = [];
        for (let rowIndex = 0; rowIndex < grid.length; rowIndex++) {
            const gridRow = grid[rowIndex];
            // Check if row is a spacer or has no content
            const hasContent = gridRow && gridRow.some((cell) => cell && cell.rawText && cell.rawText.trim() !== '');
            if (!hasContent) {
                continue;
            }
            const rowData = {};
            headers.forEach((header, colIndex) => {
                const cell = gridRow[colIndex];
                if (!cell) {
                    // No cell at this position: use an empty cell
                    rowData[header] = { value: null, rawText: '' };
                }
                else if (cell.spannedBy) {
                    // Span placeholder: repeat the content of the originating cell
                    const { row, col } = cell.spannedBy;
                    const originalCell = grid[row][col];
                    rowData[header] = {
                        value: originalCell.value,
                        rawText: originalCell.rawText
                    };
                }
                else {
                    rowData[header] = cell;
                }
            });
            // Transfer row metadata if available
            if (rowMetadataMap.has(rowIndex)) {
                rowData._rowMetadata = rowMetadataMap.get(rowIndex);
            }
            rows.push(rowData);
        }
        return rows;
    }
    /**
     * Helper method to find the maximum number of columns in any row,
     * counting each cell's colspan.
     *
     * @param {Element} table The table element
     * @returns {number} Largest column count found (0 for a table without rows)
     */
    getMaxColumns(table) {
        const MAX_COLSPAN = 1000;
        let maxColumns = 0;
        Array.from(table.querySelectorAll('tr')).forEach(row => {
            const columnCount = Array.from(row.querySelectorAll('th, td'))
                .reduce((total, cell) => total + this.parseSpanAttribute(cell, 'colspan', MAX_COLSPAN), 0);
            maxColumns = Math.max(maxColumns, columnCount);
        });
        return maxColumns;
    }
    /**
     * Clean and normalize text from a cell: collapse whitespace runs into a
     * single space and trim the ends.
     *
     * @param {?string} text Raw text content
     * @returns {string} Normalized text ('' for null/undefined/empty input)
     */
    cleanText(text) {
        if (!text) {
            return '';
        }
        return text.replace(/\s+/g, ' ').trim();
    }
    /**
     * Convert cell value to appropriate type.
     *
     *  - empty text and the markers "n/a", "na", "none", "-" become `null`
     *  - numeric text (optional sign, "$", thousands commas) becomes a number
     *  - a trailing "%" divides the number by 100
     *  - "true"/"yes"/"y" and "false"/"no"/"n" become booleans
     *  - anything else is returned as the trimmed string
     *
     * @param {?string} text Raw cell text
     * @returns {null|number|boolean|string} The converted value (never NaN)
     */
    convertCellValue(text) {
        const trimmedText = String(text == null ? '' : text).trim();
        // Check for empty values
        if (!trimmedText || /^(n\/a|na|none|-)$/i.test(trimmedText)) {
            return null;
        }
        // Try to extract a clean numeric value if the text appears to be a number
        if (/^[+-]?\$?\s*[\d,]+\.?\d*%?$/.test(trimmedText)) {
            const isPercentage = trimmedText.endsWith('%');
            // Remove currency symbols, commas, spaces and the percent sign
            const parsed = parseFloat(trimmedText.replace(/[$,\s%]+/g, ''));
            // The pattern also accepts digit-less text such as "," - only
            // return a number when one was actually parsed.
            if (!Number.isNaN(parsed)) {
                return isPercentage ? parsed / 100 : parsed;
            }
        }
        // Check for boolean values
        if (/^(true|yes|y)$/i.test(trimmedText)) {
            return true;
        }
        if (/^(false|no|n)$/i.test(trimmedText)) {
            return false;
        }
        // Return as string for non-numeric values
        return trimmedText;
    }
    /**
     * Try to find and extract tables from non-standard or pseudo-table
     * structures (ARIA tables and elements whose class mentions "table" or
     * "grid").
     *
     * @param {Document} document The document to search
     * @returns {Array<Object>} Table data objects with `tableType: 'pseudo_table'`
     */
    extractPseudoTables(document) {
        const results = [];
        const containers = document.querySelectorAll('[role="table"], [class*="table"], [class*="grid"]');
        containers.forEach((container, index) => {
            try {
                // Skip real tables (already processed); tagName casing
                // depends on the document type, so compare case-insensitively
                if (String(container.tagName).toLowerCase() === 'table') {
                    return;
                }
                // Look for header row elements
                const headerRow = container.querySelector('[role="rowheader"], [class*="header"], [class*="heading"]');
                if (!headerRow) {
                    return;
                }
                const headerCells = headerRow.querySelectorAll('[role="columnheader"], [class*="cell"], [class*="column"]');
                if (headerCells.length === 0) {
                    return;
                }
                // Headers are used as object keys: de-duplicate them so that
                // repeated or empty headers cannot overwrite each other's data
                const headers = this.makeHeadersUnique(Array.from(headerCells).map(cell => this.cleanText(cell.textContent)));
                // Look for data rows
                const dataRows = container.querySelectorAll('[role="row"]:not([role="rowheader"]), [class*="row"]:not([class*="header"])');
                if (dataRows.length === 0) {
                    return;
                }
                const rows = [];
                dataRows.forEach(row => {
                    const cells = row.querySelectorAll('[role="cell"], [class*="cell"]');
                    if (cells.length === 0) {
                        return;
                    }
                    const rowData = {};
                    cells.forEach((cell, cellIndex) => {
                        // Cells beyond the known headers get a generic name
                        const header = cellIndex < headers.length ? headers[cellIndex] : `Column ${cellIndex + 1}`;
                        const rawText = cell.textContent || '';
                        rowData[header] = {
                            value: this.convertCellValue(rawText),
                            rawText: rawText.trim()
                        };
                    });
                    rows.push(rowData);
                });
                if (rows.length > 0) {
                    results.push({
                        headers,
                        rows,
                        metadata: {
                            tableName: `Grid Table ${index + 1}`,
                            tableId: container.id || undefined,
                            tableClass: container.className || undefined,
                            url: document.URL,
                            timestamp: new Date().toISOString(),
                            rowCount: rows.length,
                            columnCount: headers.length,
                            tableType: 'pseudo_table'
                        }
                    });
                }
            }
            catch (error) {
                // Deliberate best-effort: skip containers that can't be processed
            }
        });
        return results;
    }
    /**
     * Process tables for economic calendar format.
     *
     * The first table whose headers contain at least three of the terms
     * date/time/event/actual/forecast/previous is converted into a list of
     * standardized events. A date found in a row becomes the current date
     * for that row and the rows after it; rows that carry only a date
     * (separator rows) produce no event.
     *
     * @param {Array<Object>} tables Table data objects
     * @returns {{tables: Array<Object>}|{calendar: Object}} `{ tables }`
     *   unchanged when no calendar-like table exists, otherwise
     *   `{ calendar: { metadata, events } }` for the first such table
     */
    processEconomicCalendarData(tables) {
        const economicTerms = ['date', 'time', 'event', 'actual', 'forecast', 'previous'];
        // Only the first calendar-like table is reported, so only it is processed
        const calendarTable = tables.find(table => {
            const headers = table.headers.map(h => h.toLowerCase());
            return economicTerms.filter(term => headers.some(h => h.includes(term))).length >= 3;
        });
        if (!calendarTable) {
            return { tables };
        }
        // Map column headers to standard names
        const headerMap = this.mapCalendarHeaders(calendarTable.headers);
        const events = [];
        let currentDate = null;
        for (const row of calendarTable.rows) {
            // Skip empty rows
            if (Object.values(row).every(cell => !cell || !cell.rawText)) {
                continue;
            }
            const rowDate = this.extractDateFromRow(row, headerMap);
            if (rowDate) {
                currentDate = rowDate;
                // A dated row is only a separator when it has no event text,
                // or when a column-spanning date cell was copied into the
                // event column. Rows with a real event must still be
                // extracted, otherwise tables with a per-row date column
                // would yield no events at all.
                const eventCell = headerMap['event'] ? row[headerMap['event']] : undefined;
                const eventText = eventCell ? eventCell.rawText : '';
                if (!eventText || this.isDateString(eventText)) {
                    continue;
                }
            }
            const eventData = this.extractEventData(row, headerMap, currentDate);
            if (eventData) {
                events.push(eventData);
            }
        }
        return {
            calendar: {
                metadata: {
                    title: "Economic Calendar",
                    source: calendarTable.metadata.url,
                    extractedAt: new Date().toISOString()
                },
                events: events
            }
        };
    }
    /**
     * Map table headers to standard economic calendar fields.
     *
     * Matching is by case-insensitive substring; when several headers match
     * the same field, the last one wins.
     *
     * @param {string[]} headers Table header names
     * @returns {Object<string, string>} Map from standard field name
     *   (date, time, event, country, actual, forecast, previous, revised,
     *   impact) to the original header name
     */
    mapCalendarHeaders(headers) {
        const headerMap = {};
        headers.forEach(header => {
            const lowerHeader = header.toLowerCase();
            // Map date/time headers (a header is either a date or a time column)
            if (lowerHeader.includes('date')) {
                headerMap['date'] = header;
            }
            else if (lowerHeader.includes('time')) {
                headerMap['time'] = header;
            }
            // Map event headers
            if (lowerHeader.includes('event') || lowerHeader.includes('indicator') || lowerHeader.includes('description')) {
                headerMap['event'] = header;
            }
            // Map country/currency headers
            if (lowerHeader.includes('country') || lowerHeader.includes('currency')) {
                headerMap['country'] = header;
            }
            // Map data value headers
            if (lowerHeader.includes('actual') || lowerHeader === 'act') {
                headerMap['actual'] = header;
            }
            if (lowerHeader.includes('forecast') || lowerHeader.includes('consensus') || lowerHeader === 'cons' || lowerHeader === 'fcst') {
                headerMap['forecast'] = header;
            }
            if (lowerHeader.includes('previous') || lowerHeader === 'prev') {
                headerMap['previous'] = header;
            }
            if (lowerHeader.includes('revised')) {
                headerMap['revised'] = header;
            }
            // Map impact/importance headers
            if (lowerHeader.includes('impact') || lowerHeader.includes('importance')) {
                headerMap['impact'] = header;
            }
        });
        return headerMap;
    }
    /**
     * Extract a date from a row if present.
     *
     * The dedicated date column is checked first; otherwise a row whose only
     * non-empty cell is a date is treated as a date header row.
     *
     * @param {Object} row Row object keyed by header name
     * @param {Object<string, string>} headerMap Result of mapCalendarHeaders
     * @returns {?string} The date text, or null when the row has no date
     */
    extractDateFromRow(row, headerMap) {
        // Check for a date in a dedicated date column
        const dateCell = headerMap['date'] ? row[headerMap['date']] : undefined;
        if (dateCell && dateCell.rawText && this.isDateString(dateCell.rawText)) {
            return dateCell.rawText;
        }
        // Check for a row that contains only a date (header row)
        const nonEmptyKeys = Object.keys(row).filter(key => row[key] && row[key].rawText);
        if (nonEmptyKeys.length === 1) {
            const cellText = row[nonEmptyKeys[0]].rawText;
            if (this.isDateString(cellText)) {
                return cellText;
            }
        }
        return null;
    }
    /**
     * Check if a string looks like a date.
     *
     * @param {?string} text Text to test
     * @returns {boolean} True when the whole text matches a known date pattern
     */
    isDateString(text) {
        // Skip empty text
        if (!text) {
            return false;
        }
        const datePatterns = [
            /^\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4}$/, // DD/MM/YYYY or MM/DD/YYYY
            /^\d{4}[-\/]\d{1,2}[-\/]\d{1,2}$/, // YYYY/MM/DD
            /^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2}(st|nd|rd|th)?(,? \d{4})?$/i, // Month DD, YYYY
            /^(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),? [A-Za-z]+ \d{1,2}(st|nd|rd|th)?(,? \d{4})?$/i, // Weekday Month DD, YYYY
            /^(Today|Tomorrow|Yesterday)$/i // Special day names
        ];
        return datePatterns.some(pattern => pattern.test(text));
    }
    /**
     * Extract event data from a row.
     *
     * @param {Object} row Row object keyed by header name
     * @param {Object<string, string>} headerMap Result of mapCalendarHeaders
     * @param {?string} currentDate Date to attach to the event, if known
     * @returns {?Object} Standardized event (Date, Report, Time, Country,
     *   Actual, Consensus, Previous, Revised, Impact, Period - each only when
     *   available), or null when the row has no event description
     */
    extractEventData(row, headerMap, currentDate) {
        // Skip rows with no event description
        const eventCell = headerMap['event'] ? row[headerMap['event']] : undefined;
        if (!eventCell || !eventCell.rawText) {
            return null;
        }
        const event = {};
        // Add date if available
        if (currentDate) {
            event['Date'] = currentDate;
        }
        // `textual` fields are names/labels, not measurements
        const fieldMappings = [
            { source: 'event', target: 'Report', textual: true },
            { source: 'time', target: 'Time', textual: true },
            { source: 'country', target: 'Country', textual: true },
            { source: 'actual', target: 'Actual', textual: false },
            { source: 'forecast', target: 'Consensus', textual: false },
            { source: 'previous', target: 'Previous', textual: false },
            { source: 'revised', target: 'Revised', textual: false },
            { source: 'impact', target: 'Impact', textual: false }
        ];
        fieldMappings.forEach(({ source, target, textual }) => {
            const cell = headerMap[source] ? row[headerMap[source]] : undefined;
            if (!cell || cell.value === undefined) {
                return;
            }
            // Generic cell conversion may have turned a label into a boolean
            // or number (e.g. country code "NO" -> false); for textual fields
            // keep the original text in that case.
            const wasCoerced = cell.value !== null && typeof cell.value !== 'string';
            event[target] = textual && wasCoerced && cell.rawText ? cell.rawText : cell.value;
        });
        // Extract period if not present but in the event text
        if (!event['Period'] && event['Report']) {
            const periodMatch = String(event['Report']).match(/\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}\b|\b(Q[1-4])[ -]\d{4}\b|\b\d{4}[ -]Q[1-4]\b|\b\d{1,2}\/\d{1,2}\/\d{2,4}\b/i);
            if (periodMatch) {
                event['Period'] = periodMatch[0];
            }
        }
        return event;
    }
    /**
     * Convert table data to a simplified format suitable for JSON export:
     * every cell object is replaced by its `value`; entries that are not
     * cell objects (such as `_rowMetadata`) are kept as they are.
     *
     * @param {{headers: string[], rows: Object[], metadata: Object}} tableData Table data
     * @returns {{headers: string[], rows: Object[], metadata: Object}} Simplified table
     */
    simplifyTableData(tableData) {
        const simpleRows = tableData.rows.map(row => {
            const simpleRow = {};
            Object.entries(row).forEach(([key, cellData]) => {
                const isCell = cellData && typeof cellData === 'object' && 'value' in cellData;
                simpleRow[key] = isCell ? cellData.value : cellData;
            });
            return simpleRow;
        });
        // Preserve all metadata
        return {
            headers: tableData.headers,
            rows: simpleRows,
            metadata: tableData.metadata
        };
    }
    /**
     * Convert all tables to a simplified format.
     *
     * @param {Array<Object>} tables Table data objects
     * @returns {Array<Object>} Simplified tables, in the same order
     */
    simplifyAllTables(tables) {
        return tables.map(table => this.simplifyTableData(table));
    }
}
// Publish the class as the module's named `TableHandler` export.
exports.TableHandler = TableHandler;