UNPKG

@stack.thefennec.dev/telegram-export-parser

Version:

TypeScript library for parsing Telegram Desktop's data export with full type safety

709 lines 24.3 kB
"use strict"; /** * @fileoverview Actor parsing utilities for extracting user information from Telegram export data. * * This module provides sophisticated actor extraction using multiple strategies with confidence scoring. * It handles various data formats and edge cases in Telegram exports to reliably identify message senders, * service actors, bots, and channel authors. * * ## Key Features * * - **Multi-strategy parsing** - Tries multiple approaches to extract actor data * - **Confidence scoring** - Rates parsing reliability (0-100 scale) * - **Fallback handling** - Graceful degradation when data is incomplete * - **Type detection** - Automatically determines if actor is user, bot, channel, etc. * - **Data normalization** - Cleans and standardizes actor information * * @example Basic Usage * ```typescript * import { parseActor, parseMessageSender } from './parsers/actors' * * // Parse from raw export data * const actor = parseActor({ * from: 'John Doe', * fromId: 'user123456789' * }) * * // Parse message sender specifically * const sender = parseMessageSender('Jane Smith', 'user987654321') * ``` * * @example Advanced Usage * ```typescript * import { parseActorWithConfidence, parseMessageActors } from './parsers/actors' * * // Get confidence scoring for debugging * const result = parseActorWithConfidence(rawData) * console.log(`Confidence: ${result.confidence}%, Source: ${result.source}`) * * // Parse all actors from a message * const primaryActor = parseMessageActors(rawTelegramMessage) * ``` */ Object.defineProperty(exports, "__esModule", { value: true }); exports.parseMessageActors = exports.parseActorWithConfidence = exports.parseActors = exports.parseUsernameMention = exports.parseChannelAuthor = exports.parseViaBot = exports.parseServiceActor = exports.parseMessageSender = exports.parseActor = void 0; // ===================================================== // DATA NORMALIZATION UTILITIES // ===================================================== /** * 
/**
 * Extracts a numeric ID from various input formats.
 *
 * Telegram exports sometimes contain IDs as strings with prefixes (e.g., "user123456789")
 * or as pure numbers. This function normalizes them to numeric values. For strings the
 * first run of decimal digits is used.
 *
 * @param id - The ID value to extract from (string, number, or undefined)
 * @returns Numeric ID, or undefined if none can be extracted (including NaN/Infinity input)
 *
 * @example
 * ```typescript
 * extractNumericId('user123456789')    // 123456789
 * extractNumericId('channel987654321') // 987654321
 * extractNumericId(123456789)          // 123456789
 * extractNumericId('invalid')          // undefined
 * extractNumericId(NaN)                // undefined
 * ```
 */
const extractNumericId = (id) => {
    if (typeof id === 'number') {
        // NaN and +/-Infinity are numbers by type but useless as identifiers.
        return Number.isFinite(id) ? id : undefined;
    }
    if (!id) {
        return undefined;
    }
    const match = String(id).match(/\d+/);
    return match ? Number.parseInt(match[0], 10) : undefined;
};
/**
 * Normalizes a username by trimming whitespace and removing one leading "@".
 *
 * Telegram usernames can appear with or without the @ prefix.
 * Non-string truthy values are converted with `String()` instead of throwing.
 *
 * @param username - Raw username (may include @)
 * @returns Username without the leading @, or undefined when the input is empty/nullish.
 *          Note: input consisting only of whitespace or "@" yields an empty string.
 *
 * @example
 * ```typescript
 * extractUsername('@john_doe')   // 'john_doe'
 * extractUsername('jane_smith')  // 'jane_smith'
 * extractUsername(' @bot_name ') // 'bot_name'
 * extractUsername('')            // undefined
 * ```
 */
const extractUsername = (username) => {
    if (!username) {
        return undefined;
    }
    const cleaned = String(username).trim();
    return cleaned.startsWith('@') ? cleaned.slice(1) : cleaned;
};
/**
 * Determines the actor type based on available identifiers and context.
 *
 * Detection order:
 * 1. ID prefixes - "user123", "channel456", "bot789" (case-insensitive)
 * 2. Username patterns - usernames ending in "bot"
 * 3. Context hints - "via_bot", "channel_post"
 * 4. Fallback - 'user' when any ID is present, otherwise 'deleted_user'
 *
 * @param fromId - Telegram ID (prefixed string, or a bare number)
 * @param username - Username for pattern detection
 * @param context - Additional context hint
 * @returns Determined actor type
 *
 * @example
 * ```typescript
 * determineActorType('user123456789', undefined)      // 'user'
 * determineActorType('channel987654321', undefined)   // 'channel'
 * determineActorType(undefined, 'helper_bot')         // 'bot'
 * determineActorType(undefined, undefined, 'via_bot') // 'bot'
 * determineActorType(undefined, undefined)            // 'deleted_user'
 * determineActorType(123456789, undefined)            // 'user'
 * ```
 */
const determineActorType = (fromId, username, context) => {
    // ID prefixes are checked first.
    if (fromId) {
        // String() keeps bare numeric IDs from throwing on toLowerCase().
        const lowerFromId = String(fromId).toLowerCase();
        if (lowerFromId.startsWith('user')) {
            return 'user';
        }
        if (lowerFromId.startsWith('channel')) {
            return 'channel';
        }
        if (lowerFromId.startsWith('bot')) {
            return 'bot';
        }
    }
    // Username pattern: a trailing "bot" marks a bot account.
    if (username && String(username).toLowerCase().endsWith('bot')) {
        return 'bot';
    }
    // Context-based detection.
    if (context === 'via_bot') {
        return 'bot';
    }
    if (context === 'channel_post') {
        return 'channel';
    }
    // Default: an ID without a recognised prefix is treated as a user.
    return fromId ? 'user' : 'deleted_user';
};
/**
 * Normalizes display names with consistent handling of special cases.
 *
 * Handles:
 * - Empty, whitespace-only or nullish names -> 'Unknown'
 * - Deleted/removed/unknown account labels -> 'Deleted User'
 * - Surrounding whitespace -> trimmed
 *
 * The deleted-account check only fires when "deleted", "removed" or "unknown" is the
 * whole first word (case-insensitive), so real names that merely begin with those
 * letters (e.g. "Removedina") are preserved.
 *
 * @param displayName - Raw display name from export
 * @returns Normalized display name (never empty)
 *
 * @example
 * ```typescript
 * normalizeDisplayName(' John Doe ')      // 'John Doe'
 * normalizeDisplayName('deleted user')    // 'Deleted User'
 * normalizeDisplayName('removed account') // 'Deleted User'
 * normalizeDisplayName('')                // 'Unknown'
 * normalizeDisplayName(undefined)         // 'Unknown'
 * ```
 */
const normalizeDisplayName = (displayName) => {
    if (displayName == null) {
        return 'Unknown';
    }
    // String() lets non-string values through instead of throwing on .trim().
    const normalized = String(displayName).trim();
    if (normalized === '') {
        return 'Unknown';
    }
    // The lookahead rejects a following letter/digit so only a complete leading word matches;
    // separators such as space, "_" or end-of-string still count.
    if (/^(deleted|removed|unknown)(?![a-z0-9])/i.test(normalized)) {
        return 'Deleted User';
    }
    return normalized;
};
// =====================================================
// MULTI-STRATEGY ACTOR EXTRACTION ENGINE
// =====================================================
/**
 * Actor extraction engine that tries several parsing strategies in turn.
 *
 * Every `extract*` strategy targets one data pattern of a raw Telegram export record
 * and returns `{ actor, confidence, source }`. `actor` is `null` (with confidence 0 and a
 * `*_missing` source) when the fields that strategy needs are absent.
 *
 * ## Strategy priority (the order used by `extract`):
 * 1. **Primary Sender** (60-95) - `from` / `fromId`
 * 2. **Service Actor** (55-90) - `actor` / `actorId`
 * 3. **Channel Author** (85) - `author` (+ optional `fromId`)
 * 4. **Via Bot** (70) - `viaBot`
 * 5. **Username Mention** (50-80) - `username` (+ optional `userId`)
 * 6. **Display Name Fallback** (30) - `displayName`
 *
 * @example
 * ```typescript
 * const extractor = new ActorExtractor()
 * const result = extractor.extract({
 *   from: 'John Doe',
 *   fromId: 'user123456789'
 * })
 * console.log(`Confidence: ${result.confidence}%`) // 95%
 * ```
 */
class ActorExtractor {
    /**
     * Strategy 1: primary message sender (`from` / `fromId`).
     *
     * **Confidence:** name and ID 95, ID only 80, name only 60.
     *
     * @param data - Raw actor data
     * @returns Parse result with primary sender actor
     */
    extractPrimarySender(data) {
        const { from, fromId } = data;
        if (!from && !fromId) {
            return { actor: null, confidence: 0, source: 'primary_sender_missing' };
        }
        return {
            actor: {
                id: extractNumericId(fromId),
                type: determineActorType(fromId, undefined),
                username: undefined, // Not available in primary sender data
                displayName: normalizeDisplayName(from),
                authoredBy: undefined
            },
            confidence: (from && fromId) ? 95 : (fromId ? 80 : 60),
            source: 'primary_sender'
        };
    }
    /**
     * Strategy 2: service message actor (`actor` / `actorId`), e.g. calls and joins.
     *
     * **Confidence:** name and ID 90, ID only 75, name only 55.
     *
     * @param data - Raw actor data
     * @returns Parse result with service actor
     */
    extractServiceActor(data) {
        const { actor, actorId } = data;
        if (!actor && !actorId) {
            return { actor: null, confidence: 0, source: 'service_actor_missing' };
        }
        return {
            actor: {
                id: extractNumericId(actorId),
                type: determineActorType(actorId, undefined),
                username: undefined,
                displayName: normalizeDisplayName(actor),
                authoredBy: undefined
            },
            confidence: (actor && actorId) ? 90 : (actorId ? 75 : 55),
            source: 'service_actor'
        };
    }
    /**
     * Strategy 4: bot a message was sent through (`viaBot`, username only).
     *
     * A value that is empty once whitespace and the "@" prefix are stripped counts as missing.
     *
     * **Confidence:** 70 (username only, no ID).
     *
     * @param data - Raw actor data
     * @returns Parse result with via bot actor
     */
    extractViaBot(data) {
        const { viaBot } = data;
        const username = viaBot ? extractUsername(viaBot) : undefined;
        if (!username) {
            return { actor: null, confidence: 0, source: 'via_bot_missing' };
        }
        return {
            actor: {
                id: undefined, // ID not available from username only
                type: 'bot',
                username,
                displayName: normalizeDisplayName(viaBot),
                authoredBy: undefined
            },
            confidence: 70,
            source: 'via_bot'
        };
    }
    /**
     * Strategy 3: channel post author (`author`, optional `fromId`).
     *
     * The actor type is 'channel_author' when `fromId` contains "channel"
     * (case-insensitive), otherwise 'user'.
     *
     * **Confidence:** 85.
     *
     * @param data - Raw actor data
     * @returns Parse result with channel author
     */
    extractChannelAuthor(data) {
        const { author, fromId } = data;
        if (!author) {
            return { actor: null, confidence: 0, source: 'channel_author_missing' };
        }
        // String() so a bare numeric fromId does not throw on toLowerCase().
        const isChannelId = fromId != null && String(fromId).toLowerCase().includes('channel');
        return {
            actor: {
                id: extractNumericId(fromId),
                type: isChannelId ? 'channel_author' : 'user',
                username: undefined,
                displayName: normalizeDisplayName(author),
                authoredBy: author // Channel author case
            },
            confidence: 85,
            source: 'channel_author'
        };
    }
    /**
     * Strategy 5: username mention (`username`, optional `userId`).
     *
     * A username that is empty once whitespace and the "@" prefix are stripped counts
     * as missing, so no actor named just "@" is produced.
     *
     * **Confidence:** username and ID 80, username only 50.
     *
     * @param data - Raw actor data
     * @returns Parse result with username-based actor
     */
    extractFromUsername(data) {
        const { username, userId } = data;
        const cleanUsername = username ? extractUsername(username) : undefined;
        if (!cleanUsername) {
            return { actor: null, confidence: 0, source: 'username_missing' };
        }
        return {
            actor: {
                id: extractNumericId(userId),
                type: determineActorType(undefined, cleanUsername),
                username: cleanUsername,
                displayName: `@${cleanUsername}`,
                authoredBy: undefined
            },
            confidence: userId ? 80 : 50,
            source: 'username_mention'
        };
    }
    /**
     * Strategy 6: last-resort extraction from `displayName` alone.
     *
     * **Confidence:** 30 (no unique identifier).
     *
     * @param data - Raw actor data
     * @returns Parse result with display name-based actor
     */
    extractFromDisplayName(data) {
        const { displayName } = data;
        if (!displayName) {
            return { actor: null, confidence: 0, source: 'display_name_missing' };
        }
        return {
            actor: {
                id: undefined,
                type: 'user',
                username: undefined,
                displayName: normalizeDisplayName(displayName),
                authoredBy: undefined
            },
            confidence: 30,
            source: 'display_name_fallback'
        };
    }
    /**
     * Runs the strategies in priority order and returns the first result that
     * produced an actor.
     *
     * When no strategy matches (or `data` is null/undefined) the result is a
     * 'deleted_user' actor named 'Unknown' with confidence 0 and source 'fallback',
     * so `actor` is never null in the returned object.
     *
     * @param data - Raw actor data from Telegram export
     * @returns Actor parse result with confidence scoring
     *
     * @example
     * ```typescript
     * const result = extractor.extract({
     *   from: 'John Doe',
     *   fromId: 'user123456789',
     *   username: 'john_doe'
     * })
     * // Primary sender strategy wins: confidence 95
     * ```
     */
    extract(data) {
        // Treat a missing record like an empty one instead of throwing on destructuring.
        const input = data ?? {};
        const strategies = [
            this.extractPrimarySender,
            this.extractServiceActor,
            this.extractChannelAuthor,
            this.extractViaBot,
            this.extractFromUsername,
            this.extractFromDisplayName
        ];
        for (const strategy of strategies) {
            const result = strategy.call(this, input);
            if (result.actor) {
                return result;
            }
        }
        // Ultimate fallback for completely missing data
        return {
            actor: {
                id: undefined,
                type: 'deleted_user',
                username: undefined,
                displayName: 'Unknown',
                authoredBy: undefined
            },
            confidence: 0,
            source: 'fallback'
        };
    }
}
// =====================================================
// PUBLIC API
// =====================================================
/** Singleton extractor instance for efficient reuse */
const extractor = new ActorExtractor();
/**
 * Main actor parsing entry point.
 *
 * Hands the raw record to the shared extractor, which tries its strategies in
 * priority order, and returns only the actor part of the result (confidence and
 * source are dropped; use `parseActorWithConfidence` to keep them).
 *
 * @param data - Raw actor data from Telegram export
 * @returns The `actor` field of the extractor's result
 *
 * @example
 * ```typescript
 * // Parse regular message sender
 * const actor = parseActor({
 *   from: 'John Doe',
 *   fromId: 'user123456789'
 * })
 *
 * // Parse service message actor
 * const serviceActor = parseActor({
 *   actor: 'Admin User',
 *   actorId: 'user987654321'
 * })
 * ```
 */
const parseActor = (data) => {
    const { actor } = extractor.extract(data);
    return actor;
};
exports.parseActor = parseActor;
// =====================================================
// SPECIALIZED PARSING FUNCTIONS
// =====================================================
/**
 * Parse a regular message sender from the 'from' and 'from_id' fields.
 *
 * Convenience wrapper that packs both values into a record and delegates to
 * the exported `parseActor`.
 *
 * @param from - Display name of the sender
 * @param fromId - Telegram ID of the sender
 * @returns Whatever `parseActor` yields for `{ from, fromId }`
 *
 * @example
 * ```typescript
 * const sender = parseMessageSender('Jane Doe', 'user987654321')
 * console.log(sender?.displayName) // 'Jane Doe'
 * console.log(sender?.id)          // 987654321
 * ```
 */
const parseMessageSender = (from, fromId) => {
    // Look the parser up on `exports` at call time, invoked without a receiver.
    const parse = exports.parseActor;
    const senderData = { from, fromId };
    return parse(senderData);
};
exports.parseMessageSender = parseMessageSender;
* * @param actor - Display name of the service actor * @param actorId - Telegram ID of the service actor * @returns Parsed actor or null if insufficient data * * @example * ```typescript * const serviceActor = parseServiceActor('Admin User', 'user123456789') * // Use for: "Admin User invited 3 users to the group" * ``` */ const parseServiceActor = (actor, actorId) => { return (0, exports.parseActor)({ actor, actorId }); }; exports.parseServiceActor = parseServiceActor; /** * Parse bot actor from 'via_bot' field. * * When messages are sent through bots, the via_bot field contains * the bot's username. This function extracts that bot information. * * @param viaBot - Bot username (may include @ prefix) * @returns Parsed bot actor or null if invalid * * @example * ```typescript * const bot = parseViaBot('@helper_bot') * console.log(bot?.type) // 'bot' * console.log(bot?.username) // 'helper_bot' * ``` */ const parseViaBot = (viaBot) => { return (0, exports.parseActor)({ viaBot }); }; exports.parseViaBot = parseViaBot; /** * Parse channel author from 'author' field. * * In channel posts, the author field indicates who wrote the content * on behalf of the channel. Useful for attributed channel content. * * @param author - Author name from channel post * @param fromId - Optional channel/author ID * @returns Parsed channel author or null if invalid * * @example * ```typescript * const author = parseChannelAuthor('Content Creator', 'channel123456789') * console.log(author?.type) // 'channel_author' * console.log(author?.authoredBy) // 'Content Creator' * ``` */ const parseChannelAuthor = (author, fromId) => { return (0, exports.parseActor)({ author, fromId }); }; exports.parseChannelAuthor = parseChannelAuthor; /** * Parse actor from username mention data. * * Useful when processing mentions within message content or * when only username information is available. 
* * @param username - Username (with or without @ prefix) * @param userId - Optional numeric user ID * @returns Parsed actor or null if invalid username * * @example * ```typescript * const mentioned = parseUsernameMention('@john_doe', 123456789) * const usernameOnly = parseUsernameMention('jane_smith') // Lower confidence * ``` */ const parseUsernameMention = (username, userId) => { return (0, exports.parseActor)({ username, userId }); }; exports.parseUsernameMention = parseUsernameMention; // ===================================================== // BATCH AND ADVANCED PROCESSING // ===================================================== /** * Parse multiple actors from an array of raw data. * * Efficiently processes multiple actor entries and filters out failed parses. * Useful for batch processing of export data. * * @param dataArray - Array of raw actor data objects * @returns Array of successfully parsed actors (failures are excluded) * * @example * ```typescript * const rawActors = [ * { from: 'User 1', fromId: 'user111' }, * { from: 'User 2', fromId: 'user222' }, * { from: '', fromId: '' }, // This will be filtered out * ] * * const actors = parseActors(rawActors) * console.log(actors.length) // 2 (invalid entry filtered out) * ``` */ const parseActors = (dataArray) => { return dataArray .map(exports.parseActor) .filter((actor) => actor !== null); }; exports.parseActors = parseActors; /** * Parse actor with detailed confidence and debugging information. * * Returns full parsing result including confidence score and strategy used. * Useful for debugging, quality assessment, and analytics. 
* * @param data - Raw actor data from Telegram export * @returns Complete parse result with confidence metadata * * @example * ```typescript * const result = parseActorWithConfidence({ * from: 'John Doe', * fromId: 'user123456789' * }) * * console.log(`Actor: ${result.actor?.displayName}`) * console.log(`Confidence: ${result.confidence}%`) * console.log(`Strategy: ${result.source}`) * * if (result.confidence < 70) { * console.log('Low confidence - review data quality') * } * ``` */ const parseActorWithConfidence = (data) => { return extractor.extract(data); }; exports.parseActorWithConfidence = parseActorWithConfidence; /** * Extract the primary actor from a complete raw Telegram message. * * Convenience function that tries multiple actor fields from a message * and returns the most relevant one based on priority logic. * * **Priority Order:** * 1. Message sender (from/from_id) - highest priority * 2. Service actor (actor/actor_id) * 3. Channel author (author) * 4. Via bot (via_bot) - lowest priority * * @param rawMessage - Raw message object from Telegram export * @returns Most relevant actor or null if no valid actor found * * @example * ```typescript * // Regular user message * const userMsg = { * from: 'John Doe', * from_id: 'user123456789' * } * const actor1 = parseMessageActors(userMsg) // Returns John Doe * * // Service message with actor * const serviceMsg = { * actor: 'Admin User', * actor_id: 'user987654321' * } * const actor2 = parseMessageActors(serviceMsg) // Returns Admin User * * // Complex message with multiple actors * const complexMsg = { * from: 'Bot Name', * from_id: 'user555666777', * via_bot: '@helper_bot' * } * const actor3 = parseMessageActors(complexMsg) // Returns Bot Name (higher priority) * ``` */ const parseMessageActors = (rawMessage) => { const actors = { sender: (0, exports.parseMessageSender)(rawMessage.from, rawMessage.from_id), serviceActor: (0, exports.parseServiceActor)(rawMessage.actor, rawMessage.actor_id), channelAuthor: (0, 
exports.parseChannelAuthor)(rawMessage.author, rawMessage.from_id), viaBot: (0, exports.parseViaBot)(rawMessage.via_bot) }; // Return the most relevant actor (priority: sender > serviceActor > channelAuthor > viaBot) return actors.sender ?? actors.serviceActor ?? actors.channelAuthor ?? actors.viaBot ?? null; }; exports.parseMessageActors = parseMessageActors; //# sourceMappingURL=actors.js.map