// Package: @stack.thefennec.dev/telegram-export-parser
// Version: (not captured in this listing)
// Description: TypeScript library for parsing Telegram Desktop's data export with full type safety
// Listing metadata: 709 lines, 24.3 kB, JavaScript
;
/**
* @fileoverview Actor parsing utilities for extracting user information from Telegram export data.
*
* This module provides sophisticated actor extraction using multiple strategies with confidence scoring.
* It handles various data formats and edge cases in Telegram exports to reliably identify message senders,
* service actors, bots, and channel authors.
*
* ## Key Features
*
* - **Multi-strategy parsing** - Tries multiple approaches to extract actor data
* - **Confidence scoring** - Rates parsing reliability (0-100 scale)
* - **Fallback handling** - Graceful degradation when data is incomplete
* - **Type detection** - Automatically determines if actor is user, bot, channel, etc.
* - **Data normalization** - Cleans and standardizes actor information
*
* @example Basic Usage
* ```typescript
* import { parseActor, parseMessageSender } from './parsers/actors'
*
* // Parse from raw export data
* const actor = parseActor({
* from: 'John Doe',
* fromId: 'user123456789'
* })
*
* // Parse message sender specifically
* const sender = parseMessageSender('Jane Smith', 'user987654321')
* ```
*
* @example Advanced Usage
* ```typescript
* import { parseActorWithConfidence, parseMessageActors } from './parsers/actors'
*
* // Get confidence scoring for debugging
* const result = parseActorWithConfidence(rawData)
* console.log(`Confidence: ${result.confidence}%, Source: ${result.source}`)
*
* // Parse all actors from a message
* const primaryActor = parseMessageActors(rawTelegramMessage)
* ```
*/
// Flag this CommonJS module with the conventional `__esModule` marker that transpilers
// emit for code originally written as an ES module.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare every public export as undefined (`void 0`); each one is assigned its real
// value immediately after the corresponding definition further down in this file.
exports.parseMessageActors = exports.parseActorWithConfidence = exports.parseActors = exports.parseUsernameMention = exports.parseChannelAuthor = exports.parseViaBot = exports.parseServiceActor = exports.parseMessageSender = exports.parseActor = void 0;
// =====================================================
// DATA NORMALIZATION UTILITIES
// =====================================================
/**
 * Extracts a numeric ID from a prefixed string or a plain number.
 *
 * String input is scanned for its first run of decimal digits
 * (e.g. "user123456789" -> 123456789). Numeric input is passed through,
 * except for non-finite values (NaN, +/-Infinity), which are not usable
 * as identifiers and are reported as "not extractable".
 *
 * @param id - The ID value to extract from (string, number, or undefined)
 * @returns Numeric ID, or undefined when no ID can be extracted
 *
 * @example
 * ```typescript
 * extractNumericId('user123456789') // 123456789
 * extractNumericId('channel987654321') // 987654321
 * extractNumericId(123456789) // 123456789
 * extractNumericId('invalid') // undefined
 * extractNumericId(NaN) // undefined
 * ```
 */
const extractNumericId = (id) => {
    if (typeof id === 'number') {
        // NaN / Infinity would otherwise leak out as a bogus "ID".
        return Number.isFinite(id) ? id : undefined;
    }
    if (!id) {
        return undefined;
    }
    const digits = String(id).match(/\d+/);
    return digits ? Number.parseInt(digits[0], 10) : undefined;
};
/**
 * Produces a username without surrounding whitespace and without a leading "@".
 *
 * Falsy input (undefined, null, empty string) yields undefined. Otherwise the
 * value is trimmed and a single leading "@" is dropped if present.
 *
 * @param username - Raw username string (may include @)
 * @returns The trimmed username without its leading "@", or undefined for falsy input
 *
 * @example
 * ```typescript
 * extractUsername('@john_doe') // 'john_doe'
 * extractUsername('jane_smith') // 'jane_smith'
 * extractUsername(' @bot_name ') // 'bot_name'
 * extractUsername('') // undefined
 * ```
 */
const extractUsername = (username) => {
    if (!username) {
        return undefined;
    }
    // An anchored, non-global pattern removes at most one "@" at the very start.
    return username.trim().replace(/^@/, '');
};
/**
 * Determines the actor type from the available identifiers and context.
 *
 * Checks are applied in this order, and the first match wins:
 * 1. ID prefix (case-insensitive): "user…" -> 'user', "channel…" -> 'channel', "bot…" -> 'bot'
 * 2. Username ending in "bot" (case-insensitive) -> 'bot'
 * 3. Context hint: 'via_bot' -> 'bot', 'channel_post' -> 'channel'
 * 4. Fallback: 'user' when any ID was supplied, otherwise 'deleted_user'
 *
 * The ID is converted to a string before inspection, so a plain numeric ID
 * (which carries no prefix) falls through to the later checks instead of throwing.
 *
 * @param fromId - Telegram ID (prefixed string, or a bare number)
 * @param username - Username for pattern detection
 * @param context - Additional context hint
 * @returns Determined actor type
 *
 * @example
 * ```typescript
 * determineActorType('user123456789', undefined) // 'user'
 * determineActorType('channel987654321', undefined) // 'channel'
 * determineActorType(123456789, undefined) // 'user'
 * determineActorType(undefined, 'helper_bot') // 'bot'
 * determineActorType(undefined, undefined, 'via_bot') // 'bot'
 * determineActorType(undefined, undefined) // 'deleted_user'
 * ```
 */
const determineActorType = (fromId, username, context) => {
    // ID prefixes are checked first; each recognised prefix is also the resulting type.
    if (fromId) {
        const idText = String(fromId).toLowerCase();
        for (const prefix of ['user', 'channel', 'bot']) {
            if (idText.startsWith(prefix)) {
                return prefix;
            }
        }
    }
    // Username pattern: a trailing "bot" marks a bot account.
    if (username && String(username).toLowerCase().endsWith('bot')) {
        return 'bot';
    }
    // Context hints supplied by the caller.
    if (context === 'via_bot') {
        return 'bot';
    }
    if (context === 'channel_post') {
        return 'channel';
    }
    // Nothing conclusive: an ID without a known prefix is treated as a user.
    return fromId ? 'user' : 'deleted_user';
};
/**
 * Normalizes a display name taken from the export.
 *
 * - Missing, non-string, empty, or whitespace-only input becomes 'Unknown'.
 * - A name that consists solely of a deleted-account indicator
 *   ("deleted", "removed" or "unknown", optionally followed by "user" or
 *   "account", case-insensitive) becomes 'Deleted User'.
 * - Anything else is returned trimmed.
 *
 * The indicator must match the whole name: a real name that merely starts with
 * one of those words (e.g. "Unknown Artist") is kept as-is.
 *
 * @param displayName - Raw display name from export
 * @returns Normalized display name (never empty)
 *
 * @example
 * ```typescript
 * normalizeDisplayName(' John Doe ') // 'John Doe'
 * normalizeDisplayName('deleted user') // 'Deleted User'
 * normalizeDisplayName('removed account') // 'Deleted User'
 * normalizeDisplayName('Unknown Artist') // 'Unknown Artist'
 * normalizeDisplayName('') // 'Unknown'
 * normalizeDisplayName(undefined) // 'Unknown'
 * ```
 */
const normalizeDisplayName = (displayName) => {
    if (typeof displayName !== 'string') {
        return 'Unknown';
    }
    const trimmed = displayName.trim();
    if (trimmed === '') {
        return 'Unknown';
    }
    // Anchored at both ends so that only pure indicators are rewritten.
    if (/^(?:deleted|removed|unknown)(?:\s+(?:user|account))?$/i.test(trimmed)) {
        return 'Deleted User';
    }
    return trimmed;
};
// =====================================================
// MULTI-STRATEGY ACTOR EXTRACTION ENGINE
// =====================================================
/**
 * Multi-strategy extractor that turns raw Telegram export fields into an actor.
 *
 * Each `extract*` method inspects one group of fields and returns a result of
 * the shape `{ actor, confidence, source }`. When its fields are absent the
 * method returns `actor: null`, `confidence: 0` and a `*_missing` source.
 * `extract` runs the strategies in priority order and returns the first result
 * that carries an actor.
 *
 * ## Strategy priority (as applied by `extract`)
 * 1. **Primary Sender** (`from` / `fromId`) - 95 / 80 / 60
 * 2. **Service Actor** (`actor` / `actorId`) - 90 / 75 / 55
 * 3. **Channel Author** (`author`, optional `fromId`) - 85
 * 4. **Via Bot** (`viaBot`) - 70
 * 5. **Username Mention** (`username`, optional `userId`) - 80 / 50
 * 6. **Display Name Fallback** (`displayName`) - 30
 *
 * @example
 * ```typescript
 * const extractor = new ActorExtractor()
 * const result = extractor.extract({
 *   from: 'John Doe',
 *   fromId: 'user123456789'
 * })
 * console.log(`Confidence: ${result.confidence}%`) // 95%
 * ```
 */
class ActorExtractor {
    /**
     * Strategy 1: primary message sender from `from` and `fromId`.
     *
     * Confidence: 95 when both name and ID are present, 80 for ID only,
     * 60 for name only.
     *
     * @param data - Raw actor data
     * @returns Parse result; `actor` is null when both fields are falsy
     */
    extractPrimarySender(data) {
        const { from, fromId } = data;
        if (!from && !fromId) {
            return { actor: null, confidence: 0, source: 'primary_sender_missing' };
        }
        // IDs may arrive as plain numbers (extractNumericId accepts them); the
        // type helper inspects string prefixes, so hand it a string.
        const idText = typeof fromId === 'number' ? String(fromId) : fromId;
        return {
            actor: {
                id: extractNumericId(fromId),
                type: determineActorType(idText, undefined),
                username: undefined, // Not available in primary sender data
                displayName: normalizeDisplayName(from),
                authoredBy: undefined,
            },
            confidence: (from && fromId) ? 95 : (fromId ? 80 : 60),
            source: 'primary_sender',
        };
    }
    /**
     * Strategy 2: service message actor from `actor` and `actorId`.
     *
     * Confidence: 90 when both name and ID are present, 75 for ID only,
     * 55 for name only.
     *
     * @param data - Raw actor data
     * @returns Parse result; `actor` is null when both fields are falsy
     */
    extractServiceActor(data) {
        const { actor, actorId } = data;
        if (!actor && !actorId) {
            return { actor: null, confidence: 0, source: 'service_actor_missing' };
        }
        // Same numeric-ID normalisation as for the primary sender.
        const idText = typeof actorId === 'number' ? String(actorId) : actorId;
        return {
            actor: {
                id: extractNumericId(actorId),
                type: determineActorType(idText, undefined),
                username: undefined,
                displayName: normalizeDisplayName(actor),
                authoredBy: undefined,
            },
            confidence: (actor && actorId) ? 90 : (actorId ? 75 : 55),
            source: 'service_actor',
        };
    }
    /**
     * Strategy 3: channel post author from `author` (and optional `fromId`).
     *
     * The actor type is 'channel_author' when the ID text contains "channel"
     * (case-insensitive) and 'user' otherwise. `authoredBy` carries the raw
     * author value. Confidence is always 85.
     *
     * @param data - Raw actor data
     * @returns Parse result; `actor` is null when `author` is falsy
     */
    extractChannelAuthor(data) {
        const { author, fromId } = data;
        if (!author) {
            return { actor: null, confidence: 0, source: 'channel_author_missing' };
        }
        // String() keeps numeric IDs from throwing on toLowerCase().
        const isChannelId = fromId != null && String(fromId).toLowerCase().includes('channel');
        return {
            actor: {
                id: extractNumericId(fromId),
                type: isChannelId ? 'channel_author' : 'user',
                username: undefined,
                displayName: normalizeDisplayName(author),
                authoredBy: author, // Channel author case
            },
            confidence: 85,
            source: 'channel_author',
        };
    }
    /**
     * Strategy 4: bot named in `viaBot`.
     *
     * Only a username is available here, so `id` is always undefined and the
     * confidence is fixed at 70. A value that is empty once whitespace and the
     * "@" prefix are removed is treated as missing.
     *
     * @param data - Raw actor data
     * @returns Parse result; `actor` is null when no usable bot username exists
     */
    extractViaBot(data) {
        const { viaBot } = data;
        const username = viaBot ? extractUsername(viaBot) : undefined;
        if (!username) {
            return { actor: null, confidence: 0, source: 'via_bot_missing' };
        }
        return {
            actor: {
                id: undefined, // ID not available from username only
                type: 'bot',
                username,
                displayName: normalizeDisplayName(viaBot),
                authoredBy: undefined,
            },
            confidence: 70,
            source: 'via_bot',
        };
    }
    /**
     * Strategy 5: actor described by `username` (and optional `userId`).
     *
     * The display name is the cleaned username prefixed with "@". Confidence is
     * 80 when `userId` is truthy and 50 otherwise. A username that is empty
     * after cleaning (e.g. "@" or whitespace) is treated as missing so that no
     * bare "@" actor is produced.
     *
     * @param data - Raw actor data
     * @returns Parse result; `actor` is null when no usable username exists
     */
    extractFromUsername(data) {
        const { username, userId } = data;
        const cleanUsername = username ? extractUsername(username) : undefined;
        if (!cleanUsername) {
            return { actor: null, confidence: 0, source: 'username_missing' };
        }
        return {
            actor: {
                id: extractNumericId(userId),
                type: determineActorType(undefined, cleanUsername),
                username: cleanUsername,
                displayName: `@${cleanUsername}`,
                authoredBy: undefined,
            },
            confidence: userId ? 80 : 50, // Lower confidence without ID
            source: 'username_mention',
        };
    }
    /**
     * Strategy 6: last resort, built from `displayName` alone.
     *
     * Produces a 'user' actor without ID or username at confidence 30.
     *
     * @param data - Raw actor data
     * @returns Parse result; `actor` is null when `displayName` is falsy
     */
    extractFromDisplayName(data) {
        const { displayName } = data;
        if (!displayName) {
            return { actor: null, confidence: 0, source: 'display_name_missing' };
        }
        return {
            actor: {
                id: undefined,
                type: 'user',
                username: undefined,
                displayName: normalizeDisplayName(displayName),
                authoredBy: undefined,
            },
            confidence: 30,
            source: 'display_name_fallback',
        };
    }
    /**
     * Runs all strategies in priority order and returns the first result that
     * carries an actor.
     *
     * When no strategy matches (or `data` is null/undefined) the result is a
     * placeholder actor of type 'deleted_user' named 'Unknown', with
     * confidence 0 and source 'fallback'.
     *
     * @param data - Raw actor data from Telegram export
     * @returns Actor parse result with confidence scoring
     *
     * @example
     * ```typescript
     * const result = extractor.extract({
     *   from: 'John Doe',
     *   fromId: 'user123456789',
     *   username: 'john_doe'
     * })
     * // Primary sender strategy wins: confidence 95, source 'primary_sender'
     * ```
     */
    extract(data) {
        // Tolerate a missing data object instead of throwing on destructuring.
        const input = data ?? {};
        const strategies = [
            this.extractPrimarySender,
            this.extractServiceActor,
            this.extractChannelAuthor,
            this.extractViaBot,
            this.extractFromUsername,
            this.extractFromDisplayName,
        ];
        for (const strategy of strategies) {
            const result = strategy.call(this, input);
            if (result.actor) {
                return result;
            }
        }
        // Nothing usable in the input at all.
        return {
            actor: {
                id: undefined,
                type: 'deleted_user',
                username: undefined,
                displayName: 'Unknown',
                authoredBy: undefined,
            },
            confidence: 0,
            source: 'fallback',
        };
    }
}
// =====================================================
// PUBLIC API
// =====================================================
/**
 * Module-level ActorExtractor instance, created once when this module loads and
 * reused by the parsing functions below instead of constructing one per call.
 */
const extractor = new ActorExtractor();
/**
 * General-purpose actor parser.
 *
 * Hands the raw data to the shared extractor and returns only the `actor`
 * part of its result, discarding the confidence score and source label.
 * Use `parseActorWithConfidence` when that metadata is needed.
 *
 * @param data - Raw actor data from Telegram export
 * @returns The `actor` field of the extractor's result
 *
 * @example
 * ```typescript
 * // Regular message sender
 * const actor = parseActor({
 *   from: 'John Doe',
 *   fromId: 'user123456789'
 * })
 *
 * // Service message actor
 * const serviceActor = parseActor({
 *   actor: 'Admin User',
 *   actorId: 'user987654321'
 * })
 * ```
 */
const parseActor = (data) => {
    const { actor } = extractor.extract(data);
    return actor;
};
exports.parseActor = parseActor;
// =====================================================
// SPECIALIZED PARSING FUNCTIONS
// =====================================================
/**
 * Parses a regular message sender from the export's 'from' and 'from_id' values.
 *
 * Thin convenience wrapper: packs both values into a raw actor record and
 * delegates to the exported `parseActor`.
 *
 * @param from - Display name of the sender
 * @param fromId - Telegram ID of the sender
 * @returns Whatever `parseActor` returns for `{ from, fromId }`
 *
 * @example
 * ```typescript
 * const sender = parseMessageSender('Jane Doe', 'user987654321')
 * console.log(sender?.displayName) // 'Jane Doe'
 * console.log(sender?.id) // 987654321
 * ```
 */
const parseMessageSender = (from, fromId) => {
    const senderData = { from, fromId };
    return (0, exports.parseActor)(senderData);
};
exports.parseMessageSender = parseMessageSender;
/**
 * Parses the actor of a service message from the export's 'actor' and
 * 'actor_id' values.
 *
 * Thin convenience wrapper: packs both values into a raw actor record and
 * delegates to the exported `parseActor`.
 *
 * @param actor - Display name of the service actor
 * @param actorId - Telegram ID of the service actor
 * @returns Whatever `parseActor` returns for `{ actor, actorId }`
 *
 * @example
 * ```typescript
 * const serviceActor = parseServiceActor('Admin User', 'user123456789')
 * // Use for: "Admin User invited 3 users to the group"
 * ```
 */
const parseServiceActor = (actor, actorId) => {
    const serviceData = { actor, actorId };
    return (0, exports.parseActor)(serviceData);
};
exports.parseServiceActor = parseServiceActor;
/**
 * Parses the bot named in a message's 'via_bot' value.
 *
 * Thin convenience wrapper: wraps the value as `{ viaBot }` and delegates to
 * the exported `parseActor`.
 *
 * @param viaBot - Bot username (may include @ prefix)
 * @returns Whatever `parseActor` returns for `{ viaBot }`
 *
 * @example
 * ```typescript
 * const bot = parseViaBot('@helper_bot')
 * console.log(bot?.type) // 'bot'
 * console.log(bot?.username) // 'helper_bot'
 * ```
 */
const parseViaBot = (viaBot) => {
    const botData = { viaBot };
    return (0, exports.parseActor)(botData);
};
exports.parseViaBot = parseViaBot;
/**
 * Parses the author of a channel post from the 'author' value.
 *
 * The channel-author strategy of the shared extractor is invoked directly.
 * Going through the general `parseActor` pipeline would let the primary-sender
 * strategy claim the record as soon as `fromId` is present, losing the author
 * name, the 'channel_author' type and `authoredBy`.
 *
 * When `author` is empty the channel-author strategy yields no actor; in that
 * case the call falls back to `parseActor` with the same data, which is what
 * this function returned before for such input.
 *
 * @param author - Author name from channel post
 * @param fromId - Optional channel/author ID
 * @returns Channel author actor, or the general `parseActor` result when no author is given
 *
 * @example
 * ```typescript
 * const author = parseChannelAuthor('Content Creator', 'channel123456789')
 * console.log(author?.type) // 'channel_author'
 * console.log(author?.authoredBy) // 'Content Creator'
 * ```
 */
const parseChannelAuthor = (author, fromId) => {
    const data = { author, fromId };
    return extractor.extractChannelAuthor(data).actor ?? (0, exports.parseActor)(data);
};
exports.parseChannelAuthor = parseChannelAuthor;
/**
 * Parses an actor that is known only by username, e.g. from a mention.
 *
 * Thin convenience wrapper: packs the values into `{ username, userId }` and
 * delegates to the exported `parseActor`.
 *
 * @param username - Username (with or without @ prefix)
 * @param userId - Optional numeric user ID
 * @returns Whatever `parseActor` returns for `{ username, userId }`
 *
 * @example
 * ```typescript
 * const mentioned = parseUsernameMention('@john_doe', 123456789)
 * const usernameOnly = parseUsernameMention('jane_smith') // Lower confidence
 * ```
 */
const parseUsernameMention = (username, userId) => {
    const mentionData = { username, userId };
    return (0, exports.parseActor)(mentionData);
};
exports.parseUsernameMention = parseUsernameMention;
// =====================================================
// BATCH AND ADVANCED PROCESSING
// =====================================================
/**
 * Parses an array of raw actor records, keeping only the successful parses.
 *
 * Each record is run through the shared extractor. A record counts as failed,
 * and is left out of the result, when the extractor returns no actor or
 * reports the source 'fallback' (its placeholder for input in which no
 * strategy found usable data). A plain `actor !== null` check is not enough
 * here because the extractor substitutes that placeholder actor rather than
 * returning null.
 *
 * @param dataArray - Array of raw actor data objects
 * @returns Array of successfully parsed actors, in input order (failures are excluded)
 *
 * @example
 * ```typescript
 * const rawActors = [
 *   { from: 'User 1', fromId: 'user111' },
 *   { from: 'User 2', fromId: 'user222' },
 *   { from: '', fromId: '' }, // This will be filtered out
 * ]
 *
 * const actors = parseActors(rawActors)
 * console.log(actors.length) // 2 (invalid entry filtered out)
 * ```
 */
const parseActors = (dataArray) => {
    const parsed = [];
    for (const data of dataArray) {
        const { actor, source } = extractor.extract(data);
        if (actor !== null && source !== 'fallback') {
            parsed.push(actor);
        }
    }
    return parsed;
};
exports.parseActors = parseActors;
/**
 * Parses an actor and returns the complete extractor result.
 *
 * Unlike `parseActor`, nothing is stripped: the returned object carries the
 * `actor`, the numeric `confidence` and the `source` label naming the
 * strategy that produced it.
 *
 * @param data - Raw actor data from Telegram export
 * @returns The extractor's `{ actor, confidence, source }` result
 *
 * @example
 * ```typescript
 * const result = parseActorWithConfidence({
 *   from: 'John Doe',
 *   fromId: 'user123456789'
 * })
 *
 * console.log(`Actor: ${result.actor?.displayName}`)
 * console.log(`Confidence: ${result.confidence}%`)
 * console.log(`Strategy: ${result.source}`)
 * ```
 */
const parseActorWithConfidence = (data) => extractor.extract(data);
exports.parseActorWithConfidence = parseActorWithConfidence;
/**
 * Extracts the primary actor from a complete raw Telegram message.
 *
 * The message's snake_case fields are mapped onto one raw actor record and
 * parsed in a single `parseActor` call, so the extractor's own strategy order
 * decides which actor wins:
 *
 * 1. Message sender (from / from_id) - highest priority
 * 2. Service actor (actor / actor_id)
 * 3. Channel author (author)
 * 4. Via bot (via_bot) - lowest priority
 *
 * Parsing each field group separately and chaining the results with `??` does
 * not work: the sender parse yields a placeholder actor even when the message
 * has no sender fields, so the later candidates would never be reached.
 *
 * @param rawMessage - Raw message object from Telegram export
 * @returns The `parseActor` result for the combined actor fields of the message
 *
 * @example
 * ```typescript
 * // Regular user message
 * const actor1 = parseMessageActors({
 *   from: 'John Doe',
 *   from_id: 'user123456789'
 * }) // John Doe
 *
 * // Service message with actor
 * const actor2 = parseMessageActors({
 *   actor: 'Admin User',
 *   actor_id: 'user987654321'
 * }) // Admin User
 *
 * // Sender outranks via_bot
 * const actor3 = parseMessageActors({
 *   from: 'Bot Name',
 *   from_id: 'user555666777',
 *   via_bot: '@helper_bot'
 * }) // Bot Name
 * ```
 */
const parseMessageActors = (rawMessage) => {
    return (0, exports.parseActor)({
        from: rawMessage.from,
        fromId: rawMessage.from_id,
        actor: rawMessage.actor,
        actorId: rawMessage.actor_id,
        author: rawMessage.author,
        viaBot: rawMessage.via_bot
    });
};
exports.parseMessageActors = parseMessageActors;
//# sourceMappingURL=actors.js.map