/*
 * @stack.thefennec.dev/telegram-export-parser
 * Version:
 * TypeScript library for parsing Telegram Desktop's data export with full type safety
 * 297 lines • 11.9 kB
 * JavaScript
 */
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.processTextEntities = exports.cleanupMarkdown = exports.ENTITY_PARSERS = void 0;
const types_1 = require("../types");
const parser_1 = require("../core/parser");
const base_1 = require("./base");
const actors_1 = require("./actors");
// =====================================================
// UTILITIES - IMMUTABLE PURE FUNCTIONS
// =====================================================
/**
 * Builds a link target for an entity from its type and its visible text.
 *
 * Hashtag, cashtag and bot-command entities become `#<text>`, e-mails become
 * `mailto:` links, phone numbers become `tel:` links with all whitespace
 * removed, mentions become `https://t.me/<text>` and named mentions become
 * `tg://user?id=<text>`. Plain links and every unrecognised type return the
 * text unchanged.
 *
 * NOTE(review): the text is inserted verbatim, so any leading `#`, `/` or `@`
 * already present in it ends up inside the URL — confirm against real export
 * data.
 *
 * @param {string} type - Entity type, compared against TEXT_ENTITY_TYPES.
 * @param {string} text - Visible text of the entity.
 * @returns {string} The derived URL, or `text` itself as the fallback.
 */
const getUrlForType = (type, text) => {
    const kinds = types_1.TEXT_ENTITY_TYPES;
    if (type === kinds.HASHTAG || type === kinds.CASHTAG || type === kinds.BOT_COMMAND) {
        return `#${text}`;
    }
    if (type === kinds.EMAIL) {
        return `mailto:${text}`;
    }
    if (type === kinds.PHONE) {
        const compactNumber = text.replace(/\s+/g, '');
        return `tel:${compactNumber}`;
    }
    if (type === kinds.MENTION) {
        return `https://t.me/${text}`;
    }
    if (type === kinds.MENTION_NAME) {
        return `tg://user?id=${text}`;
    }
    // LINK and anything unrecognised: the text is already the target.
    return text;
};
/**
 * Escapes the five HTML-significant characters so that `text` can be placed
 * safely inside element content or a double/single-quoted attribute value.
 *
 * The ampersand is replaced first; otherwise the `&` introduced by the later
 * replacements would itself be escaped a second time.
 *
 * @param {string} text - Untrusted text to escape.
 * @returns {string} The text with `&`, `<`, `>`, `"` and `'` replaced by
 *   their HTML entities.
 */
const escapeHtml = (text) => text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
/**
 * Creates the fields shared by every parsed entity: the visible text plus two
 * zero-argument renderers that defer to the supplied converter.
 *
 * The converter is looked up when a renderer is called, not when the entity
 * is created, and it always receives the original raw entity.
 *
 * @param {object} raw - Raw entity; only `text` is read here.
 * @param {{toMarkdown: Function, toHTML: Function}} converter - Rendering
 *   callbacks, each invoked with `raw`.
 * @returns {{text: *, toMarkdown: Function, toHTML: Function}}
 */
const createBaseEntity = (raw, converter) => {
    const { text } = raw;
    return {
        text,
        toMarkdown: () => {
            return converter.toMarkdown(raw);
        },
        toHTML: () => {
            return converter.toHTML(raw);
        }
    };
};
// =====================================================
// TEXT ENTITY PARSERS
// =====================================================
// Parser for unformatted text runs. Markdown output is the text verbatim;
// HTML output is the escaped text.
// NOTE(review): unlike most sibling parsers, the parsed entity carries no
// `type` field — confirm whether that is intentional.
const plainTextEntityParser = (0, parser_1.createParser)({
    name: 'plain-text-entity',
    priority: 10, // Lowest priority - fallback
    canHandle: (raw) => {
        return raw.type === types_1.TEXT_ENTITY_TYPES.PLAIN;
    },
    parse: (raw) => {
        const passthroughConverter = {
            toMarkdown: ({ text }) => text,
            toHTML: ({ text }) => escapeHtml(text)
        };
        return createBaseEntity(raw, passthroughConverter);
    }
});
// Parser for bold entities: `**text**` in Markdown, `<strong>` in HTML.
const boldTextEntityParser = (0, parser_1.createParser)({
    name: 'bold-text-entity',
    priority: 80,
    canHandle: (raw) => {
        return raw.type === types_1.TEXT_ENTITY_TYPES.BOLD;
    },
    parse: (raw) => {
        const base = createBaseEntity(raw, {
            toMarkdown: ({ text }) => `**${text}**`,
            toHTML: ({ text }) => `<strong>${escapeHtml(text)}</strong>`
        });
        return { ...base, type: types_1.TEXT_ENTITY_TYPES.BOLD };
    }
});
// Parser for italic entities: `*text*` in Markdown, `<em>` in HTML.
const italicTextEntityParser = (0, parser_1.createParser)({
    name: 'italic-text-entity',
    priority: 75,
    canHandle: (raw) => {
        return raw.type === types_1.TEXT_ENTITY_TYPES.ITALIC;
    },
    parse: (raw) => {
        const base = createBaseEntity(raw, {
            toMarkdown: ({ text }) => `*${text}*`,
            toHTML: ({ text }) => `<em>${escapeHtml(text)}</em>`
        });
        return { ...base, type: types_1.TEXT_ENTITY_TYPES.ITALIC };
    }
});
// Parser for underlined entities: `__text__` in Markdown, `<u>` in HTML.
const underlineTextEntityParser = (0, parser_1.createParser)({
    name: 'underline-text-entity',
    priority: 70,
    canHandle: (raw) => {
        return raw.type === types_1.TEXT_ENTITY_TYPES.UNDERLINE;
    },
    parse: (raw) => {
        const base = createBaseEntity(raw, {
            toMarkdown: ({ text }) => `__${text}__`,
            toHTML: ({ text }) => `<u>${escapeHtml(text)}</u>`
        });
        return { ...base, type: types_1.TEXT_ENTITY_TYPES.UNDERLINE };
    }
});
// Parser for struck-through entities: `~~text~~` in Markdown, `<del>` in HTML.
const strikethroughTextEntityParser = (0, parser_1.createParser)({
    name: 'strikethrough-text-entity',
    priority: 65,
    canHandle: (raw) => {
        return raw.type === types_1.TEXT_ENTITY_TYPES.STRIKETHROUGH;
    },
    parse: (raw) => {
        const base = createBaseEntity(raw, {
            toMarkdown: ({ text }) => `~~${text}~~`,
            toHTML: ({ text }) => `<del>${escapeHtml(text)}</del>`
        });
        return { ...base, type: types_1.TEXT_ENTITY_TYPES.STRIKETHROUGH };
    }
});
// Parser for spoiler entities: `||text||` in Markdown, a
// `<span class="spoiler">` in HTML.
// NOTE(review): the parsed entity carries no `type` field, unlike the bold,
// italic, underline and strikethrough parsers — confirm whether that is
// intentional.
const spoilerTextEntityParser = (0, parser_1.createParser)({
    name: 'spoiler-text-entity',
    priority: 60,
    canHandle: (raw) => {
        return raw.type === types_1.TEXT_ENTITY_TYPES.SPOILER;
    },
    parse: (raw) => {
        const spoilerConverter = {
            toMarkdown: ({ text }) => `||${text}||`,
            toHTML: ({ text }) => `<span class="spoiler">${escapeHtml(text)}</span>`
        };
        return createBaseEntity(raw, spoilerConverter);
    }
});
// Parser for inline code entities: backtick-wrapped in Markdown, `<code>` in
// HTML. The raw `language` value is copied onto the parsed entity.
const codeTextEntityParser = (0, parser_1.createParser)({
    name: 'code-text-entity',
    priority: 85,
    canHandle: (raw) => {
        return raw.type === types_1.TEXT_ENTITY_TYPES.CODE;
    },
    parse: (raw) => {
        const base = createBaseEntity(raw, {
            toMarkdown: ({ text }) => `\`${text}\``,
            toHTML: ({ text }) => `<code>${escapeHtml(text)}</code>`
        });
        return {
            ...base,
            type: types_1.TEXT_ENTITY_TYPES.CODE,
            language: raw.language
        };
    }
});
// Parser for preformatted (code block) entities.
//
// Markdown output is a fenced block, with the trimmed language after the
// opening fence when one is present. HTML output is `<pre><code>`, with a
// `language-<lang>` class when a language is present.
//
// The language comes from the exported data, so it is HTML-escaped before
// being placed in the `class` attribute; otherwise a value containing `"`
// could break out of the attribute and inject markup.
const preTextEntityParser = (0, parser_1.createParser)({
    name: 'pre-text-entity',
    priority: 90,
    canHandle: (raw) => raw.type === types_1.TEXT_ENTITY_TYPES.PRE,
    parse: (raw) => ({
        ...createBaseEntity(raw, {
            toMarkdown: (entity) => {
                const language = entity.language?.trim() ?? '';
                return language
                    ? `\`\`\`${language}\n${entity.text}\n\`\`\``
                    : `\`\`\`\n${entity.text}\n\`\`\``;
            },
            toHTML: (entity) => {
                const language = entity.language?.trim() ?? '';
                // Escape the attribute value: `language` is untrusted input.
                const className = language
                    ? ` class="language-${escapeHtml(language)}"`
                    : '';
                return `<pre><code${className}>${escapeHtml(entity.text)}</code></pre>`;
            }
        }),
        type: types_1.TEXT_ENTITY_TYPES.PRE,
        language: raw.language
    })
});
// Parser for custom emoji entities.
//
// When the raw entity has a truthy `document_id`, Markdown output appends it
// in an HTML comment and HTML output wraps the text in a
// `<span class="custom-emoji">` carrying the id as `data-document-id`.
// Without one, both renderers fall back to the (escaped, for HTML) text.
// The parsed entity also gets `documentURL`, produced by `parseExportedFile`
// from the raw `document_id`.
const customEmojiEntityParser = (0, parser_1.createParser)({
    name: 'custom-emoji-entity',
    priority: 55,
    canHandle: (raw) => {
        return raw.type === types_1.TEXT_ENTITY_TYPES.CUSTOM_EMOJI;
    },
    parse: (raw) => {
        const emojiConverter = {
            toMarkdown: (entity) => {
                if (!entity.document_id) {
                    return entity.text;
                }
                return `${entity.text} <!-- custom emoji: ${entity.document_id} -->`;
            },
            toHTML: (entity) => {
                if (!entity.document_id) {
                    return escapeHtml(entity.text);
                }
                return `<span class="custom-emoji" data-document-id="${escapeHtml(entity.document_id)}">${escapeHtml(entity.text)}</span>`;
            }
        };
        const base = createBaseEntity(raw, emojiConverter);
        return {
            ...base,
            type: types_1.TEXT_ENTITY_TYPES.CUSTOM_EMOJI,
            documentURL: (0, base_1.parseExportedFile)(raw.document_id)
        };
    }
});
// Parser for blockquote entities.
//
// Markdown output prefixes every line with `> `; for a collapsed quote the
// first line is prefixed with `**> ` instead. HTML output is a
// `<blockquote>` with class `blockquote`, plus `collapsed` when applicable.
// The parsed entity exposes `collapsed`, defaulting to false when the raw
// value is null or undefined.
const blockquoteEntityParser = (0, parser_1.createParser)({
    name: 'blockquote-entity',
    priority: 50,
    canHandle: (raw) => {
        return raw.type === types_1.TEXT_ENTITY_TYPES.BLOCKQUOTE;
    },
    parse: (raw) => {
        const quoteConverter = {
            toMarkdown: (entity) => {
                const [firstLine, ...remainingLines] = entity.text.split('\n');
                const firstPrefix = entity.collapsed ? '**> ' : '> ';
                const rendered = [`${firstPrefix}${firstLine}`];
                for (const line of remainingLines) {
                    rendered.push(`> ${line}`);
                }
                return rendered.join('\n');
            },
            toHTML: (entity) => {
                let className = 'blockquote';
                if (entity.collapsed) {
                    className = 'blockquote collapsed';
                }
                return `<blockquote class="${className}">${escapeHtml(entity.text)}</blockquote>`;
            }
        };
        const base = createBaseEntity(raw, quoteConverter);
        return {
            ...base,
            type: types_1.TEXT_ENTITY_TYPES.BLOCKQUOTE,
            collapsed: raw.collapsed ?? false
        };
    }
});
// Parser for bank card entities.
//
// Both renderers insert a space after every run of four digits that is
// followed by another digit, then wrap the result as inline code
// (backticks in Markdown, `<code class="bank-card">` in HTML).
const bankCardEntityParser = (0, parser_1.createParser)({
    name: 'bank-card-entity',
    priority: 45,
    canHandle: (raw) => {
        return raw.type === types_1.TEXT_ENTITY_TYPES.BANK_CARD;
    },
    parse: (raw) => {
        // Shared by both renderers so the grouping rule lives in one place.
        const groupDigits = (cardText) => cardText.replace(/(\d{4})(?=\d)/g, '$1 ');
        const base = createBaseEntity(raw, {
            toMarkdown: (entity) => `\`${groupDigits(entity.text)}\``,
            toHTML: (entity) => `<code class="bank-card">${escapeHtml(groupDigits(entity.text))}</code>`
        });
        return { ...base, type: types_1.TEXT_ENTITY_TYPES.BANK_CARD };
    }
});
// =====================================================
// LINK ENTITY PARSERS
// =====================================================
// Converter shared by all link-like entities.
//
// The target is the entity's `href` when it is neither null nor undefined;
// otherwise it is derived from the entity type and text via `getUrlForType`.
// Markdown output is `[text](target)` with no escaping; HTML output is an
// `<a>` element with both the target and the text escaped.
const linkConverter = {
    toMarkdown: (entity) => {
        const target = entity.href ?? getUrlForType(entity.type, entity.text);
        const label = entity.text;
        return `[${label}](${target})`;
    },
    toHTML: (entity) => {
        const target = entity.href ?? getUrlForType(entity.type, entity.text);
        const safeTarget = escapeHtml(target);
        const safeLabel = escapeHtml(entity.text);
        return `<a href="${safeTarget}">${safeLabel}</a>`;
    }
};
/**
 * Factory for parsers of link-like entities.
 *
 * The resulting parser accepts raw entities whose `type` equals `type`. The
 * parsed entity is rendered through `linkConverter`, keeps the raw `type`,
 * and exposes `url`: the raw `href` when it is neither null nor undefined,
 * otherwise the value derived by `getUrlForType`.
 *
 * @param {string} name - Parser name passed to `createParser`.
 * @param {string} type - Entity type this parser handles.
 * @param {number} priority - Parser priority passed to `createParser`.
 * @returns {*} Whatever `createParser` returns for this definition.
 */
const createLinkEntityParser = (name, type, priority) => {
    const matchesType = (raw) => raw.type === type;
    const parseLink = (raw) => {
        const base = createBaseEntity(raw, linkConverter);
        return {
            ...base,
            type: raw.type,
            url: raw.href ?? getUrlForType(raw.type, raw.text)
        };
    };
    return (0, parser_1.createParser)({
        name,
        priority,
        canHandle: matchesType,
        parse: parseLink
    });
};
// One parser per link-like entity type; each call passes the parser name,
// the handled entity type and the priority.
const textLinkEntityParser = createLinkEntityParser(
    'text-link-entity',
    types_1.TEXT_ENTITY_TYPES.TEXT_LINK,
    95
);
const linkEntityParser = createLinkEntityParser(
    'link-entity',
    types_1.TEXT_ENTITY_TYPES.LINK,
    40
);
const hashtagEntityParser = createLinkEntityParser(
    'hashtag-entity',
    types_1.TEXT_ENTITY_TYPES.HASHTAG,
    35
);
const cashtagEntityParser = createLinkEntityParser(
    'cashtag-entity',
    types_1.TEXT_ENTITY_TYPES.CASHTAG,
    30
);
const botCommandEntityParser = createLinkEntityParser(
    'bot-command-entity',
    types_1.TEXT_ENTITY_TYPES.BOT_COMMAND,
    25
);
const emailEntityParser = createLinkEntityParser(
    'email-entity',
    types_1.TEXT_ENTITY_TYPES.EMAIL,
    20
);
const phoneEntityParser = createLinkEntityParser(
    'phone-entity',
    types_1.TEXT_ENTITY_TYPES.PHONE,
    15
);
// =====================================================
// MENTION ENTITY PARSERS
// =====================================================
/**
 * Factory for parsers of mention entities.
 *
 * The resulting parser accepts raw entities whose `type` equals `type`. The
 * parsed entity is rendered through `linkConverter`, keeps the raw `type`,
 * and exposes `mention`: the result of `parseUsernameMention` called with the
 * raw `text` and `user_id`.
 *
 * @param {string} name - Parser name passed to `createParser`.
 * @param {string} type - Entity type this parser handles.
 * @param {number} priority - Parser priority passed to `createParser`.
 * @returns {*} Whatever `createParser` returns for this definition.
 */
const createMentionEntityParser = (name, type, priority) => {
    const matchesType = (raw) => raw.type === type;
    const parseMention = (raw) => {
        const base = createBaseEntity(raw, linkConverter);
        return {
            ...base,
            type: raw.type,
            mention: (0, actors_1.parseUsernameMention)(raw.text, raw.user_id)
        };
    };
    return (0, parser_1.createParser)({
        name,
        priority,
        canHandle: matchesType,
        parse: parseMention
    });
};
// One parser per mention entity type; each call passes the parser name, the
// handled entity type and the priority.
const mentionEntityParser = createMentionEntityParser(
    'mention-entity',
    types_1.TEXT_ENTITY_TYPES.MENTION,
    85
);
const mentionNameEntityParser = createMentionEntityParser(
    'mention-name-entity',
    types_1.TEXT_ENTITY_TYPES.MENTION_NAME,
    80
);
// =====================================================
// EXPORTS
// =====================================================
exports.ENTITY_PARSERS = [
// High priority - specific formatting
textLinkEntityParser,
preTextEntityParser,
mentionEntityParser,
mentionNameEntityParser,
codeTextEntityParser,
boldTextEntityParser,
italicTextEntityParser,
underlineTextEntityParser,
strikethroughTextEntityParser,
spoilerTextEntityParser,
customEmojiEntityParser,
blockquoteEntityParser,
bankCardEntityParser,
// Medium priority - links and commands
linkEntityParser,
hashtagEntityParser,
cashtagEntityParser,
botCommandEntityParser,
emailEntityParser,
phoneEntityParser,
// Lowest priority - fallback
plainTextEntityParser
];
// =====================================================
// UTILITY FUNCTIONS FOR PROCESSING
// =====================================================
/**
 * Normalises whitespace in generated Markdown.
 *
 * Outside fenced code blocks it collapses runs of spaces/tabs to one space,
 * caps runs of four or more newlines at three, tightens whitespace next to
 * backticks and asterisks, trims every line and trims the whole result.
 *
 * Fenced code blocks (text between a pair of triple backticks) are kept
 * verbatim: whitespace and `*` are significant inside code, and the
 * normalisation rules would otherwise flatten indentation, pull the fences
 * onto neighbouring lines and rewrite expressions such as `a * b`.
 *
 * NOTE(review): the asterisk rules also remove whitespace on the *outside*
 * of emphasis markers (`a **b** c` becomes `a**b**c`). That behaviour is
 * unchanged here — confirm whether it is intended.
 *
 * @param {string} text - Markdown produced by joining entity renderings.
 * @returns {string} The normalised Markdown.
 */
const cleanupMarkdown = (text) => {
    // Choose a private-use character that does not occur in the input, so
    // fence placeholders can never collide with real content.
    let sentinel = null;
    for (let code = 0xE000; code <= 0xF8FF && sentinel === null; code += 1) {
        const candidate = String.fromCharCode(code);
        if (!text.includes(candidate)) {
            sentinel = candidate;
        }
    }
    // Swap each fenced block for `<sentinel><index><sentinel>`; none of the
    // rules below touch the sentinel or digits. If no sentinel is available
    // the text is processed unmasked, exactly as before.
    const fences = [];
    const masked = sentinel === null
        ? text
        : text.replace(/```[\s\S]*?```/g, (fence) => {
            fences.push(fence);
            return `${sentinel}${fences.length - 1}${sentinel}`;
        });
    const cleaned = masked
        .replace(/[ \t]+/g, ' ')
        .replace(/\n{4,}/g, '\n\n\n')
        .replace(/`\s+/g, '` ')
        .replace(/\s+`/g, ' `')
        .replace(/\*\*\s+/g, '**')
        .replace(/\s+\*\*/g, '**')
        .replace(/\*\s+/g, '*')
        .replace(/\s+\*/g, '*')
        .split('\n')
        .map(line => line.trim())
        .join('\n')
        .trim();
    if (fences.length === 0) {
        return cleaned;
    }
    // Splitting on the sentinel yields text at even positions and fence
    // indices at odd positions.
    return cleaned
        .split(sentinel)
        .map((part, position) => (position % 2 === 1 ? fences[Number(part)] : part))
        .join('');
};
exports.cleanupMarkdown = cleanupMarkdown;
/**
 * Renders a list of parsed entities to both output formats.
 *
 * Markdown is the concatenation of every entity's `toMarkdown()` result,
 * passed through the exported `cleanupMarkdown`. HTML is the plain
 * concatenation of every entity's `toHTML()` result.
 *
 * @param {Array<{toMarkdown: Function, toHTML: Function}>} entities
 * @returns {{markdown: string, html: string}}
 */
const processTextEntities = (entities) => {
    // Resolve through `exports` so the currently exported function is used.
    const cleanup = exports.cleanupMarkdown;
    const markdownPieces = entities.map((entity) => entity.toMarkdown());
    const markdown = cleanup(markdownPieces.join(''));
    const htmlPieces = entities.map((entity) => entity.toHTML());
    const html = htmlPieces.join('');
    return { markdown, html };
};
exports.processTextEntities = processTextEntities;
//# sourceMappingURL=text-entities.js.map