/*
 * defuddle
 * Version:
 * Extract article content and metadata from web pages.
 * 350 lines • 14.9 kB
 * JavaScript
 */
;
// Flag the CommonJS exports object with `__esModule: true`.
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the export slot up front; the class is assigned to it at the end of the file.
exports.XOembedExtractor = void 0;
// Provides BaseExtractor, the base class XOembedExtractor extends.
const _base_1 = require("./_base");
// DOM helpers used below: parseHTML, serializeHTML and escapeHtml.
const dom_1 = require("../utils/dom");
/**
 * Extractor for X (Twitter) post and article URLs.
 *
 * Synchronous extraction is disabled (`canExtract()` returns false). All work
 * happens in `extractAsync()`, which tries the FxTwitter API first and falls
 * back to the Twitter oEmbed endpoint when FxTwitter yields nothing.
 *
 * The class reads `this.url` and `this.document`; neither is assigned here, so
 * they are presumably provided by `BaseExtractor` — confirm against `./_base`.
 */
class XOembedExtractor extends _base_1.BaseExtractor {
    /**
     * Synchronous extraction is never offered by this extractor.
     * @returns {boolean} Always `false`.
     */
    canExtract() {
        return false;
    }

    /**
     * Synchronous placeholder; real content comes from `extractAsync()`.
     * @returns {{content: string, contentHtml: string}} Empty content.
     */
    extract() {
        return {
            content: '',
            contentHtml: '',
        };
    }

    /**
     * @returns {boolean} True when `this.url` contains `/status/<digits>` or
     *   `/article/<digits>`.
     */
    canExtractAsync() {
        return /\/(status|article)\/\d+/.test(this.url);
    }

    /**
     * Extract the post, preferring FxTwitter and falling back to oEmbed.
     * @returns {Promise<object>} Result with `content`, `contentHtml` and `variables`.
     * @throws {Error} When the oEmbed fallback request fails.
     */
    async extractAsync() {
        // Try FxTwitter first — it has full tweet text and media
        const fxResult = await this.tryExtractFxTwitter();
        if (fxResult) {
            return fxResult;
        }
        // Fall back to oEmbed (truncates long tweets but always available)
        return this.extractOembed();
    }

    /**
     * Build a result from the publish.twitter.com oEmbed response.
     * @returns {Promise<object>} Result with `content`, `contentHtml` and `variables`.
     * @throws {Error} When the oEmbed endpoint answers with a non-OK status.
     */
    async extractOembed() {
        const { parseHTML, serializeHTML, escapeHtml } = dom_1;
        const oembedUrl = `https://publish.twitter.com/oembed?url=${encodeURIComponent(this.url)}&omit_script=true`;
        const response = await fetch(oembedUrl);
        if (!response.ok) {
            throw new Error(`oEmbed request failed: ${response.status}`);
        }
        const data = await response.json();

        // Parse the oEmbed HTML to extract tweet text
        const div = this.document.createElement('div');
        div.appendChild(parseHTML(this.document, data.html));

        // The oEmbed HTML contains a <blockquote> with <p> tags for text
        // and an <a> tag for the date
        const blockquote = div.querySelector('blockquote');
        const paragraphs = blockquote ? Array.from(blockquote.querySelectorAll('p')) : [];
        const tweetText = paragraphs
            .map(p => `<p>${serializeHTML(p)}</p>`)
            .join('\n');

        // Ignore empty path segments so a trailing slash on author_url does not
        // produce a bare "@".
        const screenName = data.author_url
            ? data.author_url.split('/').filter(Boolean).pop()
            : '';
        const handle = screenName ? `@${screenName}` : '';
        const authorName = data.author_name || '';

        // Only anchors that are direct children of the blockquote are
        // candidates for the date link. A descendant selector such as
        // 'a:last-child' would match a link, mention or hashtag that happens to
        // be the last element inside the tweet's <p> before it reaches the date.
        const directAnchors = blockquote
            ? Array.from(blockquote.children).filter(el => el.tagName.toLowerCase() === 'a')
            : [];
        const dateLink = directAnchors[directAnchors.length - 1];
        const dateText = dateLink?.textContent?.trim() || '';
        const permalink = dateLink?.getAttribute('href') || this.url;

        const escapedAuthorName = escapeHtml(authorName);
        const escapedHandle = escapeHtml(handle);
        const escapedDateText = escapeHtml(dateText);
        const escapedPermalink = escapeHtml(permalink);

        const dateHtml = dateText
            ? `<a href="${escapedPermalink}" class="tweet-date">${escapedDateText}</a>`
            : '';
        const textHtml = tweetText ? `<div class="tweet-text">${tweetText}</div>` : '';
        // Assembled line by line so source-code indentation never leaks into
        // the generated markup.
        const contentHtml = [
            '<div class="tweet-thread">',
            '<div class="main-tweet">',
            '<div class="tweet">',
            '<div class="tweet-header">',
            `<span class="tweet-author"><strong>${escapedAuthorName}</strong> <span class="tweet-handle">${escapedHandle}</span></span>`,
            dateHtml,
            '</div>',
            textHtml,
            '</div>',
            '</div>',
            '</div>',
        ].join('\n');

        return {
            content: contentHtml,
            contentHtml: contentHtml,
            variables: {
                title: `Post by ${handle || authorName}`,
                author: handle || authorName,
                site: 'X (Twitter)',
            }
        };
    }

    /**
     * Best-effort extraction through FxTwitter.
     * @returns {Promise<object|null>} A result object, or `null` when the URL
     *   does not match, the request fails, or the payload has neither an
     *   article nor tweet text.
     */
    async tryExtractFxTwitter() {
        // X handles are 1–15 characters of letters, digits and underscores and
        // may begin with any of them (e.g. "_name", "1name").
        const match = this.url.match(/\/([A-Za-z0-9_]{1,15})\/(status|article)\/(\d+)/);
        if (!match)
            return null;
        try {
            const data = await this.fetchFxTwitter(match[1], match[3]);
            // If it's an article, use the rich article renderer
            if (data.tweet?.article) {
                return this.buildArticleResult(data);
            }
            // Otherwise use the full tweet text from FxTwitter
            if (data.tweet?.text) {
                return this.buildTweetResult(data);
            }
            return null;
        }
        catch {
            // Deliberately swallowed: any failure here (network, unexpected
            // payload shape) makes the caller fall back to oEmbed.
            return null;
        }
    }

    /**
     * Fetch the FxTwitter JSON for one status.
     * @param {string} username - Screen name taken from the URL path.
     * @param {string} id - Numeric status id taken from the URL path.
     * @returns {Promise<object>} Parsed JSON body.
     * @throws {Error} When the response status is not OK.
     */
    async fetchFxTwitter(username, id) {
        const apiUrl = `https://api.fxtwitter.com/${username}/status/${id}`;
        const response = await fetch(apiUrl, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; Defuddle/1.0; +https://defuddle.md)',
            },
        });
        if (!response.ok) {
            throw new Error(`FxTwitter API request failed: ${response.status}`);
        }
        return response.json();
    }

    /**
     * Build the result for a payload that carries `tweet.article`.
     * @param {object} data - FxTwitter response; `tweet.article.content` is
     *   expected to hold `blocks` and `entityMap` (shape inferred from the
     *   fields read here).
     * @returns {object} Result with `content`, `contentHtml` and `variables`.
     */
    buildArticleResult(data) {
        const article = data.tweet.article;
        const { blocks, entityMap } = article.content;
        const contentHtml = this.renderArticle(blocks, entityMap, article.cover_media);
        const handle = `@${data.tweet.author.screen_name}`;
        return {
            content: contentHtml,
            contentHtml,
            variables: {
                title: article.title,
                author: handle,
                site: 'X (Twitter)',
                description: article.preview_text,
            }
        };
    }

    /**
     * Build the result for a plain tweet payload.
     * @param {object} data - FxTwitter response with a `tweet` object.
     * @returns {object} Result with `content`, `contentHtml` and `variables`.
     */
    buildTweetResult(data) {
        const tweet = data.tweet;
        const handle = `@${tweet.author.screen_name}`;
        const contentHtml = this.renderTweet(tweet);
        return {
            content: contentHtml,
            contentHtml,
            variables: {
                title: `Post by ${handle}`,
                author: handle,
                site: 'X (Twitter)',
            }
        };
    }

    /**
     * Render a tweet as HTML: header, paragraphs (with facets applied) and photos.
     * @param {object} tweet - Tweet object; reads `raw_text`, `text`,
     *   `media.photos` and `author`.
     * @returns {string} HTML markup for the tweet.
     */
    renderTweet(tweet) {
        const { escapeHtml } = dom_1;
        const text = tweet.raw_text?.text || tweet.text;
        // Filter out media facets — FxTwitter already strips pic.twitter.com
        // links from the text, so media facet indices are stale
        const facets = (tweet.raw_text?.facets || []).filter(f => f.type !== 'media');
        // Split text into paragraphs on double newlines
        const paragraphs = text.split(/\n\n+/);
        let offset = 0;
        const htmlParts = [];
        for (const para of paragraphs) {
            // Locate the paragraph in the full text so facet indices, which
            // are relative to the full text, can be mapped onto it.
            const paraStart = text.indexOf(para, offset);
            const paraEnd = paraStart + para.length;
            offset = paraEnd;
            // Check if this paragraph is a blockquote (starts with >)
            const trimmed = para.trimStart();
            const isBlockquote = trimmed.startsWith('>');
            const paraText = isBlockquote ? trimmed.slice(1).trimStart() : para;
            // Only leading characters are ever stripped, so paraText is a
            // suffix of para and its start follows from the shared end.
            const paraTextStart = paraEnd - paraText.length;
            // Apply facets within this paragraph
            const rendered = this.applyFacets(paraText, paraTextStart, paraEnd, facets);
            // Handle line breaks within paragraph
            const withBreaks = rendered.replace(/\n/g, '<br>');
            if (isBlockquote) {
                htmlParts.push(`<blockquote><p>${withBreaks}</p></blockquote>`);
            }
            else if (withBreaks.trim()) {
                htmlParts.push(`<p>${withBreaks}</p>`);
            }
        }
        // Append media images
        if (tweet.media?.photos) {
            for (const photo of tweet.media.photos) {
                htmlParts.push(`<img src="${escapeHtml(photo.url)}" alt="">`);
            }
        }
        const handle = escapeHtml(`@${tweet.author.screen_name}`);
        const authorName = escapeHtml(tweet.author.name);
        return `<div class="tweet-thread"><div class="main-tweet"><div class="tweet">` +
            `<div class="tweet-header"><span class="tweet-author"><strong>${authorName}</strong> <span class="tweet-handle">${handle}</span></span></div>` +
            `<div class="tweet-text">${htmlParts.join('\n')}</div>` +
            `</div></div></div>`;
    }

    /**
     * Interleave raw tag markers with HTML-escaped text.
     *
     * NOTE(review): ranges that cross each other, or that share both
     * endpoints, can still yield mis-nested tags; markers are emitted strictly
     * in offset order.
     *
     * @param {string} text - Plain text to escape.
     * @param {Array<{offset: number, type: string, tag: string}>} markers -
     *   Tags to insert at `offset` (index into `text`); `tag` is inserted verbatim.
     * @returns {string} Escaped text with the tags inserted.
     */
    applyMarkers(text, markers) {
        const { escapeHtml } = dom_1;
        if (markers.length === 0) {
            return escapeHtml(text);
        }
        // Sort a copy so the caller's array is left untouched. At equal
        // offsets a closing tag goes before an opening one.
        const ordered = [...markers].sort((a, b) => {
            if (a.offset !== b.offset)
                return a.offset - b.offset;
            if (a.type === 'close' && b.type === 'open')
                return -1;
            if (a.type === 'open' && b.type === 'close')
                return 1;
            return 0;
        });
        let result = '';
        let pos = 0;
        for (const marker of ordered) {
            // Clamp into [pos, text.length] so a negative or oversized offset
            // cannot make slice() read from the wrong end or repeat text.
            const at = Math.min(Math.max(marker.offset, pos), text.length);
            if (at > pos) {
                result += escapeHtml(text.slice(pos, at));
            }
            result += marker.tag;
            pos = at;
        }
        if (pos < text.length) {
            result += escapeHtml(text.slice(pos));
        }
        return result;
    }

    /**
     * Apply tweet facets (italic, mention, url) that overlap one paragraph.
     * @param {string} text - Paragraph text.
     * @param {number} textStart - Index of `text` within the full tweet text.
     * @param {number} textEnd - End index of the paragraph within the full tweet text.
     * @param {Array<object>} facets - Facets whose `indices` are `[start, end]`
     *   positions in the full tweet text.
     * @returns {string} Escaped HTML for the paragraph.
     */
    applyFacets(text, textStart, textEnd, facets) {
        const { escapeHtml } = dom_1;
        const markers = [];
        for (const facet of facets) {
            // A facet without usable indices cannot be placed; skip it rather
            // than abort rendering of the whole tweet.
            if (!Array.isArray(facet.indices))
                continue;
            const [fStart, fEnd] = facet.indices;
            if (fEnd <= textStart || fStart >= textEnd)
                continue;
            // Clip the facet to this paragraph and convert to local offsets.
            const relStart = Math.max(0, fStart - textStart);
            const relEnd = Math.min(text.length, fEnd - textStart);
            let openTag = '';
            let closeTag = '';
            if (facet.type === 'italic') {
                openTag = '<em>';
                closeTag = '</em>';
            }
            else if (facet.type === 'mention' && facet.text) {
                openTag = `<a href="https://x.com/${escapeHtml(facet.text)}">`;
                closeTag = '</a>';
            }
            else if (facet.type === 'url' && facet.original) {
                openTag = `<a href="${escapeHtml(facet.original)}">`;
                closeTag = '</a>';
            }
            else {
                continue;
            }
            markers.push({ offset: relStart, type: 'open', tag: openTag });
            markers.push({ offset: relEnd, type: 'close', tag: closeTag });
        }
        return this.applyMarkers(text, markers);
    }

    /**
     * Render article blocks into an `<article class="x-article">` element.
     * @param {Array<object>} blocks - Content blocks, each with a `type`.
     * @param {Array<{key: string, value: object}>} entityMap - Entities
     *   referenced by the blocks' entity ranges.
     * @param {object} [coverMedia] - Optional cover; `media_info.original_img_url` is used.
     * @returns {string} Article HTML.
     */
    renderArticle(blocks, entityMap, coverMedia) {
        const { escapeHtml } = dom_1;
        // List block types and the wrapper element each run is grouped into.
        const listTags = new Map([
            ['unordered-list-item', 'ul'],
            ['ordered-list-item', 'ol'],
        ]);
        const parts = [];
        // Add cover image if available
        if (coverMedia?.media_info?.original_img_url) {
            parts.push(`<img src="${escapeHtml(coverMedia.media_info.original_img_url)}" alt="Cover image">`);
        }
        let i = 0;
        while (i < blocks.length) {
            const block = blocks[i];
            const listTag = listTags.get(block.type);
            if (listTag) {
                // Group consecutive list items of the same kind into one list
                const listType = block.type;
                const items = [];
                while (i < blocks.length && blocks[i].type === listType) {
                    items.push(`<li>${this.renderInlineContent(blocks[i], entityMap)}</li>`);
                    i++;
                }
                parts.push(`<${listTag}>${items.join('')}</${listTag}>`);
                continue;
            }
            const html = this.renderBlock(block, entityMap);
            if (html) {
                parts.push(html);
            }
            i++;
        }
        return `<article class="x-article">${parts.join('')}</article>`;
    }

    /**
     * Render a single non-list block.
     * @param {object} block - Block with `type` and `text`.
     * @param {Array<{key: string, value: object}>} entityMap - Article entities.
     * @returns {string} HTML for the block, or `''` when there is nothing to render.
     */
    renderBlock(block, entityMap) {
        if (block.type === 'atomic') {
            return this.renderAtomicBlock(block, entityMap);
        }
        // Blank or missing text would only produce an empty element.
        if (!block.text || !block.text.trim()) {
            return '';
        }
        const inner = this.renderInlineContent(block, entityMap);
        switch (block.type) {
            case 'header-two':
                return `<h2>${inner}</h2>`;
            case 'header-three':
                return `<h3>${inner}</h3>`;
            case 'blockquote':
                return `<blockquote><p>${inner}</p></blockquote>`;
            default:
                // 'unstyled' and any unrecognized block type render as a paragraph.
                return `<p>${inner}</p>`;
        }
    }

    /**
     * Render an atomic block from the first entity it references.
     * @param {object} block - Atomic block; only `entityRanges[0]` is consulted.
     * @param {Array<{key: string, value: object}>} entityMap - Article entities.
     * @returns {string} HTML for MEDIA captions and MARKDOWN code, otherwise `''`.
     */
    renderAtomicBlock(block, entityMap) {
        const { escapeHtml } = dom_1;
        const ranges = block.entityRanges || [];
        if (ranges.length === 0)
            return '';
        const entityKey = String(ranges[0].key);
        const entityEntry = entityMap.find(e => e.key === entityKey);
        if (!entityEntry)
            return '';
        const entity = entityEntry.value;
        switch (entity.type) {
            case 'MEDIA': {
                // Only the caption is rendered for media entities.
                const caption = entity.data?.caption;
                if (caption) {
                    return `<figure><figcaption>${escapeHtml(caption)}</figcaption></figure>`;
                }
                return '';
            }
            case 'MARKDOWN': {
                const markdown = entity.data?.markdown || '';
                // Strip the wrapping ```...``` fences
                const codeMatch = markdown.match(/^```(\w*)\n([\s\S]*?)\n?```$/);
                if (codeMatch) {
                    const lang = codeMatch[1];
                    const code = codeMatch[2];
                    const langAttr = lang ? ` class="language-${escapeHtml(lang)}" data-lang="${escapeHtml(lang)}"` : '';
                    return `<pre><code${langAttr}>${escapeHtml(code)}</code></pre>`;
                }
                // No fence found: emit the raw markdown as code.
                return `<pre><code>${escapeHtml(markdown)}</code></pre>`;
            }
            default:
                return '';
        }
    }

    /**
     * Render a block's text with bold ranges, LINK entities, mentions and URLs.
     * @param {object} block - Block with `text` and optional `inlineStyleRanges`,
     *   `entityRanges`, `data.mentions` and `data.urls`.
     * @param {Array<{key: string, value: object}>} entityMap - Article entities.
     * @returns {string} Escaped inline HTML, or `''` when the block has no text.
     */
    renderInlineContent(block, entityMap) {
        const { escapeHtml } = dom_1;
        const text = block.text;
        if (!text)
            return '';
        const markers = [];
        const addRange = (start, end, openTag, closeTag) => {
            markers.push({ offset: start, type: 'open', tag: openTag });
            markers.push({ offset: end, type: 'close', tag: closeTag });
        };
        // Range lists are treated as optional so a block that omits them still renders.
        for (const range of block.inlineStyleRanges || []) {
            if (range.style === 'Bold') {
                addRange(range.offset, range.offset + range.length, '<strong>', '</strong>');
            }
        }
        for (const range of block.entityRanges || []) {
            const entityKey = String(range.key);
            const entityEntry = entityMap.find(e => e.key === entityKey);
            if (entityEntry?.value.type === 'LINK' && entityEntry.value.data.url) {
                const url = escapeHtml(entityEntry.value.data.url);
                addRange(range.offset, range.offset + range.length, `<a href="${url}">`, '</a>');
            }
        }
        if (block.data?.mentions) {
            for (const mention of block.data.mentions) {
                const url = `https://x.com/${escapeHtml(mention.text)}`;
                addRange(mention.fromIndex, mention.toIndex, `<a href="${url}">`, '</a>');
            }
        }
        if (block.data?.urls) {
            for (const urlData of block.data.urls) {
                const url = escapeHtml(urlData.text);
                addRange(urlData.fromIndex, urlData.toIndex, `<a href="${url}">`, '</a>');
            }
        }
        return this.applyMarkers(text, markers);
    }
}
// Publish the extractor class on the CommonJS exports object.
exports.XOembedExtractor = XOembedExtractor;
//# sourceMappingURL=x-oembed.js.map