defuddle
Version:
Extract article content and metadata from web pages.
202 lines • 8.61 kB
JavaScript
Object.defineProperty(exports, "__esModule", { value: true });
exports.TwitterExtractor = void 0;
const _base_1 = require("./_base");
class TwitterExtractor extends _base_1.BaseExtractor {
constructor(document, url) {
super(document, url);
this.mainTweet = null;
this.threadTweets = [];
// Get all tweets from the timeline
const timeline = document.querySelector('[aria-label="Timeline: Conversation"]');
if (!timeline) {
// Try to find a single tweet if not in timeline view
const singleTweet = document.querySelector('article[data-testid="tweet"]');
if (singleTweet) {
this.mainTweet = singleTweet;
}
return;
}
// Get all tweets before any section with "Discover more" or similar headings
const allTweets = Array.from(timeline.querySelectorAll('article[data-testid="tweet"]'));
const firstSection = timeline.querySelector('section, h2')?.parentElement;
if (firstSection) {
// Filter out tweets that appear after the first section
allTweets.forEach((tweet, index) => {
if (firstSection.compareDocumentPosition(tweet) & Node.DOCUMENT_POSITION_FOLLOWING) {
allTweets.splice(index);
return false;
}
});
}
// Set main tweet and thread tweets
this.mainTweet = allTweets[0] || null;
this.threadTweets = allTweets.slice(1);
}
canExtract() {
return !!this.mainTweet;
}
extract() {
const mainContent = this.extractTweet(this.mainTweet);
const threadContent = this.threadTweets.map(tweet => this.extractTweet(tweet)).join('\n<hr>\n');
const contentHtml = `
<div class="tweet-thread">
<div class="main-tweet">
${mainContent}
</div>
${threadContent ? `
<hr>
<div class="thread-tweets">
${threadContent}
</div>
` : ''}
</div>
`.trim();
const tweetId = this.getTweetId();
const tweetAuthor = this.getTweetAuthor();
const description = this.createDescription(this.mainTweet);
return {
content: contentHtml,
contentHtml: contentHtml,
extractedContent: {
tweetId,
tweetAuthor,
},
variables: {
title: `Thread by ${tweetAuthor}`,
author: tweetAuthor,
site: 'X (Twitter)',
description,
}
};
}
formatTweetText(text) {
if (!text)
return '';
// Create a temporary div to parse and clean the HTML
const tempDiv = document.createElement('div');
tempDiv.innerHTML = text;
// Convert links to plain text with @ handles
tempDiv.querySelectorAll('a').forEach(link => {
const handle = link.textContent?.trim() || '';
link.replaceWith(handle);
});
// Remove unnecessary spans and divs but keep their content
tempDiv.querySelectorAll('span, div').forEach(element => {
element.replaceWith(...Array.from(element.childNodes));
});
// Get cleaned text and split into paragraphs
const cleanText = tempDiv.innerHTML;
const paragraphs = cleanText.split('\n')
.map(line => line.trim())
.filter(line => line);
// Wrap each paragraph in <p> tags
return paragraphs.map(p => `<p>${p}</p>`).join('\n');
}
extractTweet(tweet) {
if (!tweet)
return '';
// Clone the tweet element to modify it
const tweetClone = tweet.cloneNode(true);
// Convert emoji images to text
tweetClone.querySelectorAll('img[src*="/emoji/"]').forEach(img => {
if (img.tagName.toLowerCase() === 'img' && img.getAttribute('alt')) {
const altText = img.getAttribute('alt');
if (altText) {
img.replaceWith(altText);
}
}
});
const tweetText = tweetClone.querySelector('[data-testid="tweetText"]')?.innerHTML || '';
const formattedText = this.formatTweetText(tweetText);
const images = this.extractImages(tweet);
// Get author info and date
const userInfo = this.extractUserInfo(tweet);
// Extract quoted tweet if present
const quotedTweet = tweet.querySelector('[aria-labelledby*="id__"]')?.querySelector('[data-testid="User-Name"]')?.closest('[aria-labelledby*="id__"]');
const quotedContent = quotedTweet ? this.extractTweet(quotedTweet) : '';
return `
<div class="tweet">
<div class="tweet-header">
<span class="tweet-author"><strong>${userInfo.fullName}</strong> <span class="tweet-handle">${userInfo.handle}</span></span>
${userInfo.date ? `<a href="${userInfo.permalink}" class="tweet-date">${userInfo.date}</a>` : ''}
</div>
${formattedText ? `<div class="tweet-text">${formattedText}</div>` : ''}
${images.length ? `
<div class="tweet-media">
${images.join('\n')}
</div>
` : ''}
${quotedContent ? `
<blockquote class="quoted-tweet">
${quotedContent}
</blockquote>
` : ''}
</div>
`.trim();
}
extractUserInfo(tweet) {
const nameElement = tweet.querySelector('[data-testid="User-Name"]');
if (!nameElement)
return { fullName: '', handle: '', date: '', permalink: '' };
// Try to get name and handle from links first (main tweet structure)
const links = nameElement.querySelectorAll('a');
let fullName = links?.[0]?.textContent?.trim() || '';
let handle = links?.[1]?.textContent?.trim() || '';
// If links don't have the info, try to get from spans (quoted tweet structure)
if (!fullName || !handle) {
fullName = nameElement.querySelector('span[style*="color: rgb(15, 20, 25)"] span')?.textContent?.trim() || '';
handle = nameElement.querySelector('span[style*="color: rgb(83, 100, 113)"]')?.textContent?.trim() || '';
}
const timestamp = tweet.querySelector('time');
const datetime = timestamp?.getAttribute('datetime') || '';
const date = datetime ? new Date(datetime).toISOString().split('T')[0] : '';
const permalink = timestamp?.closest('a')?.href || '';
return { fullName, handle, date, permalink };
}
extractImages(tweet) {
// Look for images in different containers
const imageContainers = [
'[data-testid="tweetPhoto"]',
'[data-testid="tweet-image"]',
'img[src*="media"]'
];
const images = [];
// Skip images that are inside quoted tweets
const quotedTweet = tweet.querySelector('[aria-labelledby*="id__"]')?.querySelector('[data-testid="User-Name"]')?.closest('[aria-labelledby*="id__"]');
for (const selector of imageContainers) {
const elements = tweet.querySelectorAll(selector);
elements.forEach(img => {
// Skip if the image is inside a quoted tweet
if (quotedTweet?.contains(img)) {
return;
}
// Check if element is an image by checking tag name and required properties
if (img.tagName.toLowerCase() === 'img' && img.getAttribute('alt')) {
const highQualitySrc = img.getAttribute('src')?.replace(/&name=\w+$/, '&name=large') || '';
const cleanAlt = img.getAttribute('alt')?.replace(/\s+/g, ' ').trim() || '';
images.push(`<img src="${highQualitySrc}" alt="${cleanAlt}" />`);
}
});
}
return images;
}
getTweetId() {
const match = this.url.match(/status\/(\d+)/);
return match?.[1] || '';
}
getTweetAuthor() {
const nameElement = this.mainTweet?.querySelector('[data-testid="User-Name"]');
const links = nameElement?.querySelectorAll('a');
const handle = links?.[1]?.textContent?.trim() || '';
return handle.startsWith('@') ? handle : `@${handle}`;
}
createDescription(tweet) {
if (!tweet)
return '';
const tweetText = tweet.querySelector('[data-testid="tweetText"]')?.textContent || '';
return tweetText.trim().slice(0, 140).replace(/\s+/g, ' ');
}
}
exports.TwitterExtractor = TwitterExtractor;
//# sourceMappingURL=twitter.js.map
;