// defuddle
// Version:
// Extract article content and metadata from web pages.
// 267 lines • 11.7 kB
// JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.XArticleExtractor = void 0;
const _base_1 = require("./_base");
const dom_1 = require("../utils/dom");
/**
 * CSS selectors used to locate parts of an X (Twitter) article page.
 *
 * Frozen so that an accidental assignment cannot silently change every
 * subsequent DOM query made through this table.
 */
const SELECTORS = Object.freeze({
    // article root and title
    ARTICLE_CONTAINER: '[data-testid="twitterArticleRichTextView"]',
    TITLE: '[data-testid="twitter-article-title"]',
    // author microdata: container plus the <meta> tags read inside it
    AUTHOR: '[itemprop="author"]',
    AUTHOR_NAME: 'meta[itemprop="name"]',
    AUTHOR_HANDLE: 'meta[itemprop="additionalName"]',
    // images rendered inside a tweetPhoto wrapper
    IMAGES: '[data-testid="tweetPhoto"] img',
    // Draft.js editor markup: paragraph blocks, bold runs, bookkeeping attributes
    DRAFT_PARAGRAPHS: '.longform-unstyled, .public-DraftStyleDefault-block',
    BOLD_SPANS: 'span[style*="font-weight: bold"]',
    DRAFT_ATTRIBUTES: '[data-offset-key]',
    // tweets embedded in the article body
    EMBEDDED_TWEET: '[data-testid="simpleTweet"]',
    TWEET_TEXT: '[data-testid="tweetText"]',
    USER_NAME: '[data-testid="User-Name"]',
    // fenced code blocks
    CODE_BLOCK: '[data-testid="markdown-code-block"]',
    // NOTE(review): not referenced by the code visible in this file — confirm before removing
    HEADER_BLOCK: '[data-testid="longform-header"]',
});
/**
 * Extractor for long-form X (Twitter) articles.
 *
 * Locates the article container in the page, clones it, rewrites the
 * editor-specific markup (embedded tweets, code blocks, headers, images,
 * Draft.js paragraphs) into plain HTML, and returns it together with
 * title / author / description variables.
 *
 * `this.document` and `this.url` are read throughout; they are presumably
 * assigned by the BaseExtractor constructor — confirm against `./_base`.
 */
class XArticleExtractor extends _base_1.BaseExtractor {
    /**
     * @param document page document to extract from
     * @param url page URL
     * @param schemaOrgData forwarded unchanged to the base extractor
     */
    constructor(document, url, schemaOrgData) {
        super(document, url, schemaOrgData);
        this.articleContainer = document.querySelector(SELECTORS.ARTICLE_CONTAINER);
    }

    /**
     * @returns {boolean} true when the article container was found in the page
     */
    canExtract() {
        return !!this.articleContainer;
    }

    /**
     * Run the full extraction.
     *
     * @returns {object} `content` / `contentHtml` (the same cleaned HTML string),
     *   `extractedContent.articleId`, and `variables` (title, author, site, description)
     */
    extract() {
        const title = this.extractTitle();
        const author = this.extractAuthor();
        const contentHtml = this.extractContent();
        const description = this.createDescription();
        return {
            content: contentHtml,
            contentHtml,
            extractedContent: {
                articleId: this.getArticleId(),
            },
            variables: {
                title,
                author,
                site: 'X (Twitter)',
                description,
            }
        };
    }

    /**
     * @returns {string} trimmed text of the title element, or a placeholder when absent/empty
     */
    extractTitle() {
        const titleEl = this.document.querySelector(SELECTORS.TITLE);
        return titleEl?.textContent?.trim() || 'Untitled X Article';
    }

    /**
     * Read the author from the page's author microdata, falling back to the URL
     * (and from there to the og:title) when the microdata is missing.
     *
     * @returns {string} "Name (@handle)" when both are present, otherwise whichever
     *   single value is available, otherwise the URL/og:title fallback
     */
    extractAuthor() {
        const authorContainer = this.document.querySelector(SELECTORS.AUTHOR);
        if (!authorContainer)
            return this.getAuthorFromUrl();
        const name = authorContainer.querySelector(SELECTORS.AUTHOR_NAME)?.getAttribute('content');
        const handle = authorContainer.querySelector(SELECTORS.AUTHOR_HANDLE)?.getAttribute('content');
        if (name && handle)
            return `${name} (@${handle})`;
        return name || handle || this.getAuthorFromUrl();
    }

    /**
     * Derive "@username" from the path segment that precedes "/article/<id>".
     *
     * @returns {string} "@username", or the og:title fallback when the URL has no
     *   usable username segment
     */
    getAuthorFromUrl() {
        // match username before /article/
        const match = this.url.match(/\/([a-zA-Z][a-zA-Z0-9_]{0,14})\/article\/\d+/);
        // the pattern also matches the one-letter system segment in "/i/article/…",
        // which is not a username, so it must be rejected explicitly
        if (match && match[1].toLowerCase() !== 'i') {
            return `@${match[1]}`;
        }
        return this.getAuthorFromOgTitle();
    }

    /**
     * Last-resort author lookup from the og:title meta tag.
     *
     * @returns {string} the name preceding " on X:", or 'Unknown'
     */
    getAuthorFromOgTitle() {
        const ogTitle = this.document.querySelector('meta[property="og:title"]')?.getAttribute('content') || '';
        // Match patterns like "(4) Heinrich on X: ..." or "Heinrich on X: ..."
        const match = ogTitle.match(/^(?:\(\d+\)\s+)?(.+?)\s+on\s+X\s*:/);
        return match ? match[1].trim() : 'Unknown';
    }

    /**
     * @returns {string} the digits following "article/" in the URL, or '' when absent
     */
    getArticleId() {
        const match = this.url.match(/article\/(\d+)/);
        return match ? match[1] : '';
    }

    /**
     * Clone the article container, clean the clone, and serialize it.
     * The live page DOM is not modified.
     *
     * @returns {string} cleaned HTML wrapped in `<article class="x-article">`, or ''
     *   when there is no article container
     */
    extractContent() {
        if (!this.articleContainer)
            return '';
        const clone = this.articleContainer.cloneNode(true);
        this.cleanContent(clone);
        return `<article class="x-article">${(0, dom_1.serializeHTML)(clone)}</article>`;
    }

    /**
     * Apply every clean-up pass to `container` in place.
     *
     * @param container element to rewrite (normally the cloned article container)
     */
    cleanContent(container) {
        const ownerDoc = container.ownerDocument || this.document;
        // convert complex elements first (before other transformations)
        this.convertEmbeddedTweets(container, ownerDoc);
        this.convertCodeBlocks(container, ownerDoc);
        this.convertHeaders(container, ownerDoc);
        this.unwrapLinkedImages(container, ownerDoc);
        this.upgradeImageQuality(container);
        // convert bold spans BEFORE paragraphs so formatting is preserved
        this.convertBoldSpans(container, ownerDoc);
        this.convertDraftParagraphs(container, ownerDoc);
        this.removeDraftAttributes(container);
    }

    /**
     * Replace each embedded tweet with `<blockquote class="embedded-tweet">`
     * containing an optional `<cite>` (author) and an optional `<p>` (tweet text).
     *
     * @param container element to rewrite in place
     * @param ownerDoc document used to create the replacement elements
     */
    convertEmbeddedTweets(container, ownerDoc) {
        container.querySelectorAll(SELECTORS.EMBEDDED_TWEET).forEach(tweet => {
            const blockquote = ownerDoc.createElement('blockquote');
            blockquote.className = 'embedded-tweet';
            // extract author info: first link is read as the display name, second as the handle
            const userNameEl = tweet.querySelector(SELECTORS.USER_NAME);
            const authorLinks = userNameEl?.querySelectorAll('a');
            const fullName = authorLinks?.[0]?.textContent?.trim() || '';
            const handle = authorLinks?.[1]?.textContent?.trim() || '';
            // extract tweet text
            const tweetTextEl = tweet.querySelector(SELECTORS.TWEET_TEXT);
            const tweetText = tweetTextEl?.textContent?.trim() || '';
            // build clean blockquote content
            if (fullName || handle) {
                const cite = ownerDoc.createElement('cite');
                // join only the parts that exist so a missing name leaves no leading space
                cite.textContent = [fullName, handle].filter(Boolean).join(' ');
                blockquote.appendChild(cite);
            }
            if (tweetText) {
                const p = ownerDoc.createElement('p');
                p.textContent = tweetText;
                blockquote.appendChild(p);
            }
            tweet.replaceWith(blockquote);
        });
    }

    /**
     * Replace each code block wrapper with a bare `<pre><code>` pair that holds
     * only the code text, tagging the `<code>` with the language when known.
     * Wrappers lacking either a `<pre>` or a `<code>` are left untouched.
     *
     * @param container element to rewrite in place
     * @param ownerDoc document used to create the replacement elements
     */
    convertCodeBlocks(container, ownerDoc) {
        container.querySelectorAll(SELECTORS.CODE_BLOCK).forEach(block => {
            const pre = block.querySelector('pre');
            const code = block.querySelector('code');
            if (!pre || !code)
                return;
            // extract language from class (e.g., "language-bash") or from span
            let language = '';
            const langClass = code.className.match(/language-(\w+)/);
            if (langClass) {
                language = langClass[1];
            }
            else {
                // fallback: look for a language label in the block header; spans inside
                // the code itself are skipped so a highlighted token is never taken
                // for the language name
                const langSpan = Array.from(block.querySelectorAll('span'))
                    .find(span => !pre.contains(span) && !code.contains(span));
                language = langSpan?.textContent?.trim() || '';
            }
            // create clean pre/code structure
            const newPre = ownerDoc.createElement('pre');
            const newCode = ownerDoc.createElement('code');
            if (language) {
                newCode.setAttribute('data-lang', language);
                newCode.className = `language-${language}`;
            }
            newCode.textContent = code.textContent || '';
            newPre.appendChild(newCode);
            // replace the entire block container
            block.replaceWith(newPre);
        });
    }

    /**
     * Flatten every non-empty h1–h6 to a fresh heading of the same level that
     * contains only its trimmed text. Headings with no text are left as they are.
     *
     * @param container element to rewrite in place
     * @param ownerDoc document used to create the replacement elements
     */
    convertHeaders(container, ownerDoc) {
        // X articles use h2/h3 elements but content may be nested in spans/divs
        container.querySelectorAll('h1, h2, h3, h4, h5, h6').forEach(header => {
            const level = header.tagName.toLowerCase();
            const text = header.textContent?.trim() || '';
            if (!text)
                return;
            const newHeader = ownerDoc.createElement(level);
            newHeader.textContent = text;
            header.replaceWith(newHeader);
        });
    }

    /**
     * Return `src` with its `name` query parameter set to `large`.
     *
     * Handles `name` in any query position (`?name=` as well as `&name=`), appends
     * the parameter when it is missing, and is idempotent, so running it twice on
     * the same URL does not add a duplicate parameter. An empty `src` is returned
     * unchanged rather than turned into a bare query string.
     *
     * @param {string} src image URL
     * @returns {string} upgraded URL
     */
    upgradeImageSrc(src) {
        if (!src)
            return src;
        if (/[?&]name=/.test(src)) {
            return src.replace(/([?&])name=\w*/, '$1name=large');
        }
        const separator = src.includes('?') ? '&' : '?';
        return `${src}${separator}name=large`;
    }

    /**
     * Replace every anchor that wraps a tweetPhoto image with a clean `<img>`
     * carrying only `src` (upgraded to the large variant) and a normalized `alt`.
     *
     * @param container element to rewrite in place
     * @param ownerDoc document used to create the replacement elements
     */
    unwrapLinkedImages(container, ownerDoc) {
        // find all tweetPhoto images and extract them from any ancestor anchors
        container.querySelectorAll(SELECTORS.IMAGES).forEach(img => {
            // find closest anchor ancestor; it must still be attached inside the
            // container (an earlier iteration may already have replaced it)
            const anchor = img.closest('a');
            if (!anchor || !container.contains(anchor))
                return;
            const src = this.upgradeImageSrc(img.getAttribute('src') || '');
            const alt = img.getAttribute('alt')?.replace(/\s+/g, ' ').trim() || 'Image';
            const cleanImg = ownerDoc.createElement('img');
            cleanImg.setAttribute('src', src);
            cleanImg.setAttribute('alt', alt);
            // replace anchor with clean image
            anchor.replaceWith(cleanImg);
        });
    }

    /**
     * Point every remaining tweetPhoto image at its large variant.
     * Images without a `src` are skipped.
     *
     * @param container element to rewrite in place
     */
    upgradeImageQuality(container) {
        container.querySelectorAll(SELECTORS.IMAGES).forEach(img => {
            const src = img.getAttribute('src');
            if (!src)
                return;
            img.setAttribute('src', this.upgradeImageSrc(src));
        });
    }

    /**
     * Replace each Draft.js paragraph block with a `<p>` that keeps text nodes and
     * flattened `<strong>`, `<a href>` and `<code>` elements; any other element is
     * dropped and only its descendants are processed.
     *
     * @param container element to rewrite in place
     * @param ownerDoc document used to create the replacement elements
     */
    convertDraftParagraphs(container, ownerDoc) {
        // node type constants (avoid using Node global which isn't available in all environments)
        const TEXT_NODE = 3;
        const ELEMENT_NODE = 1;
        container.querySelectorAll(SELECTORS.DRAFT_PARAGRAPHS).forEach(div => {
            const p = ownerDoc.createElement('p');
            // preserve formatting (strong, links, code) by processing children
            const processNode = (node) => {
                if (node.nodeType === TEXT_NODE) {
                    p.appendChild(ownerDoc.createTextNode(node.textContent || ''));
                }
                else if (node.nodeType === ELEMENT_NODE) {
                    const el = node;
                    const tag = el.tagName.toLowerCase();
                    if (tag === 'strong') {
                        const strong = ownerDoc.createElement('strong');
                        strong.textContent = el.textContent || '';
                        p.appendChild(strong);
                    }
                    else if (tag === 'a') {
                        const link = ownerDoc.createElement('a');
                        link.setAttribute('href', el.getAttribute('href') || '');
                        link.textContent = el.textContent || '';
                        p.appendChild(link);
                    }
                    else if (tag === 'code') {
                        const code = ownerDoc.createElement('code');
                        code.textContent = el.textContent || '';
                        p.appendChild(code);
                    }
                    else {
                        // recurse into other elements (spans, divs, etc.)
                        el.childNodes.forEach(child => processNode(child));
                    }
                }
            };
            div.childNodes.forEach(child => processNode(child));
            div.replaceWith(p);
        });
    }

    /**
     * Replace inline-styled bold spans with `<strong>` holding the span's text.
     *
     * @param container element to rewrite in place
     * @param ownerDoc document used to create the replacement elements
     */
    convertBoldSpans(container, ownerDoc) {
        container.querySelectorAll(SELECTORS.BOLD_SPANS).forEach(span => {
            const strong = ownerDoc.createElement('strong');
            strong.textContent = span.textContent || '';
            span.replaceWith(strong);
        });
    }

    /**
     * Strip the `data-offset-key` attribute from every element that has it.
     *
     * @param container element to rewrite in place
     */
    removeDraftAttributes(container) {
        container.querySelectorAll(SELECTORS.DRAFT_ATTRIBUTES).forEach(el => {
            el.removeAttribute('data-offset-key');
        });
    }

    /**
     * Build a short description from the article container's trimmed text.
     *
     * @returns {string} the text itself when it is at most 140 UTF-16 units long,
     *   otherwise its first 140 units (139 when unit 140 would be the first half
     *   of a surrogate pair) followed by '...'
     */
    createDescription() {
        const MAX_LENGTH = 140;
        const text = this.articleContainer?.textContent?.trim() || '';
        if (text.length <= MAX_LENGTH)
            return text;
        let end = MAX_LENGTH;
        // do not cut between the two halves of a surrogate pair
        const lastUnit = text.charCodeAt(end - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
            end -= 1;
        }
        return `${text.slice(0, end)}...`;
    }
}
exports.XArticleExtractor = XArticleExtractor;
//# sourceMappingURL=x-article.js.map