// defuddle
// Version:
// Extract article content and metadata from web pages.
// 194 lines • 8.44 kB
// JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.RedditExtractor = void 0;
const _base_1 = require("./_base");
const dom_1 = require("../utils/dom");
const comments_1 = require("../utils/comments");
/**
 * Extractor for Reddit post pages.
 *
 * Two markups are handled:
 *  - "new" Reddit, recognised by a <shreddit-post> element, with comments in
 *    <shreddit-comment> elements;
 *  - "old" Reddit, recognised by a `.thing.link` element, with comments in
 *    nested `.thing.comment` elements.
 *
 * When a new-Reddit comments page carries no <shreddit-comment> elements,
 * extract() returns empty content and extractAsync() re-fetches the page from
 * old.reddit.com instead.
 *
 * NOTE(review): `this.document` and `this.url` are presumably assigned by
 * BaseExtractor's constructor — confirm against ./_base.
 */
class RedditExtractor extends _base_1.BaseExtractor {
    /**
     * @param {Document} document - Parsed page to extract from.
     * @param {string} url - URL of the page.
     */
    constructor(document, url) {
        super(document, url);
        this.shredditPost = document.querySelector('shreddit-post');
        this.isOldReddit = !!document.querySelector('.thing.link');
    }

    /**
     * @returns {boolean} True when the document contains either new-Reddit
     *   or old-Reddit post markup.
     */
    canExtract() {
        return !!this.shredditPost || this.isOldReddit;
    }

    /**
     * @returns {boolean} True for comment pages that are not old Reddit.
     */
    canExtractAsync() {
        // For new reddit comment pages, extract() returns empty content
        // when shreddit-comment elements are missing (server-side fetch),
        // causing parseAsync() to fall through to this async path.
        return this.isCommentsPage() && !this.isOldReddit;
    }

    /**
     * @returns {boolean} True when the URL looks like `/r/<sub>/comments/...`.
     */
    isCommentsPage() {
        return /\/r\/.+\/comments\//.test(this.url);
    }

    /**
     * Fetches the old.reddit.com version of the current URL, parses it and
     * extracts it with the old-Reddit logic.
     *
     * @returns {Promise<object>} Same result shape as extract().
     * @throws {Error} When the HTTP response is not OK or no DOMParser
     *   implementation can be found.
     */
    async extractAsync() {
        // Convert URL to old.reddit.com
        const oldUrl = new URL(this.url);
        oldUrl.hostname = 'old.reddit.com';
        const response = await fetch(oldUrl.toString(), {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; Defuddle/1.0)',
            },
        });
        if (!response.ok) {
            throw new Error(`Failed to fetch old.reddit.com: ${response.status}`);
        }
        const html = await response.text();
        // Prefer the DOMParser tied to the current document's window; fall
        // back to a global DOMParser when one exists.
        const Parser = this.document.defaultView?.DOMParser ?? (typeof DOMParser !== 'undefined' ? DOMParser : null);
        if (!Parser) {
            throw new Error('DOMParser is not available in this environment');
        }
        const doc = new Parser().parseFromString(html, 'text/html');
        return this.extractOldReddit(doc);
    }

    /**
     * Synchronous extraction from the current document.
     *
     * @returns {object} Extraction result; `{ content: '', contentHtml: '' }`
     *   when a new-Reddit comments page has no <shreddit-comment> elements.
     */
    extract() {
        if (this.isOldReddit) {
            return this.extractOldReddit(this.document);
        }
        // New reddit server-side HTML includes shreddit-post but not
        // shreddit-comment elements (those require JS). Return empty
        // so parseAsync() falls through to extractAsync() which fetches
        // old.reddit.com with full content.
        const hasComments = this.document.querySelectorAll('shreddit-comment').length > 0;
        if (this.isCommentsPage() && !hasComments) {
            return { content: '', contentHtml: '' };
        }
        const postContent = this.getPostContent();
        const comments = this.extractComments();
        const contentHtml = this.createContentHtml(postContent, comments);
        const postTitle = this.document.querySelector('h1')?.textContent?.trim() || '';
        const subreddit = this.getSubreddit();
        const postAuthor = this.getPostAuthor();
        const description = this.createDescription(postContent);
        return this.buildResult({ contentHtml, postTitle, postAuthor, subreddit, description });
    }

    /**
     * Extracts a post and its comment tree from old-Reddit markup.
     *
     * @param {Document|Element} root - Node to search; either the current
     *   document or one parsed by extractAsync().
     * @returns {object} Extraction result (see buildResult()).
     */
    extractOldReddit(root) {
        const thingLink = root.querySelector('.thing.link');
        const postTitle = thingLink?.querySelector('a.title')?.textContent?.trim() || '';
        const postAuthor = thingLink?.getAttribute('data-author') || '';
        const subreddit = thingLink?.getAttribute('data-subreddit') || '';
        const postBodyEl = thingLink?.querySelector('.usertext-body .md');
        const postBody = postBodyEl ? (0, dom_1.serializeHTML)(postBodyEl) : '';
        const commentArea = root.querySelector('.commentarea .sitetable');
        const commentData = commentArea ? this.collectOldRedditComments(commentArea) : [];
        const comments = commentData.length > 0 ? (0, comments_1.buildCommentTree)(commentData) : '';
        const contentHtml = this.createContentHtml(postBody, comments);
        const description = this.createDescription(postBody);
        return this.buildResult({ contentHtml, postTitle, postAuthor, subreddit, description });
    }

    /**
     * Assembles the result object shared by both extraction paths.
     *
     * @param {object} parts
     * @param {string} parts.contentHtml - Combined post and comments HTML.
     * @param {string} parts.postTitle
     * @param {string} parts.postAuthor
     * @param {string} parts.subreddit
     * @param {string} parts.description
     * @returns {object} `{ content, contentHtml, extractedContent, variables }`.
     */
    buildResult({ contentHtml, postTitle, postAuthor, subreddit, description }) {
        return {
            content: contentHtml,
            contentHtml: contentHtml,
            extractedContent: {
                postId: this.getPostId(),
                subreddit,
                postAuthor,
            },
            variables: {
                title: postTitle,
                author: postAuthor,
                site: `r/${subreddit}`,
                description,
            }
        };
    }

    /**
     * @returns {string} Serialized text body followed by the outer HTML of
     *   `#post-image`; either part is '' when missing.
     */
    getPostContent() {
        const textBodyEl = this.shredditPost?.querySelector('[slot="text-body"]');
        const textBody = textBodyEl ? (0, dom_1.serializeHTML)(textBodyEl) : '';
        const mediaBody = this.shredditPost?.querySelector('#post-image')?.outerHTML || '';
        return textBody + mediaBody;
    }

    /**
     * Delegates to the shared buildContentHtml helper with the 'reddit' key.
     *
     * @param {string} postContent
     * @param {string} comments
     * @returns {string}
     */
    createContentHtml(postContent, comments) {
        return (0, comments_1.buildContentHtml)('reddit', postContent, comments);
    }

    /**
     * @returns {string} Comment tree built from every <shreddit-comment> in
     *   the document.
     */
    extractComments() {
        const comments = Array.from(this.document.querySelectorAll('shreddit-comment'));
        return this.processComments(comments);
    }

    /**
     * @returns {string} Post id taken from `comments/<id>` in the URL, or ''.
     */
    getPostId() {
        const match = this.url.match(/comments\/([a-zA-Z0-9]+)/);
        return match?.[1] || '';
    }

    /**
     * @returns {string} Subreddit name taken from `/r/<name>` in the URL, or ''.
     */
    getSubreddit() {
        const match = this.url.match(/\/r\/([^/]+)/);
        return match?.[1] || '';
    }

    /**
     * @returns {string} The `author` attribute of <shreddit-post>, or ''.
     */
    getPostAuthor() {
        return this.shredditPost?.getAttribute('author') || '';
    }

    /**
     * Derives a short plain-text description from post HTML.
     *
     * @param {string} postContent - Post HTML; falsy input yields ''.
     * @returns {string} Trimmed text content, cut to 140 characters and then
     *   whitespace-collapsed (the cut happens before collapsing, so the
     *   result may be shorter than 140 characters).
     */
    createDescription(postContent) {
        if (!postContent)
            return '';
        const tempDiv = this.document.createElement('div');
        tempDiv.appendChild((0, dom_1.parseHTML)(this.document, postContent));
        return tempDiv.textContent?.trim()
            .slice(0, 140)
            .replace(/\s+/g, ' ') || '';
    }

    /**
     * Converts a timestamp string to a `YYYY-MM-DD` date.
     *
     * `Date.prototype.toISOString()` throws a RangeError on an invalid date,
     * so an unparseable value is mapped to '' instead of aborting extraction.
     *
     * @param {string} timestamp
     * @returns {string} ISO date part, or '' for empty or unparseable input.
     */
    toIsoDate(timestamp) {
        if (!timestamp) {
            return '';
        }
        const parsed = new Date(timestamp);
        if (Number.isNaN(parsed.getTime())) {
            return '';
        }
        return parsed.toISOString().split('T')[0];
    }

    /**
     * Finds the first descendant matching `selector` that belongs to
     * `comment` itself rather than to a reply nested inside it.
     *
     * A plain `comment.querySelector()` also searches nested replies, so a
     * comment lacking its own score, time or body would otherwise pick up
     * the first reply's values.
     *
     * @param {Element} comment - A `.thing.comment` element.
     * @param {string} selector
     * @returns {Element|null}
     */
    findOwn(comment, selector) {
        const isOwn = (el) => el.closest('.thing.comment') === comment;
        const first = comment.querySelector(selector);
        if (!first || isOwn(first)) {
            return first;
        }
        // The first match belongs to a nested reply; scan the rest in case
        // an own match appears later in document order.
        return Array.from(comment.querySelectorAll(selector)).find(isOwn) ?? null;
    }

    /**
     * Recursively flattens old-Reddit comments into a depth-annotated list,
     * each parent immediately followed by its replies.
     *
     * @param {Element} container - Element whose direct `.thing.comment`
     *   children are the comments at this level.
     * @param {number} [depth=0] - Nesting depth assigned to this level.
     * @returns {Array<object>} Entries of
     *   `{ author, date, content, depth, score, url }`.
     */
    collectOldRedditComments(container, depth = 0) {
        const result = [];
        const comments = Array.from(container.querySelectorAll(':scope > .thing.comment'));
        for (const comment of comments) {
            const author = comment.getAttribute('data-author') || '';
            const permalink = comment.getAttribute('data-permalink') || '';
            const score = this.findOwn(comment, '.entry .tagline .score.unvoted')?.textContent?.trim() || '';
            const timeEl = this.findOwn(comment, '.entry .tagline time[datetime]');
            const date = this.toIsoDate(timeEl?.getAttribute('datetime') || '');
            const bodyEl = this.findOwn(comment, '.entry .usertext-body .md');
            const body = bodyEl ? (0, dom_1.serializeHTML)(bodyEl) : '';
            result.push({
                author,
                date,
                content: body,
                depth,
                score: score || undefined,
                url: permalink ? `https://reddit.com${permalink}` : undefined,
            });
            const childContainer = comment.querySelector('.child > .sitetable');
            if (childContainer) {
                result.push(...this.collectOldRedditComments(childContainer, depth + 1));
            }
        }
        return result;
    }

    /**
     * Builds a comment tree from new-Reddit <shreddit-comment> elements.
     *
     * @param {Element[]} comments - Elements exposing `depth`, `author`,
     *   `score`, `permalink` and `created` attributes.
     * @returns {string} Result of buildCommentTree() for the collected data.
     */
    processComments(comments) {
        const commentData = [];
        for (const comment of comments) {
            // A missing or non-numeric depth is treated as top level
            // instead of propagating NaN.
            const parsedDepth = Number.parseInt(comment.getAttribute('depth') || '0', 10);
            const depth = Number.isNaN(parsedDepth) ? 0 : parsedDepth;
            const author = comment.getAttribute('author') || '';
            const score = comment.getAttribute('score') || '0';
            const permalink = comment.getAttribute('permalink') || '';
            const commentEl = comment.querySelector('[slot="comment"]');
            const content = commentEl ? (0, dom_1.serializeHTML)(commentEl) : '';
            const timestamp = comment.getAttribute('created')
                || comment.querySelector('time')?.getAttribute('datetime')
                || '';
            const date = this.toIsoDate(timestamp);
            commentData.push({
                author,
                date,
                content,
                depth,
                score: `${score} points`,
                url: permalink ? `https://reddit.com${permalink}` : undefined,
            });
        }
        return (0, comments_1.buildCommentTree)(commentData);
    }
}
exports.RedditExtractor = RedditExtractor;
//# sourceMappingURL=reddit.js.map