UNPKG

defuddle

Version:

Extract article content and metadata from web pages.

143 lines 5.37 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.RedditExtractor = void 0;
const _base_1 = require("./_base");

/** Replacement entities for characters that are unsafe in HTML text/attributes. */
const HTML_ESCAPES = Object.freeze({
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
});

/**
 * Escape a value so it can be interpolated into HTML text or a quoted attribute.
 *
 * @param {unknown} value - Value to escape; coerced to a string first.
 * @returns {string} The escaped string.
 */
function escapeHtml(value) {
    return String(value).replace(/[&<>"']/g, (char) => HTML_ESCAPES[char]);
}

/**
 * Extractor for Reddit post pages built around the `<shreddit-post>` and
 * `<shreddit-comment>` custom elements.
 *
 * Reads `this.document` and `this.url`; these are presumably assigned by the
 * BaseExtractor constructor (not visible in this file) — verify against `_base`.
 */
class RedditExtractor extends _base_1.BaseExtractor {
    /**
     * @param {Document} document - DOM document of the page being extracted.
     * @param {string} url - URL of the page; used to derive post id and subreddit.
     */
    constructor(document, url) {
        super(document, url);
        this.shredditPost = document.querySelector('shreddit-post');
    }

    /**
     * @returns {boolean} True when the page contains a `<shreddit-post>` element.
     */
    canExtract() {
        return !!this.shredditPost;
    }

    /**
     * Build the extraction result: rendered post + comments HTML, plus metadata.
     *
     * @returns {{content: string, contentHtml: string, extractedContent: object, variables: object}}
     */
    extract() {
        const postContent = this.getPostContent();
        const comments = this.extractComments();
        const contentHtml = this.createContentHtml(postContent, comments);
        const postTitle = this.document.querySelector('h1')?.textContent?.trim() || '';
        const subreddit = this.getSubreddit();
        const postAuthor = this.getPostAuthor();
        const description = this.createDescription(postContent);
        return {
            content: contentHtml,
            contentHtml: contentHtml,
            extractedContent: {
                postId: this.getPostId(),
                subreddit,
                postAuthor,
            },
            variables: {
                title: postTitle,
                author: postAuthor,
                site: `r/${subreddit}`,
                description,
            }
        };
    }

    /**
     * @returns {string} Inner HTML of the post's text body followed by the outer
     *   HTML of its `#post-image` element; either part is '' when absent.
     */
    getPostContent() {
        const textBody = this.shredditPost?.querySelector('[slot="text-body"]')?.innerHTML || '';
        const mediaBody = this.shredditPost?.querySelector('#post-image')?.outerHTML || '';
        return textBody + mediaBody;
    }

    /**
     * Wrap the post HTML and (optionally) the comments HTML in container markup.
     *
     * @param {string} postContent - HTML of the post body.
     * @param {string} comments - HTML of the rendered comments; '' omits the section.
     * @returns {string} Combined HTML, trimmed of surrounding whitespace.
     */
    createContentHtml(postContent, comments) {
        return `
            <div class="reddit-post">
                <div class="post-content">
                    ${postContent}
                </div>
            </div>
            ${comments ? `
                <hr>
                <h2>Comments</h2>
                <div class="reddit-comments">
                    ${comments}
                </div>
            ` : ''}
        `.trim();
    }

    /**
     * @returns {string} HTML for every `<shreddit-comment>` in the document.
     */
    extractComments() {
        const comments = Array.from(this.document.querySelectorAll('shreddit-comment'));
        return this.processComments(comments);
    }

    /**
     * @returns {string} The alphanumeric id following `comments/` in the URL, or ''.
     */
    getPostId() {
        const match = this.url.match(/comments\/([a-zA-Z0-9]+)/);
        return match?.[1] || '';
    }

    /**
     * @returns {string} The path segment following `/r/` in the URL, or ''.
     */
    getSubreddit() {
        const match = this.url.match(/\/r\/([^/]+)/);
        return match?.[1] || '';
    }

    /**
     * @returns {string} The `author` attribute of the post element, or ''.
     */
    getPostAuthor() {
        return this.shredditPost?.getAttribute('author') || '';
    }

    /**
     * Derive a plain-text description (at most 140 characters) from post HTML.
     *
     * @param {string} postContent - HTML of the post body.
     * @returns {string} Whitespace-normalised text, or '' when there is no content.
     */
    createDescription(postContent) {
        if (!postContent) {
            return '';
        }
        // Use the document handed to the extractor: a global `document` does not
        // exist when running outside a browser.
        // NOTE(review): innerHTML parses page-supplied markup — confirm this is
        // acceptable for the documents this extractor is run against.
        const tempDiv = this.document.createElement('div');
        tempDiv.innerHTML = postContent;
        // Collapse whitespace BEFORE truncating so markup indentation does not
        // consume the 140-character budget.
        return (tempDiv.textContent || '')
            .replace(/\s+/g, ' ')
            .trim()
            .slice(0, 140);
    }

    /**
     * Parse a comment's `depth` attribute.
     *
     * @param {Element} comment - A comment element.
     * @returns {number} Non-negative integer depth; 0 when missing or malformed.
     */
    getCommentDepth(comment) {
        const depth = Number.parseInt(comment.getAttribute('depth') || '0', 10);
        return Number.isNaN(depth) || depth < 0 ? 0 : depth;
    }

    /**
     * Format a timestamp as `YYYY-MM-DD`.
     *
     * @param {string} timestamp - Value of the `ts` attribute.
     * @returns {string} The date part of the ISO string, or '' when the
     *   timestamp is empty or cannot be parsed (toISOString would throw).
     */
    formatCommentDate(timestamp) {
        if (!timestamp) {
            return '';
        }
        const parsed = new Date(timestamp);
        return Number.isNaN(parsed.getTime()) ? '' : parsed.toISOString().split('T')[0];
    }

    /**
     * Render a flat list of comment elements as nested `<blockquote>` HTML.
     *
     * Nesting is driven by each element's `depth` attribute: every depth-0
     * comment starts its own blockquote, and consecutive replies at the same
     * depth share one blockquote inside their parent's.
     *
     * @param {Iterable<Element>} comments - Comment elements, in the order they
     *   should be rendered.
     * @returns {string} The rendered HTML; '' for an empty list.
     */
    processComments(comments) {
        let html = '';
        // Depths of the currently open blockquotes, outermost first.
        let blockquoteStack = [];
        for (const comment of comments) {
            const depth = this.getCommentDepth(comment);
            const author = comment.getAttribute('author') || '';
            const score = comment.getAttribute('score') || '0';
            const permalink = comment.getAttribute('permalink') || '';
            const content = comment.querySelector('[slot="comment"]')?.innerHTML || '';
            // Timestamp comes from the faceplate-timeago element
            const timeElement = comment.querySelector('faceplate-timeago');
            const date = this.formatCommentDate(timeElement?.getAttribute('ts') || '');

            if (depth === 0) {
                // Top-level comment: close everything and start a fresh thread
                while (blockquoteStack.length > 0) {
                    html += '</blockquote>';
                    blockquoteStack.pop();
                }
                html += '<blockquote>';
                blockquoteStack = [0];
            }
            else {
                // Close only blockquotes that are strictly deeper than this
                // comment, so a reply following a deeper sub-thread stays at
                // its own level instead of falling into its parent's.
                while (blockquoteStack.length > 0 && blockquoteStack[blockquoteStack.length - 1] > depth) {
                    html += '</blockquote>';
                    blockquoteStack.pop();
                }
                // Open a blockquote unless one for this depth is already open
                if (blockquoteStack.length === 0 || blockquoteStack[blockquoteStack.length - 1] < depth) {
                    html += '<blockquote>';
                    blockquoteStack.push(depth);
                }
            }

            // Attribute values originate from the page and are escaped; `content`
            // is intentionally inserted as HTML.
            html += `<div class="comment">
    <div class="comment-metadata">
        <span class="comment-author"><strong>${escapeHtml(author)}</strong></span> •
        <a href="https://reddit.com${escapeHtml(permalink)}" class="comment-link">${escapeHtml(score)} points</a> •
        <span class="comment-date">${date}</span>
    </div>
    <div class="comment-content">${content}</div>
</div>`;
        }
        // Close any remaining blockquotes
        while (blockquoteStack.length > 0) {
            html += '</blockquote>';
            blockquoteStack.pop();
        }
        return html;
    }
}
exports.RedditExtractor = RedditExtractor;
//# sourceMappingURL=reddit.js.map