UNPKG

defuddle

Version:

Extract article content and metadata from web pages.

163 lines 7.29 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.GitHubExtractor = void 0;
const _base_1 = require("./_base");

/** Maximum length (in UTF-16 code units) of the generated description. */
const DESCRIPTION_MAX_LENGTH = 140;

/** Characters that must be replaced before text is placed into HTML markup. */
const HTML_ESCAPES = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
};

/**
 * Escapes text so it can be interpolated into HTML without being parsed as markup.
 *
 * @param {string} text - Raw text (here: an author name derived from an href attribute).
 * @returns {string} The text with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
function escapeHtml(text) {
    return String(text).replace(/[&<>"']/g, (char) => HTML_ESCAPES[char]);
}

/**
 * Formats a `datetime` attribute value as a locale date string.
 *
 * @param {string|null|undefined} timestamp - Value of a `<relative-time datetime>` attribute.
 * @returns {string} The locale-formatted date, or '' when the value is missing
 *   or cannot be parsed (so callers never emit the text "Invalid Date").
 */
function formatTimestamp(timestamp) {
    if (!timestamp) {
        return '';
    }
    const date = new Date(timestamp);
    return Number.isNaN(date.getTime()) ? '' : date.toLocaleDateString();
}

/**
 * Extractor that turns a GitHub issue page into simplified HTML
 * (issue body followed by its comments) plus basic metadata.
 *
 * Reads `this.document` and `this.url`; both are presumably provided by
 * BaseExtractor, which is defined outside this file — confirm there.
 */
class GitHubExtractor extends _base_1.BaseExtractor {
    /**
     * Reports whether the current document looks like a GitHub issue page.
     *
     * @returns {boolean} True when at least one site-level GitHub indicator AND
     *   at least one issue-page indicator is present in the document.
     */
    canExtract() {
        const githubIndicators = [
            'meta[name="expected-hostname"][content="github.com"]',
            'meta[name="octolytics-url"]',
            'meta[name="github-keyboard-shortcuts"]',
            '.js-header-wrapper',
            '#js-repo-pjax-container',
        ];
        const githubPageIndicators = {
            issue: [
                '[data-testid="issue-metadata-sticky"]',
                '[data-testid="issue-title"]',
            ],
        };
        const isPresent = (selector) => this.document.querySelector(selector) !== null;
        return githubIndicators.some(isPresent)
            && Object.values(githubPageIndicators).some((selectors) => selectors.some(isPresent));
    }

    /**
     * Entry point: extracts the page as an issue.
     *
     * @returns {object} See {@link GitHubExtractor#extractIssue}.
     */
    extract() {
        return this.extractIssue();
    }

    /**
     * Builds the simplified HTML for the issue body and every comment, and
     * collects repository / issue metadata.
     *
     * @returns {{content: string, contentHtml: string, extractedContent: object, variables: object}}
     *   `content` and `contentHtml` hold the same generated HTML string.
     */
    extractIssue() {
        const repoInfo = this.extractRepoInfo();
        const issueNumber = this.extractIssueNumber();
        let content = '';

        // Extract the main issue body first
        const issueContainer = this.document.querySelector('[data-testid="issue-viewer-issue-container"]');
        if (issueContainer) {
            const issueAuthor = this.extractAuthor(issueContainer, [
                'a[data-testid="issue-body-header-author"]',
                '.IssueBodyHeaderAuthor-module__authorLoginLink--_S7aT',
                '.ActivityHeader-module__AuthorLink--iofTU',
                'a[href*="/users/"][data-hovercard-url*="/users/"]',
                'a[aria-label*="profile"]'
            ]);
            const issueDate = formatTimestamp(issueContainer.querySelector('relative-time')?.getAttribute('datetime'));
            const issueBodyElement = issueContainer.querySelector('[data-testid="issue-body-viewer"] .markdown-body');
            if (issueBodyElement) {
                const bodyContent = this.cleanBodyContent(issueBodyElement);
                // The author comes from an href attribute, so escape it before
                // placing it inside markup. The body is already HTML and is kept as-is.
                content += `<div class="issue-author"><strong>${escapeHtml(issueAuthor)}</strong>`;
                if (issueDate) {
                    content += ` opened this issue on ${issueDate}`;
                }
                content += `</div>\n\n`;
                content += `<div class="issue-body">${bodyContent}</div>\n\n`;
            }
        }

        // Extract comments
        const commentElements = Array.from(this.document.querySelectorAll('[data-wrapper-timeline-id]'));
        // Timeline ids already emitted, so a comment matched twice is rendered once.
        const processedComments = new Set();
        for (const commentElement of commentElements) {
            const commentContainer = commentElement.querySelector('.react-issue-comment');
            if (!commentContainer) {
                continue;
            }
            const commentId = commentElement.getAttribute('data-wrapper-timeline-id');
            if (!commentId || processedComments.has(commentId)) {
                continue;
            }
            processedComments.add(commentId);

            const bodyElement = commentContainer.querySelector('.markdown-body');
            if (!bodyElement) {
                continue;
            }
            const bodyContent = this.cleanBodyContent(bodyElement);
            if (!bodyContent) {
                continue;
            }
            const author = this.extractAuthor(commentContainer, [
                '.ActivityHeader-module__AuthorLink--iofTU',
                'a[data-testid="avatar-link"]',
                'a[href^="/"][data-hovercard-url*="/users/"]'
            ]);
            const commentDate = formatTimestamp(commentContainer.querySelector('relative-time')?.getAttribute('datetime'));

            content += `<div class="comment">\n`;
            content += `<div class="comment-header"><strong>${escapeHtml(author)}</strong>`;
            if (commentDate) {
                content += ` commented on ${commentDate}`;
            }
            content += `</div>\n`;
            content += `<div class="comment-body">${bodyContent}</div>\n`;
            content += `</div>\n\n`;
        }

        return {
            content: content,
            contentHtml: content,
            extractedContent: {
                type: 'issue',
                issueNumber,
                repository: repoInfo.repo,
                owner: repoInfo.owner,
            },
            variables: {
                title: this.document.title,
                author: '',
                site: `GitHub - ${repoInfo.owner}/${repoInfo.repo}`,
                description: this.createDescription(content),
            }
        };
    }

    /**
     * Derives an author name from the first matching link inside `container`.
     *
     * @param {Element} container - Element to search within.
     * @param {string[]} selectors - CSS selectors tried in order.
     * @returns {string} For a root-relative href, the path without its leading
     *   slash, query string, fragment or trailing slash; for an href containing
     *   `github.com/`, the first path segment after the host; 'Unknown' when no
     *   selector yields a usable href.
     */
    extractAuthor(container, selectors) {
        for (const selector of selectors) {
            const href = container.querySelector(selector)?.getAttribute('href');
            if (!href) {
                continue;
            }
            if (href.startsWith('/')) {
                // Drop query/fragment and trailing slashes, matching what the
                // absolute-URL branch below already excludes.
                const login = href.substring(1).split(/[?#]/)[0].replace(/\/+$/, '');
                if (login) {
                    return login;
                }
            }
            else if (href.includes('github.com/')) {
                const match = href.match(/github\.com\/([^\/\?#]+)/);
                if (match && match[1]) {
                    return match[1];
                }
            }
        }
        return 'Unknown';
    }

    /**
     * Returns the inner HTML of a rendered markdown body with interactive
     * controls (buttons, menus, clipboard helpers) removed.
     *
     * @param {Element} bodyElement - The `.markdown-body` element; it is cloned, not mutated.
     * @returns {string} Trimmed inner HTML of the cleaned clone.
     */
    cleanBodyContent(bodyElement) {
        const cleanBody = bodyElement.cloneNode(true);
        cleanBody.querySelectorAll('button, [data-testid*="button"], [data-testid*="menu"]').forEach(el => el.remove());
        cleanBody.querySelectorAll('.js-clipboard-copy, .zeroclipboard-container').forEach(el => el.remove());
        return cleanBody.innerHTML.trim();
    }

    /**
     * Finds the issue (or pull request) number.
     *
     * @returns {string} The digits from `/issues/<n>` or `/pull/<n>` in the URL,
     *   else the first `#<n>` in the first `<h1>`, else ''.
     */
    extractIssueNumber() {
        // Try URL first (most reliable)
        const urlMatch = this.url.match(/\/(issues|pull)\/(\d+)/);
        if (urlMatch) {
            return urlMatch[2];
        }
        // Fallback to HTML extraction
        const titleElement = this.document.querySelector('h1');
        const titleMatch = titleElement?.textContent?.match(/#(\d+)/);
        return titleMatch ? titleMatch[1] : '';
    }

    /**
     * Finds the repository owner and name.
     *
     * @returns {{owner: string, repo: string}} Taken from the URL when it
     *   contains `github.com/<owner>/<repo>`, else from the first
     *   `<owner>/<repo>`-shaped token in the document title, else empty strings.
     */
    extractRepoInfo() {
        // Try URL first (most reliable). Stop each segment at '?' or '#' so a
        // query string or fragment never becomes part of the owner/repo name.
        const urlMatch = this.url.match(/github\.com\/([^\/?#]+)\/([^\/?#]+)/);
        if (urlMatch) {
            return {
                owner: urlMatch[1],
                repo: urlMatch[2]
            };
        }
        // Fallback to HTML extraction
        const titleMatch = this.document.title.match(/([^\/\s]+)\/([^\/\s]+)/);
        return titleMatch ? {
            owner: titleMatch[1],
            repo: titleMatch[2]
        } : { owner: '', repo: '' };
    }

    /**
     * Produces a short plain-text description from generated HTML.
     *
     * @param {string} content - HTML string built by extractIssue.
     * @returns {string} The text content with whitespace runs collapsed to single
     *   spaces, cut to at most 140 characters and trimmed; '' for empty input.
     */
    createDescription(content) {
        if (!content) {
            return '';
        }
        const tempDiv = this.document.createElement('div');
        tempDiv.innerHTML = content;
        // Collapse whitespace BEFORE truncating so that newlines and indentation
        // do not consume the length budget, then trim a space left at the cut.
        const text = (tempDiv.textContent ?? '').replace(/\s+/g, ' ').trim();
        return text.slice(0, DESCRIPTION_MAX_LENGTH).trim();
    }
}
exports.GitHubExtractor = GitHubExtractor;
//# sourceMappingURL=github.js.map