UNPKG

defuddle

Version:

Extract article content and metadata from web pages.

github.com/kepano/defuddle

kepano/defuddle

133 lines • 6 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.ChatGPTExtractor = void 0; const _conversation_1 = require("./_conversation"); class ChatGPTExtractor extends _conversation_1.ConversationExtractor { constructor(document, url) { super(document, url); this.articles = document.querySelectorAll('article[data-testid^="conversation-turn-"]'); this.footnotes = []; this.footnoteCounter = 0; } canExtract() { return !!this.articles && this.articles.length > 0; } extractMessages() { const messages = []; this.footnotes = []; this.footnoteCounter = 0; if (!this.articles) return messages; this.articles.forEach((article) => { // Get the localized author text from the sr-only heading and clean it const authorElement = article.querySelector('h5.sr-only, h6.sr-only'); const authorText = authorElement?.textContent ?.trim() ?.replace(/:\s*$/, '') // Remove colon and any trailing whitespace || ''; let currentAuthorRole = ''; const authorRole = article.getAttribute('data-message-author-role'); if (authorRole) { currentAuthorRole = authorRole; } let messageContent = article.innerHTML || ''; messageContent = messageContent.replace(/\u200B/g, ''); // Remove specific elements from the message content const tempDiv = document.createElement('div'); tempDiv.innerHTML = messageContent; tempDiv.querySelectorAll('h5.sr-only, h6.sr-only, span[data-state="closed"]').forEach(el => el.remove()); messageContent = tempDiv.innerHTML; // Process inline references using regex to find the containers // Look for spans containing citation links (a[target=_blank][rel=noopener]), replacing entire structure // Also capture optional preceding ZeroWidthSpace const citationPattern = /(&ZeroWidthSpace;)?(<span[^>]*?>\s*<a(?=[^>]*?href="([^"]+)")(?=[^>]*?target="_blank")(?=[^>]*?rel="noopener")[^>]*?>[\s\S]*?<\/a>\s*<\/span>)/gi; messageContent = messageContent.replace(citationPattern, (match, zws, spanStructure, url) => { // url is captured group 3 let domain = ''; let fragmentText = ''; try { // Extract domain without www. domain = new URL(url).hostname.replace(/^www\./, ''); // Extract and decode the fragment text if it exists const hashParts = url.split('#:~:text='); if (hashParts.length > 1) { fragmentText = decodeURIComponent(hashParts[1]); fragmentText = fragmentText.replace(/%2C/g, ','); const parts = fragmentText.split(','); if (parts.length > 1 && parts[0].trim()) { fragmentText = ` — ${parts[0].trim()}...`; } else if (parts[0].trim()) { fragmentText = ` — ${fragmentText.trim()}`; } else { fragmentText = ''; } } } catch (e) { console.error(`Failed to parse URL: ${url}`, e); domain = url; } // Check if this URL already exists in our footnotes let footnoteIndex = this.footnotes.findIndex(fn => fn.url === url); let footnoteNumber; if (footnoteIndex === -1) { this.footnoteCounter++; footnoteNumber = this.footnoteCounter; this.footnotes.push({ url, text: `<a href="${url}">${domain}</a>${fragmentText}` }); } else { footnoteNumber = footnoteIndex + 1; } // Return just the footnote reference, replacing the ZWS (if captured) and the entire span structure return `<sup id="fnref:${footnoteNumber}"><a href="#fn:${footnoteNumber}">${footnoteNumber}</a></sup>`; }); // Clean up any stray empty paragraph tags messageContent = messageContent .replace(/<p[^>]*>\s*<\/p>/g, ''); messages.push({ author: authorText, content: messageContent.trim(), metadata: { role: currentAuthorRole || 'unknown' } }); }); return messages; } getFootnotes() { return this.footnotes; } getMetadata() { const title = this.getTitle(); const messages = this.extractMessages(); return { title, site: 'ChatGPT', url: this.url, messageCount: messages.length, description: `ChatGPT conversation with ${messages.length} messages` }; } getTitle() { // Try to get the page title first const pageTitle = this.document.title?.trim(); if (pageTitle && pageTitle !== 'ChatGPT') { return pageTitle; } // Fall back to first user message const firstUserTurn = this.articles?.item(0)?.querySelector('.text-message'); if (firstUserTurn) { const text = firstUserTurn.textContent || ''; // Truncate to first 50 characters if longer return text.length > 50 ? text.slice(0, 50) + '...' : text; } return 'ChatGPT Conversation'; } } exports.ChatGPTExtractor = ChatGPTExtractor; //# sourceMappingURL=chatgpt.js.map