defuddle
Version:
Extract article content and metadata from web pages.
77 lines • 3.26 kB
JavaScript
// Set the `__esModule` marker property on the exports object
// (presumably compiler-emitted for ES-module interop).
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder value; the real export is assigned once the class has been defined.
exports.ConversationExtractor = void 0;
// Supplies BaseExtractor, the parent class of ConversationExtractor.
const _base_1 = require("./_base");
// Supplies Defuddle, which extract() runs over the generated conversation HTML.
const defuddle_1 = require("../defuddle");
/**
 * Escape a value so it can be interpolated inside a double-quoted HTML
 * attribute without terminating the attribute or being read as markup.
 *
 * The input is expected to be a raw (not already entity-encoded) value.
 *
 * @param {*} value - Value to embed; it is coerced with `String()`.
 * @returns {string} The value with `&`, `"`, `<` and `>` replaced by entities.
 */
function escapeHtmlAttribute(value) {
    return String(value)
        // `&` must be handled first so the entities added below are not re-escaped.
        .replace(/&/g, '&amp;')
        .replace(/"/g, '&quot;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
}

/**
 * Base extractor for chat-style pages: renders a list of messages (plus
 * optional footnotes) as HTML and runs Defuddle over that HTML.
 *
 * `extractMessages()` and `getMetadata()` are called here but not defined in
 * this class; presumably they are supplied by subclasses or the base class.
 */
class ConversationExtractor extends _base_1.BaseExtractor {
    /**
     * Footnotes to append after the messages.
     *
     * @returns {Array<{url: string, text: string}>} Empty by default.
     */
    getFootnotes() {
        return [];
    }

    /**
     * Build the conversation HTML, pass it through Defuddle in a throwaway
     * document, and package the cleaned content together with metadata.
     *
     * @returns {{content: string, contentHtml: string,
     *            extractedContent: {messageCount: string},
     *            variables: {title: string, site: *, description: string, wordCount: string}}}
     */
    extract() {
        const messages = this.extractMessages();
        const metadata = this.getMetadata();
        const footnotes = this.getFootnotes();
        const rawContentHtml = this.createContentHtml(messages, footnotes);

        // Create a temporary document to run Defuddle on our content.
        const tempDoc = document.implementation.createHTMLDocument();
        const container = tempDoc.createElement('article');
        container.innerHTML = rawContentHtml;
        tempDoc.body.appendChild(container);

        // Run Defuddle on our formatted content.
        const defuddled = new defuddle_1.Defuddle(tempDoc).parse();
        const contentHtml = defuddled.content;

        // Without a site name the old fallback read "undefined conversation with ...".
        const fallbackDescription = metadata.site
            ? `${metadata.site} conversation with ${messages.length} messages`
            : `Conversation with ${messages.length} messages`;

        return {
            content: contentHtml,
            contentHtml,
            extractedContent: {
                messageCount: String(messages.length),
            },
            variables: {
                title: metadata.title || 'Conversation',
                site: metadata.site,
                description: metadata.description || fallbackDescription,
                wordCount: defuddled.wordCount?.toString() ?? '',
            },
        };
    }

    /**
     * Render messages and footnotes as an HTML string.
     *
     * `message.content`, `message.author`, `message.timestamp` and
     * `footnote.text` are inserted as HTML, unchanged. Values that land inside
     * attributes (author-derived class name, `data-*` values, footnote URL) are
     * escaped so a quote or angle bracket cannot break the markup.
     *
     * @param {Array<{author: string, content: string, timestamp?: string, metadata?: Object}>} messages
     * @param {Array<{url: string, text: string}>} footnotes
     * @returns {string} Trimmed HTML: messages separated by `<hr>`, then an
     *   optional `<div id="footnotes">` section.
     */
    createContentHtml(messages, footnotes) {
        const lastIndex = messages.length - 1;

        const messagesHtml = messages.map((message, index) => {
            const timestampHtml = message.timestamp
                ? `<div class="message-timestamp">${message.timestamp}</div>`
                : '';

            // Only wrap in <p> when the content does not already carry paragraph tags.
            const hasParagraphs = /<p[^>]*>[\s\S]*?<\/p>/i.test(message.content);
            const contentHtml = hasParagraphs ? message.content : `<p>${message.content}</p>`;

            // Expose message metadata as data-* attributes (values escaped;
            // keys are used as given and assumed to be valid attribute-name suffixes).
            const dataAttributes = message.metadata
                ? Object.entries(message.metadata)
                    .map(([key, value]) => `data-${key}="${escapeHtmlAttribute(value)}"`)
                    .join(' ')
                : '';

            const authorClass = escapeHtmlAttribute(message.author.toLowerCase());

            // The leading empty entry keeps the newline that precedes each message block.
            const messageHtml = [
                '',
                `<div class="message message-${authorClass}" ${dataAttributes}>`,
                '<div class="message-header">',
                `<p class="message-author"><strong>${message.author}</strong></p>`,
                timestampHtml,
                '</div>',
                '<div class="message-content">',
                contentHtml,
                '</div>',
                '</div>',
            ].join('\n');

            // Every message except the last is followed by a horizontal rule.
            return index < lastIndex ? `${messageHtml}\n<hr>` : messageHtml;
        }).join('\n').trim();

        // Add footnotes section if we have any.
        let footnotesHtml = '';
        if (footnotes.length > 0) {
            const itemsHtml = footnotes.map((footnote, index) => {
                const number = index + 1;
                return [
                    '',
                    `<li class="footnote" id="fn:${number}">`,
                    '<p>',
                    `<a href="${escapeHtmlAttribute(footnote.url)}" target="_blank">${footnote.text}</a> <a href="#fnref:${number}" class="footnote-backref">↩</a>`,
                    '</p>',
                    '</li>',
                    '',
                ].join('\n');
            }).join('');

            footnotesHtml = [
                '',
                '<div id="footnotes">',
                '<ol>',
                itemsHtml,
                '</ol>',
                '</div>',
            ].join('\n');
        }

        return `${messagesHtml}\n${footnotesHtml}`.trim();
    }
}
// Publish the class as this module's named CommonJS export.
exports.ConversationExtractor = ConversationExtractor;
//# sourceMappingURL=_conversation.js.map
;