UNPKG

gatsby-source-guru

Version:

A Gatsby source plugin for fetching content from GetGuru knowledge base and creating pages from your cards

391 lines (324 loc) 13.6 kB
// Content processing layer for gatsby-source-guru const fs = require('fs') const path = require('path') const crypto = require('crypto') const he = require('he') const TurndownService = require('turndown') const { GURU_LINK_PATTERNS, createSlugFromTitle, getFileExtensionFromContentType, isImageBySignature, ensureDirectoryExists, extractFileUrls, formatUserName } = require('./utils') const { downloadFile } = require('./api') /** * Create card map for link conversion */ const createCardMap = (allCards) => { const cardMap = new Map() allCards.forEach(card => { if (card.id) { const slug = createSlugFromTitle(card.preferredPhrase || card.title) // If slug is empty (only special characters), fall back to card ID const finalSlug = slug || card.id cardMap.set(card.id, `/pages/${finalSlug}/`) } }) return cardMap } /** * Convert internal Guru links to local links */ const convertInternalLinks = (content, currentCard, allCards) => { if (!content || !allCards) { console.log(`convertInternalLinks: missing content or allCards - content: ${!!content}, allCards: ${!!allCards}`) return content } const cardMap = createCardMap(allCards) let processedContent = content let linksFound = 0 // Find anchor tags with data-ghq-guru-card-id attributes (flexible order) const anchorTagRegex = /<a[^>]*data-ghq-guru-card-id=["']([^"']+)["'][^>]*>/gi let match while ((match = anchorTagRegex.exec(processedContent)) !== null) { const fullAnchorTag = match[0] const cardId = match[1] if (cardMap.has(cardId)) { const localPath = cardMap.get(cardId) // Replace the href in the anchor tag, handling both with and without href const updatedAnchorTag = fullAnchorTag.includes('href=') ? fullAnchorTag.replace(/href=["']https:\/\/(app\.)?getguru\.com\/card\/[^"']*["']/i, `href="${localPath}"`) : fullAnchorTag.replace(/(<a[^>]*)>/i, `$1 href="${localPath}">`) processedContent = processedContent.replace(fullAnchorTag, updatedAnchorTag) linksFound++ } } // Reset regex for next usage anchorTagRegex.lastIndex = 0 // Also process standalone URLs using the original patterns GURU_LINK_PATTERNS.forEach(pattern => { const matches = [...processedContent.matchAll(pattern)] matches.forEach(match => { const cardId = match[1] if (cardMap.has(cardId)) { const localPath = cardMap.get(cardId) const fullUrl = match[0] // Replace standalone URLs processedContent = processedContent.replace( new RegExp(fullUrl.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'gi'), localPath ) linksFound++ } }) }) if (linksFound > 0) { console.log(`Converted ${linksFound} internal links for card: ${currentCard.title || currentCard.id}`) } return processedContent } /** * Save downloaded file to disk */ const saveDownloadedFile = (url, buffer, contentType, downloadDir) => { const urlPath = new URL(url).pathname // Decode URL-encoded characters in filename let originalFilename = decodeURIComponent(path.basename(urlPath)) || 'attachment' let fileExtension = path.extname(originalFilename) if (!fileExtension) { fileExtension = getFileExtensionFromContentType(contentType) } const baseName = path.basename(originalFilename, path.extname(originalFilename)) || 'file' const hash = crypto.createHash('md5').update(url).digest('hex').substring(0, 8) const filename = `${baseName}_${hash}${fileExtension}` const filepath = path.join(downloadDir, filename) ensureDirectoryExists(downloadDir) fs.writeFileSync(filepath, buffer) console.log(`Downloaded file: ${filename} (${buffer.length} bytes, content-type: ${contentType})`) console.log(`File appears to be an image: ${isImageBySignature(buffer)}`) return { filename, filepath, originalUrl: url, size: buffer.length, mimeType: contentType } } /** * Process attachments for a card */ const processAttachments = async (card, headers, attachmentDir) => { if (!card.content) return { processedContent: card.content || '', attachedFiles: [] } console.log(`Processing attachments for card: ${card.title || card.id}`) let processedContent = card.content let attachedFiles = [] const { imageUrls, otherFileUrls } = extractFileUrls(card.content) // Process images for (const imageUrl of imageUrls) { if (!imageUrl.startsWith('http')) continue let downloadedImage = null try { if (imageUrl.includes('content.api.getguru.com/files/view/')) { // Test if Guru file is publicly accessible const testResult = await downloadFile(imageUrl, { 'Accept': '*/*' }) console.log(`Public test - Status: ${testResult ? 'OK' : 'Failed'}, Content-Type: ${testResult?.contentType || 'N/A'}`) if (testResult && testResult.contentType !== 'text/html') { console.log(`File is public, downloading: ${imageUrl}`) downloadedImage = saveDownloadedFile(imageUrl, testResult.buffer, testResult.contentType, attachmentDir) } else { console.log(`File is not public or returns HTML, keeping original URL: ${imageUrl}`) } } else { const result = await downloadFile(imageUrl, headers) if (result) { downloadedImage = saveDownloadedFile(imageUrl, result.buffer, result.contentType, attachmentDir) } } } catch (error) { console.log(`Failed to download image ${imageUrl}: ${error.message}`) // Continue processing other files } if (downloadedImage) { attachedFiles.push(downloadedImage) const localPath = `/guru-attachments/${downloadedImage.filename}` processedContent = processedContent.replace( new RegExp(imageUrl.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g'), localPath ) } } // Process other files if (otherFileUrls.length > 0) { console.log(`Found ${otherFileUrls.length} non-image attachment(s) in card`) for (const fileUrl of otherFileUrls) { try { const result = await downloadFile(fileUrl, headers) if (result) { const downloadedFile = saveDownloadedFile(fileUrl, result.buffer, result.contentType, attachmentDir) attachedFiles.push(downloadedFile) } } catch (error) { console.log(`Failed to download file ${fileUrl}: ${error.message}`) // Continue processing other files } } } if (attachedFiles.length > 0) { console.log(`Downloaded ${attachedFiles.length} file(s) total`) } return { processedContent, attachedFiles } } /** * Initialize and configure TurndownService for HTML to Markdown conversion */ const createTurndownService = () => { const turndownService = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' }) // Add Guru code block handling turndownService.addRule('guruCodeBlock', { filter: function(node) { return node.nodeName === 'PRE' && node.classList.contains('ghq-card-content__code-block') }, replacement: function(content, node) { console.log('DEBUG: GURU CODE BLOCK CONVERSION RUNNING!') // Extract language from data attribute const language = node.getAttribute('data-ghq-code-block-syntax') || node.getAttribute('data-ghq-code-block-prism') || '' // Get all code line elements const codeLines = Array.from(node.querySelectorAll('.ghq-card-content__code-block-line')) if (codeLines.length === 0) { // Fallback to regular content if no line elements found const codeContent = (node.textContent || '').trim() return `\n\n\`\`\`${language.toLowerCase()}\n${codeContent}\n\`\`\`\n\n` } // Extract text content from each line const lines = codeLines.map(line => line.textContent || '').join('\n') console.log(`DEBUG: Code block conversion - Language: ${language}, Lines: ${codeLines.length}`) console.log(`DEBUG: Code content preview: ${lines.substring(0, 100)}...`) return `\n\n\`\`\`${language.toLowerCase()}\n${lines}\n\`\`\`\n\n` } }) // Add complete table handling turndownService.addRule('guruTable', { filter: 'table', replacement: function(content, node) { console.log('DEBUG: TABLE CONVERSION RUNNING - FIXED VERSION!') const rows = Array.from(node.querySelectorAll('tr')) if (rows.length === 0) return '' const tableRows = [] rows.forEach((row, rowIndex) => { const cells = Array.from(row.querySelectorAll('td, th')) if (cells.length === 0) return const cellTexts = cells.map(cell => { // Handle paragraphs specially - join with <br> const paragraphs = Array.from(cell.querySelectorAll('p')) let cellContent = '' if (paragraphs.length > 0) { // Join multiple paragraphs with <br> cellContent = paragraphs .map(p => (p.textContent || '').trim()) .filter(text => text.length > 0) .join('<br>') } else if (cell.querySelector('img, a, strong, em, code, ul, ol')) { // If cell has complex content, process it through TurndownService recursively const TurndownService = require('turndown') const cellTurndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' }) // Convert HTML to markdown for this cell cellContent = cellTurndown.turndown(cell.innerHTML || '').trim() } else { // Simple text content cellContent = (cell.textContent || '').trim() } // Escape pipe characters for markdown table return cellContent.replace(/\|/g, '\\|') }) // Add the data row tableRows.push('| ' + cellTexts.join(' | ') + ' |') // Add separator after first row (header) if (rowIndex === 0) { tableRows.push('|' + cells.map(() => ' --- ').join('|') + '|') } }) const result = '\n\n' + tableRows.join('\n') + '\n\n' console.log('DEBUG: Table conversion result:', result) return result } }) // Prevent default handling of Guru code block lines (handled by guruCodeBlock rule) turndownService.addRule('guruCodeBlockLine', { filter: function(node) { return node.nodeName === 'CODE' && node.classList.contains('ghq-card-content__code-block-line') }, replacement: function() { return '' } }) // Remove default table element handling to prevent interference turndownService.addRule('tableCell', { filter: ['td', 'th'], replacement: function() { return '' } }) turndownService.addRule('tableRow', { filter: 'tr', replacement: function() { return '' } }) return turndownService } /** * Process a single card's content */ const processCardContent = async (card, allCards, downloadAttachments, attachmentDir, headers) => { let { processedContent, attachedFiles } = { processedContent: card.content || '', attachedFiles: [] } if (downloadAttachments) { const result = await processAttachments(card, headers, attachmentDir) processedContent = result.processedContent attachedFiles = result.attachedFiles } // Convert internal Guru links to local links processedContent = convertInternalLinks(processedContent, card, allCards) // Convert processed HTML content to markdown const turndownService = createTurndownService() // Decode HTML entities before conversion to ensure proper rendering in markdown const decodedContent = processedContent ? he.decode(processedContent) : '' const markdownContent = decodedContent ? turndownService.turndown(decodedContent) : '' return { markdownContent, attachedFiles } } /** * Filter cards based on verification state */ const filterCardsByVerification = (cards, onlyVerified) => { if (!onlyVerified) return cards // Strategy: Include all TRUSTED cards, but for NEEDS_VERIFICATION cards, // only include them if there's no TRUSTED version with the same title const trustedCards = cards.filter(card => card.verificationState === 'TRUSTED') const unverifiedCards = cards.filter(card => card.verificationState === 'NEEDS_VERIFICATION') // Get titles of all trusted cards (normalized), prioritizing preferredPhrase const trustedTitles = new Set(trustedCards.map(card => (card.preferredPhrase || card.title || 'Untitled').toLowerCase().trim() )) // Include unverified cards only if their title doesn't exist in trusted cards const uniqueUnverifiedCards = unverifiedCards.filter(card => { const cardTitle = (card.preferredPhrase || card.title || 'Untitled').toLowerCase().trim() return !trustedTitles.has(cardTitle) }) const filteredCards = [...trustedCards, ...uniqueUnverifiedCards] console.log(`Processed ${filteredCards.length} cards (${trustedCards.length} trusted, ${uniqueUnverifiedCards.length} unique unverified) from ${cards.length} total`) return filteredCards } module.exports = { createCardMap, convertInternalLinks, processAttachments, saveDownloadedFile, createTurndownService, processCardContent, filterCardsByVerification }