UNPKG

article-summarizer-jp

Version:

CLI tool for summarizing web articles in Japanese using Anthropic Claude API. Fetches content from URLs and generates both 3-line summaries and full translations in polite Japanese.

262 lines 10.3 kB
import { extract } from '@extractus/article-extractor'; import { JSDOM } from 'jsdom'; function suppressConsole(fn) { const originalError = console.error; const originalWarn = console.warn; const originalLog = console.log; // Suppress console output during JSDOM operations console.error = () => { }; console.warn = () => { }; console.log = () => { }; try { return fn(); } finally { // Restore original console methods console.error = originalError; console.warn = originalWarn; console.log = originalLog; } } function extractArticleHtml(html) { const dom = suppressConsole(() => new JSDOM(html)); const document = dom.window.document; // Remove script, style, and other non-content elements const elementsToRemove = document.querySelectorAll('script, style, noscript, nav, header, footer, aside, .navigation, .nav, .menu, .sidebar, .ads, .advertisement'); elementsToRemove.forEach((el) => el.remove()); // Try to find main content areas const contentSelectors = [ 'article', 'main', '[role="main"]', '.main-content', '.content', '#content', '.post-content', '.entry-content', '.article-body', '.story-body', // WIRED.jp and similar styled-components sites '.body__inner-container', '[class*="BodyWrapper"]', '[class*="article__body"]', '[class*="ArticleBody"]', ]; for (const selector of contentSelectors) { const element = document.querySelector(selector); if (element?.innerHTML && element.innerHTML.length > 100) { return element.innerHTML; } } // Fallback to body content return document.body?.innerHTML || html; } function cleanExtractedContent(content, debug = false) { const lines = content .split('\n') .map((line) => line.trim()) .filter((line) => line.length > 0); const cleanedLines = []; if (debug) { console.log(`[DEBUG cleaner] 入力行数: ${lines.length}`); if (lines.length > 0) { console.log(`[DEBUG cleaner] 最初の行の長さ: ${lines[0].length}文字`); } } // Common patterns to exclude const excludePatterns = [ /^(advertisement|ad|sponsored|関連記事|広告|PR|プロモーション)/i, /^(share|シェア|tweet|ツイート|facebook|line)/i, /^(cookie|クッキー|privacy|プライバシー|利用規約|terms)/i, /^(subscribe|登録|newsletter|メルマガ)/i, /^(follow|フォロー|social|sns)/i, /^(more\s+(news|articles)|その他のニュース|関連記事)/i, /^(navigation|ナビゲーション|menu|メニュー)/i, /^(category|カテゴリ|tag|タグ)/i, /^(date|日時|time|published|投稿日)/i, /^(author|著者|writer|筆者)/i, /^(source|出典|via|引用元)/i, /^(read\s+more|続きを読む|もっと見る)/i, /^(back\s+to|戻る|トップに戻る)/i, /^\d{4}[-\/年]\d{1,2}[-\/月]\d{1,2}/, // Date patterns /^[\d\s\-\/年月日時分秒:]+$/, // Time/date only lines /^[ \s]*$/, // Empty or whitespace-only lines ]; // Content length filters const minLineLength = 10; const maxRepeatedChars = 5; // Long lines (like article paragraphs) should not be filtered by repeated chars const longLineThreshold = 100; for (const line of lines) { // Skip lines that are too short if (line.length < minLineLength) { if (debug) console.log(`[DEBUG cleaner] 短すぎてスキップ: "${line.substring(0, 50)}..."`); continue; } // Skip lines with too many repeated characters (but not for long content lines) if (line.length < longLineThreshold && hasRepeatedChars(line, maxRepeatedChars)) { if (debug) console.log(`[DEBUG cleaner] 繰り返し文字でスキップ: "${line.substring(0, 50)}..."`); continue; } // Skip lines matching exclude patterns if (excludePatterns.some((pattern) => pattern.test(line))) { if (debug) console.log(`[DEBUG cleaner] パターンマッチでスキップ: "${line.substring(0, 50)}..."`); continue; } // Skip lines that are likely navigation or UI elements if (isLikelyUIElement(line)) { if (debug) console.log(`[DEBUG cleaner] UI要素としてスキップ: "${line.substring(0, 50)}..."`); continue; } cleanedLines.push(line); } if (debug) { console.log(`[DEBUG cleaner] クリーニング後の行数: ${cleanedLines.length}`); } return cleanedLines.join('\n\n'); } function hasRepeatedChars(text, maxRepeated) { for (let i = 0; i < text.length - maxRepeated; i++) { let count = 1; for (let j = i + 1; j < text.length && text[j] === text[i]; j++) { count++; } if (count > maxRepeated) return true; } return false; } function isLikelyUIElement(line) { // Check for UI-like patterns const uiPatterns = [ /^[<>«»‹›\[\](){}]+$/, // Bracket-only content /^[\d\s\-\+\*\.]+$/, // Number/symbol-only content /^[ \s]*[▼▲►◄△▽]+[ \s]*$/, // Arrow symbols /^[ \s]*[■□●○◆◇★☆]+[ \s]*$/, // Symbol bullets /^(click|クリック|tap|タップ|press|プレス)/i, /^(here|こちら|ここ|above|below|上記|下記)/i, ]; return uiPatterns.some((pattern) => pattern.test(line)); } export async function extractTextContent(html, debug = false) { // Extract article HTML for LLM processing const htmlContent = extractArticleHtml(html); try { // Try using article-extractor first if (debug) { console.log('[DEBUG extractor] article-extractorを試行中...'); } const article = await extract(html); if (debug) { console.log(`[DEBUG extractor] article-extractor結果: ${article?.content?.length || 0}文字`); } if (article?.content && article.content.length > 100) { // Clean up the content by removing HTML tags const dom = suppressConsole(() => new JSDOM(article.content)); const textContent = dom.window.document.body.textContent || ''; if (debug) { console.log(`[DEBUG extractor] HTML除去後: ${textContent.length}文字`); } // Apply content cleaning const cleanedContent = cleanExtractedContent(textContent, debug); if (debug) { console.log(`[DEBUG extractor] クリーニング後: ${cleanedContent.length}文字`); } if (cleanedContent.length > 100) { return { title: article.title || 'Untitled', content: cleanedContent.trim(), htmlContent, }; } } } catch (error) { if (debug) { console.log(`[DEBUG extractor] article-extractorエラー: ${error}`); } // Continue to fallback } if (debug) { console.log('[DEBUG extractor] フォールバック抽出を開始...'); } // Fallback to basic extraction const dom = suppressConsole(() => new JSDOM(html)); const document = dom.window.document; // Extract title const title = document.querySelector('title')?.textContent || document.querySelector('h1')?.textContent || document.querySelector('meta[property="og:title"]')?.getAttribute('content') || 'Untitled'; // Remove script and style elements const scripts = document.querySelectorAll('script, style, noscript'); scripts.forEach((el) => el.remove()); // Try to find main content areas const contentSelectors = [ 'article', 'main', '[role="main"]', '.main-content', '.content', '#content', '.post-content', '.entry-content', '.article-body', '.story-body', // WIRED.jp and similar styled-components sites '.body__inner-container', '[class*="BodyWrapper"]', '[class*="article__body"]', '[class*="ArticleBody"]', ]; let content = ''; for (const selector of contentSelectors) { const element = document.querySelector(selector); if (element?.textContent && element.textContent.trim().length > 100) { content = element.textContent; if (debug) { console.log(`[DEBUG extractor] セレクタ "${selector}" で${content.length}文字を抽出`); } break; } } // If no content found, try extracting all <p> tags (fallback for SPAs) if (!content || content.trim().length < 100) { if (debug) { console.log('[DEBUG extractor] <p>タグからの抽出を試行...'); } const paragraphs = document.querySelectorAll('p'); const paragraphTexts = []; paragraphs.forEach((p) => { const text = p.textContent?.trim(); if (text && text.length > 20) { paragraphTexts.push(text); } }); if (paragraphTexts.length > 0) { content = paragraphTexts.join('\n\n'); if (debug) { console.log(`[DEBUG extractor] <p>タグから${paragraphTexts.length}段落、${content.length}文字を抽出`); } } } // If still no content found, try to get from body if (!content) { content = document.body?.textContent || ''; if (debug) { console.log(`[DEBUG extractor] bodyから${content.length}文字を抽出`); } } // Clean up whitespace and apply content cleaning const cleanedContent = cleanExtractedContent(content, debug); return { title: title.trim(), content: cleanedContent.trim(), htmlContent, }; } //# sourceMappingURL=extractor.js.map