article-summarizer-jp
Version:
CLI tool for summarizing web articles in Japanese using Anthropic Claude API. Fetches content from URLs and generates both 3-line summaries and full translations in polite Japanese.
234 lines (227 loc) • 10.9 kB
JavaScript
import Anthropic from '@anthropic-ai/sdk';
import { config } from './config.js';
import { extractThumbnailFromHtml } from './thumbnailExtractor.js';
function cleanSummaryOutput(rawSummary) {
// Remove common AI explanation patterns from summary
const cleaningPatterns = [
// Remove introductions
/^(Here's a 3-line summary in polite Japanese|以下が3行のまとめです|3行まとめは以下の通りです).*?[::]?\s*\n+/i,
/^(以下に|こちらが).*?3行.*?(まとめ|要約).*?[::]?\s*\n+/i,
// Remove conclusions
/\n+.*?(以上が|これが).*?3行.*?(まとめ|要約).*?$/i,
/\n+.*?となります。?$/i,
];
let cleaned = rawSummary;
// Apply cleaning patterns
for (const pattern of cleaningPatterns) {
cleaned = cleaned.replace(pattern, '');
}
// Remove excessive whitespace
cleaned = cleaned
.replace(/\n{2,}/g, '\n') // Multiple line breaks to single
.replace(/^\s+/, '') // Leading whitespace
.replace(/\s+$/, ''); // Trailing whitespace
return cleaned;
}
function cleanDetailsOutput(rawDetails) {
// Remove HTML tags except for media-related ones
let cleaned = rawDetails;
// Preserve image and video tags by temporarily replacing them
const mediaPlaceholders = {};
let placeholderIndex = 0;
// Preserve markdown images
cleaned = cleaned.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (match) => {
const placeholder = `__MEDIA_PLACEHOLDER_${placeholderIndex++}__`;
mediaPlaceholders[placeholder] = match;
return placeholder;
});
// Preserve markdown links for videos
cleaned = cleaned.replace(/\[Video[^]]*\]\([^)]+\)/g, (match) => {
const placeholder = `__MEDIA_PLACEHOLDER_${placeholderIndex++}__`;
mediaPlaceholders[placeholder] = match;
return placeholder;
});
// Remove all HTML tags
cleaned = cleaned.replace(/<[^>]+>/g, '');
// Restore media placeholders
Object.keys(mediaPlaceholders).forEach(placeholder => {
cleaned = cleaned.replace(placeholder, mediaPlaceholders[placeholder]);
});
// Remove common AI explanation patterns
const cleaningPatterns = [
// Remove explanatory introductions
/^(はい、)?以下に?.*?(詳細|内容|説明)を?(日本語で)?.*?(提供|記載|説明)(いたします|します).*?\n\n?/i,
/^記事の詳細.*?[::]?\s*\n+/i,
/^Details of the article.*?[::]?\s*\n+/i,
// Remove meta-commentary
/\n\n?注[::].*$/i,
/\n\n?(以上が|これで).*?(詳細|説明).*?(です|となります)\.?$/i,
];
// Apply cleaning patterns
for (const pattern of cleaningPatterns) {
cleaned = cleaned.replace(pattern, '');
}
// Remove excessive whitespace
cleaned = cleaned
.replace(/\n{3,}/g, '\n\n') // Multiple line breaks to double
.replace(/^\s+/, '') // Leading whitespace
.replace(/\s+$/, ''); // Trailing whitespace
return cleaned;
}
function truncateContent(content, maxTokens = 100000) {
// Claude side prompt overhead (≈2k tokens) を見越して余裕を確保
const reservedPromptTokens = 2000;
const effectiveMaxTokens = maxTokens - reservedPromptTokens;
// More conservative token estimation: 1 token ≈ 3 characters for mixed content
const maxChars = effectiveMaxTokens * 3;
console.log(` 🔍 コンテンツ長: ${content.length}文字, 制限: ${maxChars}文字 (${effectiveMaxTokens}トークン + ${reservedPromptTokens}予約)`);
if (content.length <= maxChars) {
console.log(' ✅ コンテンツは制限内です');
return content;
}
console.log(' ⚠️ コンテンツが長すぎます。切り詰めます...');
// Try to truncate at paragraph or sentence boundaries
const truncated = content.substring(0, maxChars);
// Find the last paragraph break
const lastParagraph = truncated.lastIndexOf('\n\n');
if (lastParagraph > maxChars * 0.8) {
const result = truncated.substring(0, lastParagraph);
console.log(` ✂️ 段落区切りで切り詰め: ${result.length}文字`);
return result;
}
// Find the last sentence break
const lastSentence = Math.max(truncated.lastIndexOf('。'), truncated.lastIndexOf('.'), truncated.lastIndexOf('!'), truncated.lastIndexOf('?'));
if (lastSentence > maxChars * 0.8) {
const result = truncated.substring(0, lastSentence + 1);
console.log(` ✂️ 文区切りで切り詰め: ${result.length}文字`);
return result;
}
// Fallback(上限ギリギリを死守)
const result = truncated.slice(0, maxChars - 3) + '...';
console.log(` ✂️ 文字数制限で切り詰め: ${result.length}文字`);
return result;
}
async function generateCombinedSummaryData(title, truncatedContent, anthropic) {
const tool = {
name: 'extract_article_summary',
description: 'Extract comprehensive article summary data including title translation, summary, and tags in Japanese',
input_schema: {
type: 'object',
properties: {
translatedTitle: {
type: 'string',
description: 'Article title translated to natural Japanese. If the original title is already in Japanese, return it as-is. Always respond in Japanese only.'
},
summary: {
type: 'string',
description: 'Exactly 3 lines of concise Japanese summary in polite form (ですます調). Each line should capture a key point. No empty lines between the 3 lines.'
},
tags: {
type: 'array',
items: {
type: 'string'
},
description: 'Array of 3-8 relevant tags. Use Japanese for common terms, keep proper nouns in original language. Replace spaces and commas with underscores. Focus on main topics, technologies, concepts, and themes.'
}
},
required: ['translatedTitle', 'summary', 'tags']
}
};
const systemPrompt = `You are an expert Japanese content analyst and translator. You excel at creating concise summaries, natural translations, and relevant tags in Japanese.`;
const userPrompt = `Analyze the following article and extract comprehensive summary data.
**Requirements:**
- Translate title to natural Japanese (if not already Japanese)
- Create exactly 3 lines of summary in polite Japanese (ですます調)
- Generate 3-8 relevant tags using Japanese conventions
- Be concise but informative
- Focus on main ideas and conclusions
Article Title: ${title}
HTML Content:
${truncatedContent}
Use the extract_article_summary tool to provide the structured output.`;
const response = await anthropic.messages.create({
model: 'claude-3-5-sonnet-20241022',
max_tokens: 8192,
temperature: 0.3,
system: systemPrompt,
messages: [{ role: 'user', content: userPrompt }],
tools: [tool],
tool_choice: { type: 'tool', name: 'extract_article_summary' }
});
// Extract tool use result
const toolUse = response.content.find(block => block.type === 'tool_use');
if (!toolUse || toolUse.type !== 'tool_use') {
throw new Error('Tool use response not found');
}
const result = toolUse.input;
// Clean the summary output
const cleanedSummary = cleanSummaryOutput(result.summary);
return {
translatedTitle: result.translatedTitle.trim() || title,
summary: cleanedSummary,
tags: result.tags || []
};
}
// LLM-based thumbnail extraction removed - now using HTML parsing approach
async function generateDetails(title, truncatedContent, anthropic) {
const systemPrompt = `You are an expert Japanese content analyst and translator. You can analyze and translate content from any language into Japanese. Your specialty is creating detailed, comprehensive descriptions of articles in Japanese while preserving key information and media elements.`;
const userPrompt = `以下の記事コンテンツの詳細な日本語の説明を作成してください。
**要件:**
- メインコンテンツの詳細なカバレッジを提供(完全な翻訳ではなく、包括的な詳細)
- 丁寧な日本語(ですます調)で統一
- 適切なmardown形式で出力
- コンテンツ内の画像や動画をmarkdown要素として含める:
- 画像:  または 
- 動画: [Video: description](url) または埋め込みコードが利用可能な場合
- 重要な技術的詳細、引用、例を保持
- 適切なヘッダーとフォーマットで構造化
- 説明、前置き、メタコメンタリーは含めない
- 詳細コンテンツから直接始める
- どの言語のコンテンツでも日本語で説明してください
記事タイトル: ${title}
HTMLコンテンツ:
${truncatedContent}`;
// Use Sonnet as default for better speed
const model = 'claude-3-5-sonnet-20241022';
const response = await anthropic.messages.create({
model,
max_tokens: 8192,
temperature: 0.1,
system: systemPrompt,
messages: [{ role: 'user', content: userPrompt }]
});
const rawDetails = response.content
.filter(block => block.type === 'text')
.map(block => block.text)
.join('\n')
.trim();
return cleanDetailsOutput(rawDetails);
}
export async function summarizeContent(title, htmlContent, baseUrl, isSilent = false) {
const apiKey = config.getApiKey();
const anthropic = new Anthropic({ apiKey });
try {
// 1回だけコンテンツを切り詰めて両方に使用
const truncatedContent = truncateContent(htmlContent);
if (!isSilent) {
console.log(' 🔄 要約・タイトル・タグを生成中...');
}
const { summary, translatedTitle, tags } = await generateCombinedSummaryData(title, truncatedContent, anthropic);
if (!isSilent) {
console.log(' 🔄 サムネイル画像を抽出中...');
}
const validImageUrl = extractThumbnailFromHtml(htmlContent, baseUrl);
if (!isSilent) {
console.log(' 🔄 詳細を生成中...');
}
const details = await generateDetails(title, truncatedContent, anthropic);
return { summary, details, translatedTitle, tags, validImageUrl };
}
catch (error) {
if (error instanceof Anthropic.APIError) {
throw new Error(`Claude API error: ${error.message}`);
}
throw error;
}
}
//# sourceMappingURL=summarizer.js.map