article-summarizer-jp
Version:
CLI tool for summarizing web articles in Japanese using the Anthropic Claude API. Fetches content from URLs and generates both 3-line summaries and full translations in polite Japanese.
185 lines • 6.82 kB
JavaScript
import { extract } from '@extractus/article-extractor';
import { JSDOM } from 'jsdom';
/**
 * Invokes a callback while console.error, console.warn and console.log are
 * swapped for no-op functions, then puts the original methods back.
 *
 * Restoration happens in a `finally` clause, so the originals return even when
 * the callback throws. The callback's result is returned without being awaited,
 * so only output produced before `fn` returns is silenced.
 *
 * @param {Function} fn - Zero-argument callback to run with console output muted.
 * @returns {*} Whatever `fn` returns.
 */
function suppressConsole(fn) {
  const saved = {
    error: console.error,
    warn: console.warn,
    log: console.log,
  };
  // Silence console output for the duration of the callback.
  Object.assign(console, {
    error: () => {},
    warn: () => {},
    log: () => {},
  });
  try {
    return fn();
  } finally {
    // Put the real console methods back, whether or not fn threw.
    Object.assign(console, saved);
  }
}
/**
 * Returns the HTML of the most likely article container in a page.
 *
 * The page is parsed with JSDOM (console output muted during construction),
 * script/style/navigation/advertising elements are removed, and a fixed list of
 * content selectors is tried in order. The first match whose innerHTML is longer
 * than 100 characters wins.
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {string} innerHTML of the chosen container; otherwise the body's
 *   innerHTML, or the input `html` unchanged when the body is missing or empty.
 */
function extractArticleHtml(html) {
  const { document } = suppressConsole(() => new JSDOM(html)).window;

  // Drop elements that never carry article content.
  const noiseSelector = 'script, style, noscript, nav, header, footer, aside, .navigation, .nav, .menu, .sidebar, .ads, .advertisement';
  for (const node of document.querySelectorAll(noiseSelector)) {
    node.remove();
  }

  // Candidate containers, tried in this order.
  const candidates = [
    'article',
    'main',
    '[role="main"]',
    '.main-content',
    '.content',
    '#content',
    '.post-content',
    '.entry-content',
    '.article-body',
    '.story-body',
  ];
  for (const candidate of candidates) {
    const markup = document.querySelector(candidate)?.innerHTML;
    if (markup && markup.length > 100) {
      return markup;
    }
  }

  // Nothing substantial matched: use the whole body, or the raw input as a last resort.
  return document.body?.innerHTML || html;
}
/**
 * Filters raw article text down to the lines that look like body content.
 *
 * The text is split on newlines and each line is trimmed. A line is dropped when it
 * is shorter than 10 characters, contains a run of more than 5 identical characters
 * (hasRepeatedChars), starts with a boilerplate keyword (ads, share buttons, cookie
 * notices, bylines, "read more" links, ...), starts with or consists only of a
 * date/time, or is flagged by isLikelyUIElement.
 *
 * @param {string} content - Plain text to clean.
 * @returns {string} The surviving lines joined by blank lines ('\n\n'), or '' when
 *   nothing survives.
 */
function cleanExtractedContent(content) {
  const lines = content
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0);

  // Boilerplate prefixes. English keywords end in \b so that only whole words match:
  // without it, "ad" also dropped "Administration officials said ...", "PR" dropped
  // "Prices rose ...", "follow" dropped "Following the announcement ...", and so on.
  // Japanese keywords stay as plain prefix matches because \b only understands
  // ASCII word characters.
  const excludePatterns = [
    /^(?:(?:advertisement|ads?|sponsored|PR)\b|関連記事|広告|プロモーション)/i,
    /^(?:(?:share|tweet|facebook|line)\b|シェア|ツイート)/i,
    /^(?:(?:cookies?|privacy|terms)\b|クッキー|プライバシー|利用規約)/i,
    /^(?:(?:subscribe|newsletter)\b|登録|メルマガ)/i,
    /^(?:(?:follow|social|sns)\b|フォロー)/i,
    /^(?:more\s+(?:news|articles)\b|その他のニュース)/i,
    /^(?:(?:navigation|menu)\b|ナビゲーション|メニュー)/i,
    /^(?:(?:categor(?:y|ies)|tags?)\b|カテゴリ|タグ)/i,
    /^(?:(?:date|time|published)\b|日時|投稿日)/i,
    /^(?:(?:author|writer)\b|著者|筆者)/i,
    /^(?:(?:source|via)\b|出典|引用元)/i,
    /^(?:read\s+more\b|続きを読む|もっと見る)/i,
    /^(?:back\s+to\b|戻る|トップに戻る)/i,
    /^\d{4}[-\/年]\d{1,2}[-\/月]\d{1,2}/, // Lines that start with a date
    /^[\d\s\-\/年月日時分秒:]+$/, // Lines made up only of date/time characters
  ];

  // Content length filters
  const minLineLength = 10;
  const maxRepeatedChars = 5;

  const cleanedLines = lines.filter((line) => {
    // Too short to be a sentence.
    if (line.length < minLineLength) {
      return false;
    }
    // Separator-style lines such as "------" or "======".
    if (hasRepeatedChars(line, maxRepeatedChars)) {
      return false;
    }
    // Boilerplate keyword or date at the start of the line.
    if (excludePatterns.some((pattern) => pattern.test(line))) {
      return false;
    }
    // Navigation or other UI chrome.
    return !isLikelyUIElement(line);
  });

  return cleanedLines.join('\n\n');
}
/**
 * Reports whether `text` contains a run of more than `maxRepeated` identical
 * consecutive characters (compared per UTF-16 code unit via indexing).
 *
 * @param {string} text - Text to scan.
 * @param {number} maxRepeated - Longest run length that is still acceptable.
 * @returns {boolean} True when some run is longer than `maxRepeated`.
 */
function hasRepeatedChars(text, maxRepeated) {
  // A run starting at or after this index cannot be long enough to exceed the limit.
  const stopAt = text.length - maxRepeated;
  let start = 0;
  while (start < stopAt) {
    // Advance `end` to the first position that differs from text[start].
    let end = start + 1;
    while (end < text.length && text[end] === text[start]) {
      end += 1;
    }
    if (end - start > maxRepeated) {
      return true;
    }
    start += 1;
  }
  return false;
}
/**
 * Heuristically decides whether a line of text is interface chrome rather than
 * article content.
 *
 * A line is flagged when it consists only of brackets, only of digits and simple
 * symbols, or only of arrow/bullet glyphs, or when it starts with a call-to-action
 * word ("click", "tap", "press", "here", "above", "below" and their Japanese
 * equivalents).
 *
 * @param {string} line - A single line of text.
 * @returns {boolean} True when the line matches one of the UI patterns.
 */
function isLikelyUIElement(line) {
  const uiPatterns = [
    /^[<>«»‹›\[\](){}]+$/, // Bracket-only content
    /^[\d\s\-+*.]+$/, // Number/symbol-only content
    /^\s*[▼▲►◄△▽]+\s*$/, // Arrow symbols
    /^\s*[■□●○◆◇★☆]+\s*$/, // Symbol bullets
    // English keywords end in \b so only whole words match; without it "press"
    // flagged "Pressure ...", "tap" flagged "Tapping ...", and "here" flagged
    // "Hereditary ...". Japanese keywords stay as plain prefixes because \b only
    // understands ASCII word characters.
    /^(?:(?:click|tap|press)\b|クリック|タップ|プレス)/i,
    /^(?:(?:here|above|below)\b|こちら|ここ|上記|下記)/i,
  ];
  return uiPatterns.some((pattern) => pattern.test(line));
}
/**
 * Extracts a title, cleaned plain-text body, and article HTML from a raw HTML page.
 *
 * Strategy:
 *  1. Run article-extractor's `extract` on the HTML. If it yields content longer
 *     than 100 characters whose cleaned text is also longer than 100 characters,
 *     return that result.
 *  2. Otherwise (including when `extract` throws) fall back to a manual DOM scan:
 *     take the title from <title>, the first <h1> or the og:title meta tag, and the
 *     text from the first well-known content container, or from <body> if none
 *     matches.
 *
 * In the fallback, candidates that contain only whitespace are skipped, so an empty
 * <title> or an empty <article> shell does not hide a usable value further down.
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {Promise<{title: string, content: string, htmlContent: string}>}
 *   `title` is 'Untitled' when none is found; `content` is the output of
 *   cleanExtractedContent; `htmlContent` is the output of extractArticleHtml.
 */
export async function extractTextContent(html) {
  // Extract article HTML for LLM processing
  const htmlContent = extractArticleHtml(html);

  try {
    // Try using article-extractor first
    const article = await extract(html);
    if (article?.content && article.content.length > 100) {
      // Strip the markup by reading the parsed body's text
      const articleDom = suppressConsole(() => new JSDOM(article.content));
      const textContent = articleDom.window.document.body.textContent || '';
      const cleanedContent = cleanExtractedContent(textContent);
      if (cleanedContent.length > 100) {
        return {
          title: article.title || 'Untitled',
          content: cleanedContent.trim(),
          htmlContent,
        };
      }
    }
  } catch {
    // Deliberate best-effort: any extractor failure falls through to the manual scan.
  }

  // Fallback to basic extraction
  const dom = suppressConsole(() => new JSDOM(html));
  const document = dom.window.document;

  // First title candidate that is non-empty after trimming.
  const titleCandidates = [
    document.querySelector('title')?.textContent,
    document.querySelector('h1')?.textContent,
    document.querySelector('meta[property="og:title"]')?.getAttribute('content'),
  ];
  const title = titleCandidates.map((candidate) => candidate?.trim()).find(Boolean) ?? 'Untitled';

  // Remove script and style elements so their source does not leak into the text
  document.querySelectorAll('script, style, noscript').forEach((el) => el.remove());

  // Try to find main content areas
  const contentSelectors = [
    'article',
    'main',
    '[role="main"]',
    '.main-content',
    '.content',
    '#content',
    '.post-content',
    '.entry-content',
    '.article-body',
    '.story-body',
  ];
  let content = '';
  for (const selector of contentSelectors) {
    const text = document.querySelector(selector)?.textContent;
    // A container holding only whitespace is not content; keep looking.
    if (text?.trim()) {
      content = text;
      break;
    }
  }

  // If no content found, try to get from body
  if (!content) {
    content = document.body?.textContent || '';
  }

  const cleanedContent = cleanExtractedContent(content);
  return {
    title,
    content: cleanedContent.trim(),
    htmlContent,
  };
}
//# sourceMappingURL=extractor.js.map