article-summarizer-jp
Version:
CLI tool for summarizing web articles in Japanese using the Anthropic Claude API. Fetches content from URLs and generates both 3-line summaries and full translations in polite Japanese.
290 lines • 11.5 kB
JavaScript
import fetch from 'node-fetch';
import { launch } from 'puppeteer';
import { extractTextContent } from './extractor.js';
import PDFParser from 'pdf2json';
import { escape as htmlEscape } from 'html-escaper';
/**
 * Decides whether a URL most likely points at a PDF document.
 *
 * A parseable URL counts as a PDF when its path ends in ".pdf"
 * (case-insensitive) or when it is an arxiv.org link whose path contains
 * "/pdf/". A string that cannot be parsed as a URL is matched against a
 * looser pattern instead.
 *
 * @param {string} url - Absolute URL, or any string that may resemble one.
 * @returns {boolean} True when the URL looks like a PDF resource.
 */
function isPdfUrl(url) {
    let target;
    try {
        target = new URL(url);
    }
    catch {
        // Not a parseable URL: accept ".pdf" only when it is followed by a
        // query string, a fragment, or the end of the input.
        return /\.pdf(\?|#|$)/.test(url.toLowerCase());
    }
    const { hostname, pathname } = target;
    const hasPdfExtension = pathname.toLowerCase().endsWith('.pdf');
    // arXiv serves PDFs from extension-less paths such as /pdf/<id>.
    const isArxivPdf = hostname.toLowerCase() === 'arxiv.org' && pathname.includes('/pdf/');
    return hasPdfExtension || isArxivPdf;
}
/**
 * Decodes one text run emitted by the PDF parser.
 *
 * Runs are expected to be URI-encoded, but a run containing a stray "%" makes
 * decodeURIComponent throw URIError. In that case the raw text is kept so one
 * bad run does not discard the whole document.
 *
 * @param {string} encoded - Value of a text run's `T` field.
 * @returns {string} Decoded text, or the input unchanged when it cannot be decoded.
 */
function decodePdfTextRun(encoded) {
    try {
        return decodeURIComponent(encoded);
    }
    catch {
        return encoded;
    }
}
/**
 * Flattens parsed PDF data into plain text: every run is followed by a space
 * and every page that has a `Texts` list is terminated by a newline.
 *
 * @param {object} pdfData - Object delivered by the `pdfParser_dataReady` event.
 * @returns {string} Concatenated text of all pages (untrimmed).
 */
function collectPdfText(pdfData) {
    let content = '';
    for (const page of pdfData.Pages || []) {
        if (!page.Texts) {
            continue;
        }
        for (const textItem of page.Texts) {
            for (const run of textItem.R || []) {
                if (run.T) {
                    content += `${decodePdfTextRun(run.T)} `;
                }
            }
        }
        content += '\n';
    }
    return content;
}
/**
 * Downloads a PDF and converts it into the result shape used for articles.
 *
 * While the parser runs, console.log/warn/error and process.stdout/stderr
 * writes are replaced with no-ops to silence parser output. Note that this
 * mutes the whole process, not just the parser, until parsing settles.
 *
 * @param {string} url - URL of the PDF to download.
 * @returns {Promise<{title: string, content: string, extractedUrl: string, htmlContent: string}>}
 *   Extracted title (or 'PDF Document'), trimmed text, the source URL, and a
 *   minimal escaped HTML wrapper around the text.
 * @throws {Error} On a non-2xx HTTP response, a parser error, or a failure
 *   while extracting text from the parsed data.
 */
async function fetchPdfContent(url) {
    const response = await fetch(url, {
        headers: {
            'User-Agent': 'Mozilla/5.0 (compatible; ArticleSummarizer/1.0)',
        },
    });
    if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`);
    }
    const buffer = Buffer.from(await response.arrayBuffer());
    return new Promise((resolve, reject) => {
        // Remember the real output functions so they can be put back later.
        const originalConsoleLog = console.log;
        const originalConsoleWarn = console.warn;
        const originalConsoleError = console.error;
        const originalStdoutWrite = process.stdout.write;
        const originalStderrWrite = process.stderr.write;
        let restored = false;
        const cleanup = () => {
            // Idempotent: may be reached from an event handler and the catch below.
            if (restored) {
                return;
            }
            restored = true;
            console.log = originalConsoleLog;
            console.warn = originalConsoleWarn;
            console.error = originalConsoleError;
            process.stdout.write = originalStdoutWrite;
            process.stderr.write = originalStderrWrite;
        };
        // Aggressively suppress all console output during PDF parsing
        console.log = () => { };
        console.warn = () => { };
        console.error = () => { };
        process.stdout.write = () => true;
        process.stderr.write = () => true;
        try {
            const pdfParser = new PDFParser();
            pdfParser.on('pdfParser_dataError', (errData) => {
                cleanup();
                reject(new Error(`PDF parsing error: ${errData.parserError}`));
            });
            pdfParser.on('pdfParser_dataReady', (pdfData) => {
                cleanup();
                try {
                    const content = collectPdfText(pdfData);
                    const title = extractTitleFromPdfText(content) || 'PDF Document';
                    // Create a simple HTML structure for consistency with proper escaping
                    const htmlContent = `<html><head><title>${htmlEscape(title)}</title></head><body><pre>${htmlEscape(content)}</pre></body></html>`;
                    resolve({
                        title,
                        content: content.trim(),
                        extractedUrl: url,
                        htmlContent,
                    });
                }
                catch (error) {
                    reject(new Error(`PDF text extraction error: ${error}`));
                }
            });
            // Parse the PDF buffer
            pdfParser.parseBuffer(buffer);
        }
        catch (error) {
            // A synchronous failure in the parser would otherwise leave the
            // process permanently muted.
            cleanup();
            reject(error);
        }
    });
}
// Number of leading non-blank lines inspected when looking for a title.
const TITLE_SEARCH_LINES = 10;
// A title must be strictly longer than this many characters...
const MIN_TITLE_LENGTH = 10;
// ...and strictly shorter than this many.
const MAX_TITLE_LENGTH = 200;
/**
 * Picks a plausible document title from text extracted from a PDF.
 *
 * Only the first TITLE_SEARCH_LINES non-blank lines are considered. The first
 * one whose trimmed length lies strictly between MIN_TITLE_LENGTH and
 * MAX_TITLE_LENGTH wins, unless it is purely numeric or contains "Page " or
 * "©".
 *
 * @param {string} text - Newline-separated text.
 * @returns {string|null} The trimmed title line, or null when none qualifies.
 */
function extractTitleFromPdfText(text) {
    const candidates = text
        .split('\n')
        .map((rawLine) => rawLine.trim())
        .filter((candidate) => candidate.length > 0)
        .slice(0, TITLE_SEARCH_LINES);
    const title = candidates.find((candidate) => {
        const hasPlausibleLength = candidate.length > MIN_TITLE_LENGTH && candidate.length < MAX_TITLE_LENGTH;
        // Page numbers, running headers and copyright notices are not titles.
        const looksLikeBoilerplate = /^\d+$/.test(candidate) || candidate.includes('Page ') || candidate.includes('©');
        return hasPlausibleLength && !looksLikeBoilerplate;
    });
    return title === undefined ? null : title;
}
// Minimum extracted-content length (in characters) treated as a usable article.
const MIN_CONTENT_LENGTH = 100;
/**
 * Fetches a URL and extracts its title and main text.
 *
 * Strategy:
 *   1. PDF-looking URLs are handed to fetchPdfContent.
 *   2. Otherwise a plain HTTP fetch is tried and its HTML passed to
 *      extractTextContent.
 *   3. If that fails, or yields fewer than MIN_CONTENT_LENGTH characters, the
 *      page is rendered in headless Chromium via Puppeteer and extracted again.
 *
 * @param {string} url - Absolute URL of the article or PDF.
 * @param {boolean} [isSilent=false] - Suppress user-facing progress messages.
 * @param {boolean} [debug=false] - Print "[DEBUG]" diagnostics (not affected by isSilent).
 * @returns {Promise<{title: *, content: *, extractedUrl: string, htmlContent: *}>}
 *   title/content/htmlContent come from extractTextContent (or fetchPdfContent);
 *   content is presumably a string, since its length and substring are used here.
 * @throws {Error} 'Invalid URL provided' when the URL cannot be parsed;
 *   'Could not extract meaningful content from the page' when the browser
 *   fallback also yields too little text. Errors from the PDF path or from
 *   Puppeteer launch and navigation propagate unchanged.
 */
export async function fetchContent(url, isSilent = false, debug = false) {
    // Validate and normalize URL
    let parsedUrl;
    try {
        parsedUrl = new URL(url);
    }
    catch {
        throw new Error('Invalid URL provided');
    }
    // Check if it's a PDF URL and handle it specially
    if (isPdfUrl(parsedUrl.toString())) {
        if (!isSilent) {
            console.log(' 📄 PDFファイルを検出しました。PDF解析を開始します...');
        }
        return await fetchPdfContent(parsedUrl.toString());
    }
    let fallbackReason = '';
    // Try regular fetch first
    try {
        if (debug) {
            console.log('[DEBUG] 通常のfetchを試行中...');
        }
        const response = await fetch(parsedUrl.toString(), {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; ArticleSummarizer/1.0)',
            },
        });
        if (!response.ok) {
            throw new Error(`HTTP error! status: ${response.status}`);
        }
        const html = await response.text();
        if (debug) {
            console.log(`[DEBUG] 取得したHTML長: ${html.length}文字`);
        }
        const { title, content, htmlContent } = await extractTextContent(html, debug);
        if (debug) {
            console.log(`[DEBUG] 抽出したコンテンツ長: ${content.length}文字`);
            console.log(`[DEBUG] タイトル: ${title}`);
        }
        // Same acceptance rule as the browser path below, so a page at exactly
        // the threshold does not trigger a pointless browser launch.
        if (content.length >= MIN_CONTENT_LENGTH) {
            return { title, content, extractedUrl: parsedUrl.toString(), htmlContent };
        }
        fallbackReason = `コンテンツが不十分 (${content.length}文字)`;
    }
    catch (error) {
        // Deliberate best-effort: any failure here just triggers the fallback.
        fallbackReason = `fetchエラー: ${error instanceof Error ? error.message : String(error)}`;
    }
    if (!isSilent) {
        console.log(` 🔄 ${fallbackReason} - CSR(ヘッドレスブラウザ)を実行中...`);
    }
    if (debug) {
        console.log(`[DEBUG] フォールバック理由: ${fallbackReason}`);
        console.log('[DEBUG] Puppeteerを起動中...');
    }
    // Fallback to headless browser
    // NOTE(review): '--no-sandbox' and '--disable-web-security' weaken browser
    // isolation while loading arbitrary user-supplied URLs; confirm they are
    // still required in the deployment environment.
    const browser = await launch({
        headless: true,
        args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-logging',
            '--log-level=3',
            '--silent',
            '--disable-background-timer-throttling',
            '--disable-backgrounding-occluded-windows',
            '--disable-renderer-backgrounding',
            '--disable-web-security',
            '--disable-features=TranslateUI',
            '--disable-ipc-flooding-protection',
            '--disable-breakpad',
            '--disable-client-side-phishing-detection',
            '--disable-sync',
            '--disable-default-apps',
            '--disable-extensions',
        ],
        pipe: true, // Use pipes instead of shared memory
        dumpio: false, // Disable dumping of stdout/stderr
    });
    try {
        const page = await browser.newPage();
        // Set viewport to simulate a real browser
        await page.setViewport({ width: 1920, height: 1080 });
        // Register no-op listeners so page events never surface as output
        page.on('console', () => { });
        page.on('pageerror', () => { });
        page.on('requestfailed', () => { });
        page.on('response', () => { });
        page.on('requestfinished', () => { });
        page.on('load', () => { });
        page.on('domcontentloaded', () => { });
        // Replace the page's console with no-ops before any page script runs
        await page.evaluateOnNewDocument(() => {
            const noop = () => { };
            window.console = {
                log: noop,
                error: noop,
                warn: noop,
                info: noop,
                debug: noop,
                trace: noop,
                dir: noop,
                dirxml: noop,
                group: noop,
                groupCollapsed: noop,
                groupEnd: noop,
                time: noop,
                timeEnd: noop,
                timeStamp: noop,
                table: noop,
                clear: noop,
                count: noop,
                assert: noop,
                profile: noop,
                profileEnd: noop,
            };
        });
        // Set more realistic user agent to avoid bot detection
        await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        if (debug) {
            console.log('[DEBUG] ページへ移動中...');
        }
        // Navigate to the URL with increased timeout
        await page.goto(parsedUrl.toString(), {
            waitUntil: 'networkidle2',
            timeout: 30000,
        });
        if (debug) {
            console.log('[DEBUG] ページ読み込み完了、コンテンツセレクタを待機中...');
        }
        // Wait for common content selectors; a timeout here is not fatal
        await page
            .waitForSelector('article, main, .content, #content, body', {
                timeout: 5000,
            })
            .catch(() => {
                if (debug) {
                    console.log('[DEBUG] コンテンツセレクタが見つかりませんでした');
                }
            });
        // Additional wait for JavaScript rendering
        await new Promise((resolve) => setTimeout(resolve, 2000));
        if (debug) {
            console.log('[DEBUG] 追加待機完了、HTMLを取得中...');
        }
        // Get page content
        const html = await page.content();
        if (debug) {
            console.log(`[DEBUG] Puppeteerで取得したHTML長: ${html.length}文字`);
        }
        const { title, content, htmlContent } = await extractTextContent(html, debug);
        if (debug) {
            console.log(`[DEBUG] Puppeteerで抽出したコンテンツ長: ${content.length}文字`);
            console.log(`[DEBUG] タイトル: ${title}`);
            if (content.length < 500) {
                console.log(`[DEBUG] コンテンツプレビュー: ${content.substring(0, 200)}...`);
            }
        }
        if (content.length < MIN_CONTENT_LENGTH) {
            throw new Error('Could not extract meaningful content from the page');
        }
        return { title, content, extractedUrl: parsedUrl.toString(), htmlContent };
    }
    finally {
        // Always release the browser process, even when extraction fails.
        await browser.close();
    }
}
//# sourceMappingURL=fetcher.js.map