article-summarizer-jp
Version:
CLI tool for summarizing web articles in Japanese using the Anthropic Claude API. It fetches content from URLs and generates both 3-line summaries and full translations in polite Japanese.
191 lines • 7.14 kB
JavaScript
import { JSDOM } from 'jsdom';
/**
 * Calls `fn` with `console.error`, `console.warn` and `console.log`
 * temporarily replaced by no-op functions.
 *
 * The original methods are put back as soon as `fn` returns or throws, so
 * only output produced synchronously inside `fn` is silenced. An exception
 * thrown by `fn` still propagates to the caller.
 *
 * @param {Function} fn - Callback invoked with no arguments.
 * @returns {*} The value returned by `fn`.
 */
function suppressConsole(fn) {
  const silencedMethods = ['error', 'warn', 'log'];
  const savedImplementations = silencedMethods.map((method) => console[method]);

  // Each method gets its own no-op, mirroring three independent stubs.
  for (const method of silencedMethods) {
    console[method] = () => { };
  }

  try {
    return fn();
  } finally {
    // Always restore, even when fn throws.
    silencedMethods.forEach((method, index) => {
      console[method] = savedImplementations[index];
    });
  }
}
/**
 * Turns an image URL found in a page into an absolute URL.
 *
 * - URLs starting with `http://` or `https://` are returned untouched.
 * - Protocol-relative URLs (`//host/path`) get the protocol of `baseUrl`.
 * - Anything else is resolved against `baseUrl` with the `URL` constructor.
 *
 * Any failure along the way (for example an unparsable `baseUrl`, or an
 * `imageUrl` that is not a string) yields the original `imageUrl` unchanged.
 *
 * @param {string} imageUrl - URL as it appears in the document.
 * @param {string} baseUrl - URL of the page the image was found on.
 * @returns {string} The absolute URL, or `imageUrl` if conversion fails.
 */
function convertRelativeToAbsolute(imageUrl, baseUrl) {
  let resolved = imageUrl;
  try {
    const isAlreadyAbsolute = ['http://', 'https://'].some((scheme) => imageUrl.startsWith(scheme));
    if (!isAlreadyAbsolute) {
      resolved = imageUrl.startsWith('//')
        ? `${new URL(baseUrl).protocol}${imageUrl}` // borrow the page's scheme
        : new URL(imageUrl, baseUrl).href;
    }
  } catch {
    // Best effort: fall back to the untouched input.
    resolved = imageUrl;
  }
  return resolved;
}
/**
 * Removes image candidates that are unlikely to be a useful article
 * thumbnail: inline `data:` URLs, images with a known dimension under
 * 100 px, and images whose URL, alt text or class name looks like an icon,
 * logo, avatar, advertisement or similar decoration.
 *
 * The short word "ad" (and "ads") is matched only as a standalone word in
 * alt text and class names. Matching it as a plain substring would reject
 * unrelated values such as the class `lazyload` or the alt text "Road trip".
 * All longer patterns are still matched as substrings.
 *
 * @param {Array<{url: string, alt?: string, className?: string, width?: number, height?: number}>} images
 *   Candidate images. `url` is required; a missing/zero `width` or `height`
 *   is treated as "unknown" and does not disqualify the image.
 * @returns {Array} The candidates that passed every check, in input order
 *   (same object references).
 */
function filterUnwantedImages(images) {
  const minDimension = 100;
  const unwantedUrlPatterns = [
    '/favicon', '/icon', '/logo', '/avatar', '/ad/', '/ads/',
    'favicon.', 'logo.', 'icon.', 'avatar.', 'sprite.',
    'placeholder', 'default', 'thumb', 'mini',
  ];
  const unwantedAltPatterns = [
    'icon', 'logo', 'avatar', 'advertisement', 'sponsor',
    'favicon', 'button', 'arrow', 'bullet',
  ];
  const unwantedClassPatterns = [
    'icon', 'logo', 'avatar', 'advertisement',
    'favicon', 'sprite', 'button',
  ];
  // "ad" / "ads" not directly preceded or followed by another letter,
  // e.g. "ad", "ad-banner", "top_ad", "sponsored ads" — but not "header".
  const adWordPattern = /(?<!\p{L})ads?(?!\p{L})/u;

  const containsAny = (text, patterns) => patterns.some((pattern) => text.includes(pattern));
  // A falsy size (0, undefined, NaN) means the dimension is unknown.
  const isTooSmall = (size) => Boolean(size) && size < minDimension;

  return images.filter((image) => {
    const url = image.url.toLowerCase();
    const alt = (image.alt || '').toLowerCase();
    const className = (image.className || '').toLowerCase();

    if (url.startsWith('data:')) {
      return false;
    }
    if (isTooSmall(image.width) || isTooSmall(image.height)) {
      return false;
    }
    if (containsAny(url, unwantedUrlPatterns)) {
      return false;
    }
    if (containsAny(alt, unwantedAltPatterns) || adWordPattern.test(alt)) {
      return false;
    }
    if (containsAny(className, unwantedClassPatterns) || adWordPattern.test(className)) {
      return false;
    }
    return true;
  });
}
/**
 * Picks the URL to use for an `<img>` element.
 *
 * Normally this is `src`, falling back to `data-src`. When `src` is an
 * inline `data:` placeholder and `data-src` is present, `data-src` is used
 * instead so the placeholder does not hide the real image address.
 *
 * @param {Element} imgElement - An `<img>` element.
 * @returns {string|null} The chosen attribute value; falsy when neither is set.
 */
function resolveImageSource(imgElement) {
  const src = imgElement.getAttribute('src');
  const lazySrc = imgElement.getAttribute('data-src');
  if (src && lazySrc && src.trim().toLowerCase().startsWith('data:')) {
    return lazySrc;
  }
  return src || lazySrc;
}

/**
 * Reads a pixel dimension from an `<img>` element: the element property
 * first, then the parsed attribute, then 0 when neither yields a number.
 *
 * @param {Element} imgElement - An `<img>` element.
 * @param {'width'|'height'} dimension - Which dimension to read.
 * @returns {number} The dimension, or 0 when unknown.
 */
function readImageDimension(imgElement, dimension) {
  return imgElement[dimension]
    || Number.parseInt(imgElement.getAttribute(dimension) || '0', 10)
    || 0;
}

/**
 * Builds the candidate record for an `<img>` element.
 *
 * @param {Element} imgElement - An `<img>` element.
 * @param {string} url - Already-resolved image URL.
 * @param {'article'|'general'} source - Where the element was found.
 * @returns {{url: string, width: number, height: number, alt: string, className: string, source: string}}
 */
function describeImage(imgElement, url, source) {
  return {
    url,
    width: readImageDimension(imgElement, 'width'),
    height: readImageDimension(imgElement, 'height'),
    alt: imgElement.alt,
    className: imgElement.className,
    source,
  };
}

/**
 * Collects thumbnail candidates from an HTML document, in priority order:
 *
 * 1. `source: 'meta'` — Open Graph / Twitter / `image_src` declarations.
 * 2. `source: 'article'` — `<img>` elements inside common article containers.
 * 3. `source: 'general'` — every other `<img>` whose URL was not collected yet.
 *
 * The HTML is parsed with JSDOM while console output is suppressed.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {string} baseUrl - Page URL, used to resolve relative image URLs.
 * @returns {Array<{url: string, source: string, width?: number, height?: number, alt?: string, className?: string}>}
 *   Candidate images; meta entries carry only `url` and `source`.
 */
function extractImagesFromHtml(html, baseUrl) {
  const dom = suppressConsole(() => new JSDOM(html));
  const { document } = dom.window;
  const images = [];
  // URLs of everything collected so far, for O(1) duplicate checks in the fallback pass.
  const knownUrls = new Set();
  const addImage = (image) => {
    images.push(image);
    knownUrls.add(image.url);
  };

  // Extract meta images (highest priority)
  const metaSelectors = [
    'meta[property="og:image"]',
    'meta[name="twitter:image"]',
    'meta[property="og:image:url"]',
    'link[rel="image_src"]',
  ];
  for (const selector of metaSelectors) {
    const element = document.querySelector(selector);
    const url = element && (element.getAttribute('content') || element.getAttribute('href'));
    if (url) {
      addImage({
        url: convertRelativeToAbsolute(url, baseUrl),
        source: 'meta',
      });
    }
  }

  // Extract images from article content areas
  const articleSelectors = [
    'article img',
    'main img',
    '.content img',
    '.article-body img',
    '.post-content img',
    '.entry-content img',
    '.story-body img',
  ];
  // The selectors overlap (e.g. an <img> inside <main><article>), so remember
  // which elements were already recorded to avoid duplicate entries.
  const seenElements = new Set();
  for (const selector of articleSelectors) {
    document.querySelectorAll(selector).forEach((imgElement) => {
      if (seenElements.has(imgElement)) {
        return;
      }
      seenElements.add(imgElement);
      const src = resolveImageSource(imgElement);
      if (src) {
        addImage(describeImage(imgElement, convertRelativeToAbsolute(src, baseUrl), 'article'));
      }
    });
  }

  // Extract all other images as fallback
  document.querySelectorAll('img').forEach((imgElement) => {
    const src = resolveImageSource(imgElement);
    if (!src) {
      return;
    }
    const absoluteUrl = convertRelativeToAbsolute(src, baseUrl);
    // Only add if this URL was not already collected above
    if (!knownUrls.has(absoluteUrl)) {
      addImage(describeImage(imgElement, absoluteUrl, 'general'));
    }
  });

  return images;
}
/**
 * Picks the thumbnail URL from a list of image candidates.
 *
 * Selection order:
 * 1. The first candidate with `source === 'meta'`.
 * 2. Among `source === 'article'` candidates, the first whose width or
 *    height exceeds 300; otherwise the first article candidate.
 * 3. The same rule applied to `source === 'general'` candidates.
 *
 * @param {Array<{url: string, source: string, width?: number, height?: number}>} images
 *   Candidates in priority order.
 * @returns {string|undefined} The chosen URL, or `undefined` when no
 *   candidate has one of the three known sources.
 */
function selectBestThumbnail(images) {
  const candidatesFrom = (source) => images.filter((candidate) => candidate.source === source);
  const isLarge = (candidate) => (candidate.width && candidate.width > 300)
    || (candidate.height && candidate.height > 300);

  // Meta declarations win outright; size is not considered for them.
  const metaCandidates = candidatesFrom('meta');
  if (metaCandidates.length > 0) {
    return metaCandidates[0].url;
  }

  // Article images outrank general ones; within a tier prefer a large image.
  for (const source of ['article', 'general']) {
    const pool = candidatesFrom(source);
    if (pool.length === 0) {
      continue;
    }
    const preferred = pool.find(isLarge) || pool[0];
    return preferred.url;
  }

  return undefined;
}
/**
 * Finds the best thumbnail image URL for an HTML page.
 *
 * Runs the three pipeline stages in order: collect candidates from the
 * document, discard unwanted ones, then pick the best remaining candidate.
 * Any error raised along the way is logged with `console.error` and turned
 * into an `undefined` result rather than propagated.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {string} baseUrl - Page URL, used to resolve relative image URLs.
 * @returns {string|undefined} The thumbnail URL, or `undefined` when none
 *   was found or extraction failed.
 */
export function extractThumbnailFromHtml(html, baseUrl) {
  let thumbnailUrl;
  try {
    const candidates = filterUnwantedImages(extractImagesFromHtml(html, baseUrl));
    thumbnailUrl = selectBestThumbnail(candidates);
  } catch (error) {
    console.error('Error extracting thumbnail:', error);
    thumbnailUrl = undefined;
  }
  return thumbnailUrl;
}
//# sourceMappingURL=thumbnailExtractor.js.map