bilingual-summarizer
Version:
A powerful text summarization package for Arabic and English content with sentiment analysis and topic extraction
273 lines (272 loc) • 10.8 kB
JavaScript
;
// Module-interop helper (this has the shape of compiler-emitted boilerplate).
// Reuses `this.__createBinding` when the enclosing `this` already provides one;
// otherwise installs one of the two implementations below.
//
// Parameters of the helper:
//   o  - target object that receives the binding
//   m  - source module object
//   k  - property name to read from `m`
//   k2 - property name to define on `o` (defaults to `k`)
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Substitute an enumerable live getter when `m` has no own descriptor for `k`,
// when the descriptor is an accessor on a module not flagged `__esModule`,
// or when it is a data descriptor that is writable or configurable.
// Any other descriptor is copied onto `o` unchanged.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Variant used when `Object.create` is unavailable: copies the current value
// with a plain assignment (a snapshot, not a live binding).
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// Module-interop helper: stores `v` as the "default" member of `o`.
// Reuses `this.__setModuleDefault` when the enclosing `this` already provides one.
// When `Object.create` exists the member is defined as an enumerable data
// property via Object.defineProperty; otherwise it is set by plain assignment.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
// Module-interop helper: wraps a required module in a namespace-style object.
// Reuses `this.__importStar` when the enclosing `this` already provides one.
var __importStar = (this && this.__importStar) || (function () {
// `ownKeys` replaces itself on first call: it picks Object.getOwnPropertyNames
// when available, otherwise a for-in loop filtered by hasOwnProperty, and then
// delegates to whichever implementation it selected.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
// Modules already flagged `__esModule` are returned untouched.
if (mod && mod.__esModule) return mod;
var result = {};
// Bind every own key of `mod` except "default" onto the result object...
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
// ...then expose the module object itself as `result.default`.
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.summarizeText = summarizeText;
const SummarizerManager = __importStar(require("node-summarizer"));
const languageDetection_1 = require("../utils/languageDetection");
const textPreprocessing_1 = require("../utils/textPreprocessing");
const topicExtractor_1 = require("./topicExtractor");
const arabicSummarizer_1 = require("./arabicSummarizer");
/**
 * Produces an extractive summary of the given text.
 *
 * Text that the language-detection helper reports as Arabic is handed to the
 * dedicated Arabic summarizer. Any other text is split into sentences, scored
 * with scoreEnglishSentences, and the best-scoring sentences are returned in
 * their original order, joined by single spaces.
 *
 * @param text The text to summarize
 * @param sentenceCount The number of sentences to include in the summary (default 5)
 * @returns The summary; '' for empty or non-string input; the untouched text
 *          when it contains no more than sentenceCount sentences
 */
function summarizeText(text, sentenceCount = 5) {
    // Nothing to summarize: missing, non-string, or whitespace-only input
    const hasContent = typeof text === 'string' && text.trim().length > 0;
    if (!hasContent) {
        return '';
    }
    try {
        // Arabic goes through its specialised summarizer
        const { isArabic } = languageDetection_1;
        if (isArabic(text)) {
            const { summarizeArabicText: summarizeArabic } = arabicSummarizer_1;
            return summarizeArabic(text, sentenceCount);
        }
        // Everything else uses the general frequency-based path
        const { extractSentences } = textPreprocessing_1;
        const allSentences = extractSentences(text);
        // Short inputs are already "summaries" of themselves
        if (allSentences.length <= sentenceCount) {
            return text;
        }
        const byScoreDescending = (left, right) => right.score - left.score;
        const byOriginalPosition = (left, right) => left.index - right.index;
        // Rank a copy so the scorer's result is not reordered in place
        const ranked = scoreEnglishSentences(allSentences).slice().sort(byScoreDescending);
        // Keep the winners, then restore reading order before joining
        return ranked
            .slice(0, sentenceCount)
            .sort(byOriginalPosition)
            .map((entry) => entry.text)
            .join(' ');
    }
    catch (error) {
        console.error('Summarization error:', error);
        // Fallback: the first sentenceCount period-delimited chunks of the input
        const leadingChunks = text.split('.').slice(0, sentenceCount);
        const fallback = leadingChunks.join('.').trim();
        return fallback === '' ? text : fallback;
    }
}
/**
 * Scores English sentences based on their importance.
 *
 * A sentence's score is the average corpus frequency of its words (tokens
 * matched by /\b\w+\b/ after lower-casing; one-character tokens contribute
 * nothing), multiplied by a position boost: x1.25 for the first and last
 * sentence, x1.1 for other sentences within the first 10% of the input.
 * Sentences with fewer than three tokens always score 0.
 *
 * @param sentences Array of sentences
 * @returns Array of { text, score, index } objects, one per input sentence, in input order
 */
function scoreEnglishSentences(sentences) {
    // Tokenise each sentence once and reuse the result for both passes
    const tokenised = sentences.map((sentence) => sentence.toLowerCase().match(/\b\w+\b/g) || []);
    // A Map (rather than a plain object) keeps tokens such as "constructor" or
    // "__proto__" from colliding with members inherited from Object.prototype,
    // which would otherwise turn the counts - and hence the scores - into NaN.
    const wordFrequency = new Map();
    for (const words of tokenised) {
        for (const word of words) {
            // Skip very short words (likely not meaningful)
            if (word.length <= 1) {
                continue;
            }
            wordFrequency.set(word, (wordFrequency.get(word) || 0) + 1);
        }
    }
    const lastIndex = sentences.length - 1;
    const leadingCutoff = sentences.length * 0.1;
    return sentences.map((text, index) => {
        const words = tokenised[index];
        // Skip very short sentences
        if (words.length < 3) {
            return { text, score: 0, index };
        }
        let score = 0;
        for (const word of words) {
            if (word.length > 1) {
                score += wordFrequency.get(word) || 0;
            }
        }
        // Normalize by sentence length to avoid bias towards longer sentences
        score /= words.length;
        // Opening and closing sentences are often the most important
        if (index === 0 || index === lastIndex) {
            score *= 1.25;
        }
        else if (index < leadingCutoff) {
            // Smaller boost for the rest of the first 10% of the text
            score *= 1.1;
        }
        return { text, score, index };
    });
}
/**
 * Scores Arabic sentences based on their importance.
 *
 * A sentence's score is the average corpus frequency of its whitespace-separated
 * words (one-character words contribute nothing), multiplied by a position
 * boost (x1.25 for the first and last sentence, x1.1 for other sentences within
 * the first 10% of the input) and by x1.15 when the sentence contains any of
 * the listed indicator terms. Sentences with fewer than two words score 0.
 *
 * @param sentences Array of sentences
 * @returns Array of { text, score, index } objects, one per input sentence, in input order
 */
function scoreArabicSentences(sentences) {
    // Terms treated as indicators of important information in Arabic texts.
    // Built once per call instead of once per sentence.
    const importantTerms = ['من أهم', 'يجب', 'ضروري', 'أساسي', 'مهم', 'خلاصة', 'نتيجة', 'إنّ', 'إن'];
    // Tokenise each sentence once. Leading/trailing whitespace makes split()
    // emit empty strings; they are dropped so they neither inflate the word
    // count nor let a one-word sentence slip past the "very short" guard.
    const tokenised = sentences.map((sentence) => sentence.split(/\s+/).filter((word) => word.length > 0));
    // A Map (rather than a plain object) keeps tokens such as "constructor"
    // from colliding with members inherited from Object.prototype, which would
    // otherwise turn the counts - and hence the scores - into NaN.
    const wordFrequency = new Map();
    for (const words of tokenised) {
        for (const word of words) {
            // Skip very short words
            if (word.length <= 1) {
                continue;
            }
            wordFrequency.set(word, (wordFrequency.get(word) || 0) + 1);
        }
    }
    const lastIndex = sentences.length - 1;
    const leadingCutoff = sentences.length * 0.1;
    return sentences.map((text, index) => {
        const words = tokenised[index];
        // Skip very short sentences
        if (words.length < 2) {
            return { text, score: 0, index };
        }
        let score = 0;
        for (const word of words) {
            if (word.length > 1) {
                score += wordFrequency.get(word) || 0;
            }
        }
        // Normalize score by sentence length
        score /= words.length;
        // Opening and closing sentences get the larger boost
        if (index === 0 || index === lastIndex) {
            score *= 1.25;
        }
        else if (index < leadingCutoff) {
            // Smaller boost for the rest of the first 10% of the text
            score *= 1.1;
        }
        // A single 15% boost, no matter how many indicator terms appear
        if (importantTerms.some((term) => text.includes(term))) {
            score *= 1.15;
        }
        return { text, score, index };
    });
}
/**
 * Summarizes English text using node-summarizer, falling back to the custom
 * summarizer when the library fails or produces no summary.
 *
 * @param text The English text to summarize
 * @param sentenceCount The number of sentences for the summary
 * @returns The summarized text
 */
function summarizeEnglishText(text, sentenceCount) {
    let librarySummary;
    try {
        // The file-level `SummarizerManager` binding is the module namespace
        // object, not the class: the constructor is its `SummarizerManager`
        // named export, and it takes (text, sentenceCount) positionally.
        // Calling `new` on the namespace itself always throws, which used to
        // send every request straight to the fallback.
        const { SummarizerManager: Summarizer } = SummarizerManager;
        const summarizer = new Summarizer(text, sentenceCount);
        const result = summarizer.getSummaryByFrequency();
        librarySummary = result && result.summary;
    }
    catch (error) {
        console.error('English summarization error:', error);
    }
    if (librarySummary) {
        return librarySummary;
    }
    // Fallback lives outside the try block so that an exception raised by
    // customSummarize is not caught and answered with a second identical call.
    return customSummarize(text, sentenceCount);
}
/**
 * Summarizes Arabic text by delegating to the custom summarizer.
 *
 * @param text The Arabic text to summarize
 * @param sentenceCount The number of sentences for the summary
 * @returns The summarized text
 */
function summarizeArabicText(text, sentenceCount) {
    // Arabic has no library-backed path here; the custom implementation handles it
    const summary = customSummarize(text, sentenceCount);
    return summary;
}
/**
 * Topic-driven extractive summarizer that works for both Arabic and English.
 *
 * The text is cleaned and split into sentences. Each sentence earns a small
 * position bonus (earlier is better), 0.3 for every whitespace-separated word
 * whose lower-cased form is one of the extracted topics, and 0.1 when it has
 * more than five words. The best-scoring sentences are returned in their
 * original order, joined by single spaces.
 *
 * @param text The text to summarize
 * @param sentenceCount The number of sentences for the summary
 * @returns The summarized text, or the cleaned text when it has no more than
 *          sentenceCount sentences
 */
function customSummarize(text, sentenceCount) {
    const { cleanText } = textPreprocessing_1;
    const normalized = cleanText(text);
    const { extractSentences } = textPreprocessing_1;
    const parts = extractSentences(normalized);
    // Not enough material to cut anything: hand back the cleaned text
    if (parts.length <= sentenceCount) {
        return normalized;
    }
    // The ten main topics drive the relevance part of the score
    const { extractTopics } = topicExtractor_1;
    const keyTopics = new Set(extractTopics(normalized, 10));
    const total = parts.length;
    const ranked = [];
    for (const [position, sentence] of parts.entries()) {
        // Position bias: earlier sentences often carry the key information
        let weight = (1.0 - (position / total)) * 0.1;
        // Topic relevance: every topic word found adds to the weight
        const tokens = sentence.split(/\s+/);
        tokens.forEach((token) => {
            if (keyTopics.has(token.toLowerCase())) {
                weight += 0.3;
            }
        });
        // Length preference: favour sentences longer than five words
        if (tokens.length > 5) {
            weight += 0.1;
        }
        ranked.push({ text: sentence, score: weight, index: position });
    }
    const byScoreDescending = (left, right) => right.score - left.score;
    const byOriginalPosition = (left, right) => left.index - right.index;
    // Pick the winners, then put them back in narrative order
    ranked.sort(byScoreDescending);
    const chosen = ranked.slice(0, sentenceCount);
    chosen.sort(byOriginalPosition);
    return chosen.map((entry) => entry.text).join(' ');
}