bilingual-summarizer
Version:
A powerful text summarization package for Arabic and English content with sentiment analysis and topic extraction
216 lines (215 loc) • 8.29 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.summarizeArabicText = summarizeArabicText;
const languageDetection_1 = require("../utils/languageDetection");
// Import available Arabic libraries
let arabicNLP = null;
let arabicStrings = null;
let arabicReshaper = null;
try {
arabicNLP = require('arabic-nlp');
}
catch (error) {
// Optional dependency not installed
}
try {
arabicStrings = require('@flowdegree/arabic-strings');
}
catch (error) {
// Optional dependency not installed
}
try {
arabicReshaper = require('arabic-persian-reshaper');
}
catch (error) {
// Optional dependency not installed
}
/**
* Summarizes Arabic text using available Arabic NLP libraries
* @param text The text to summarize
* @param sentenceCount The number of sentences to include in the summary
* @returns A summarized version of the text
*/
function summarizeArabicText(text, sentenceCount = 5) {
// Check if text is Arabic
if (!(0, languageDetection_1.isArabic)(text)) {
throw new Error('The provided text is not in Arabic');
}
// Check if we have any Arabic NLP tools available
const hasArabicNLP = !!arabicNLP;
const hasArabicStrings = !!arabicStrings;
const hasArabicReshaper = !!arabicReshaper;
try {
// Extract sentences
const sentences = extractArabicSentences(text);
if (sentences.length <= sentenceCount) {
return text; // Text is already short enough
}
// Score sentences
const scoredSentences = scoreArabicSentences(sentences, hasArabicNLP, hasArabicStrings);
// Sort sentences by score
const sortedSentences = [...scoredSentences].sort((a, b) => b.score - a.score);
// Select top sentences and maintain original order
const topSentenceIndices = sortedSentences
.slice(0, sentenceCount)
.map(s => s.index)
.sort((a, b) => a - b);
// Generate summary
const summary = topSentenceIndices.map(i => sentences[i]).join(' ');
// Apply Arabic reshaping if available
if (hasArabicReshaper) {
try {
// Use the proper API: PersianShaper.convertArabic or ArabicShaper.convertArabic
if (arabicReshaper.PersianShaper && arabicReshaper.PersianShaper.convertArabic) {
return arabicReshaper.PersianShaper.convertArabic(summary);
}
else if (arabicReshaper.ArabicShaper && arabicReshaper.ArabicShaper.convertArabic) {
return arabicReshaper.ArabicShaper.convertArabic(summary);
}
else {
// Fallback if the expected API is not found
return summary;
}
}
catch (error) {
console.error('Error using Arabic reshaper:', error);
return summary;
}
}
return summary;
}
catch (error) {
console.error('Error in Arabic summarization:', error);
// Fallback to simple extraction of first few sentences
return extractArabicSentences(text).slice(0, sentenceCount).join(' ');
}
}
/**
* Extracts Arabic sentences from text
* @param text The text to extract sentences from
* @returns An array of sentences
*/
function extractArabicSentences(text) {
// Arabic sentence endings: periods, question marks, exclamation marks
const sentenceEndMarkers = ['.', '?', '!', '؟', '!', ':', '؛', '\n\n'];
let sentences = [];
let currentSentence = '';
for (let i = 0; i < text.length; i++) {
currentSentence += text[i];
if (sentenceEndMarkers.includes(text[i]) &&
(i === text.length - 1 || text[i + 1] === ' ' || text[i + 1] === '\n')) {
sentences.push(currentSentence.trim());
currentSentence = '';
}
}
// Add any remaining text as a sentence
if (currentSentence.trim().length > 0) {
sentences.push(currentSentence.trim());
}
return sentences.filter(s => s.trim().length > 0);
}
/**
* Scores Arabic sentences for importance
* @param sentences Array of Arabic sentences
* @param hasArabicNLP Whether the arabic-nlp library is available
* @param hasArabicStrings Whether the @flowdegree/arabic-strings library is available
* @returns Array of scored sentences
*/
function scoreArabicSentences(sentences, hasArabicNLP, hasArabicStrings) {
// Calculate word frequencies
const wordFrequencies = calculateArabicWordFrequencies(sentences, hasArabicStrings);
// Score each sentence
return sentences.map((sentence, index) => {
let score = 0;
// Basic scoring by word frequency
const words = sentence.split(/\s+/);
words.forEach(word => {
score += wordFrequencies[word] || 0;
});
// Normalize by sentence length to avoid favoring long sentences too much
score = words.length > 0 ? score / Math.sqrt(words.length) : 0;
// Position scoring - first and last sentences are usually important
if (index === 0 || index === sentences.length - 1) {
score *= 1.25;
}
// Boost sentences with common Arabic indicator phrases
const indicatorPhrases = [
'في الختام',
'من أهم',
'بشكل أساسي',
'يعتبر',
'الهدف الرئيسي',
'من الضروري',
'يجب أن نلاحظ',
'تبين أن'
];
for (const phrase of indicatorPhrases) {
if (sentence.includes(phrase)) {
score *= 1.3;
break;
}
}
// Additional scoring using arabic-nlp if available
if (hasArabicNLP) {
try {
// Use available functions from arabic-nlp
// This is a placeholder for actual implementation
const importance = arabicNLP.getImportance ? arabicNLP.getImportance(sentence) : 0;
score += importance * 0.5;
}
catch (error) {
// Ignore errors in optional library
}
}
return { index, score };
});
}
/**
* Calculates word frequencies in Arabic text
* @param sentences Array of Arabic sentences
* @param hasArabicStrings Whether the @flowdegree/arabic-strings library is available
* @returns Object mapping words to their frequencies
*/
function calculateArabicWordFrequencies(sentences, hasArabicStrings) {
const frequencies = {};
const stopWords = getArabicStopWords();
sentences.forEach(sentence => {
let words;
if (hasArabicStrings && arabicStrings.tokenize) {
try {
// Use the library's tokenizer if available
words = arabicStrings.tokenize(sentence);
}
catch (error) {
words = sentence.split(/\s+/);
}
}
else {
words = sentence.split(/\s+/);
}
words.forEach(word => {
const normalizedWord = word.trim().toLowerCase();
// Skip stop words and very short words
if (normalizedWord.length < 2 || stopWords.includes(normalizedWord)) {
return;
}
frequencies[normalizedWord] = (frequencies[normalizedWord] || 0) + 1;
});
});
return frequencies;
}
/**
* Gets a list of common Arabic stop words
* @returns Array of Arabic stop words
*/
function getArabicStopWords() {
// Common Arabic stop words
return [
'من', 'إلى', 'عن', 'على', 'في', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
'أنا', 'أنت', 'هو', 'هي', 'نحن', 'هم', 'كان', 'كانت', 'يكون', 'أن',
'لا', 'ما', 'و', 'أو', 'ثم', 'إن', 'إذا', 'حتى', 'قد', 'لقد',
'جدا', 'فقط', 'كل', 'بعض', 'مثل', 'عندما', 'كيف', 'لماذا', 'متى',
'أين', 'لكن', 'كما', 'بعد', 'قبل', 'خلال', 'منذ', 'بين', 'يا',
'ولكن', 'لذلك', 'بل', 'بينما', 'الذي', 'التي', 'الذين', 'اللواتي'
];
}