UNPKG

bilingual-summarizer

Version:

A powerful text summarization package for Arabic and English content with sentiment analysis and topic extraction

137 lines (136 loc) 6.69 kB
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.extractTopics = extractTopics; exports.calculateDifficulty = calculateDifficulty; const natural = __importStar(require("natural")); const languageDetection_1 = require("../utils/languageDetection"); // Stop words for English and Arabic const ENGLISH_STOP_WORDS = new Set([ 'a', 'an', 'the', 'and', 'or', 'but', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'from', 'up', 'down', 'of', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'this', 'that', 'these', 'those', 'what', 'which' ]); const ARABIC_STOP_WORDS = new Set([ 'من', 'إلى', 'عن', 'على', 'في', 'هو', 'هي', 'هم', 'انت', 'انتم', 'انتن', 'انا', 'نحن', 'هذا', 'هذه', 'ذلك', 'تلك', 'هؤلاء', 'هناك', 'الذي', 'التي', 'الذين', 'اللواتي', 'ما', 'ماذا', 'كيف', 'متى', 'لماذا', 'من', 'أين', 'و', 'أو', 'ثم', 'لكن', 'بل', 'إن', 'إذا', 'حتى', 'كان', 'كانت', 'كانوا', 'يكون', 'تكون', 'يكونوا', 'كن', 'كنت', 'كنتم', 'قد', 'لا', 'لم', 'لن', 'مع', 'عند', 'عندما', 'فوق', 'تحت', 'بين', 'بعد', 'قبل', 'كل', 'بعض', 'أكثر', 'أقل', 'آخر', 'غير', 'أن' ]); // Stemmer initialization const englishStemmer = natural.PorterStemmer; // Common topics in English const commonEnglishTopics = [ 'Technology', 'Science', 'Health', 'Politics', 'Business', 'Economy', 'Entertainment', 'Sports', 'Education', 'Environment', 'Art', 'Music', 'Food', 'Travel', 'Fashion', 'Lifestyle', 'Religion', 'History', 'JavaScript', 'Programming', 'Web', 'Development', 'Software', 'Data', 'AI', 'Machine Learning', 'Blockchain', 'Cryptocurrency', 'Finance' ]; // Common topics in Arabic (with English translations) const commonArabicTopics = [ 'تكنولوجيا', 'علوم', 'صحة', 'سياسة', 'أعمال', 'اقتصاد', 'ترفيه', 'رياضة', 'تعليم', 'بيئة', 'فن', 'موسيقى', 'طعام', 'سفر', 'أزياء', 'نمط الحياة', 'دين', 'تاريخ', 'جافاسكريبت', 'برمجة', 'ويب', 'تطوير', 'برمجيات', 'بيانات', 'ذكاء اصطناعي', 'تعلم آلي', 'بلوكتشين', 'عملات رقمية', 'مالية' ]; /** * Extracts topics from the given text * @param text The text to extract topics from * @param limit Optional limit for the number of topics to return (default: 5) * @returns An array of extracted topics */ function extractTopics(text, limit = 5) { try { if (!text || text.trim().length === 0) { return []; } // Determine if the text is in Arabic const isArabicText = (0, languageDetection_1.isArabic)(text); // Use the appropriate topic list const relevantTopics = isArabicText ? commonArabicTopics : commonEnglishTopics; // Convert text to lowercase for matching const lowerText = text.toLowerCase(); // Find topics that are mentioned in the text (case insensitive) const foundTopics = relevantTopics.filter(topic => { const lowerTopic = topic.toLowerCase(); return lowerText.includes(lowerTopic); }); // Limit to specified number of topics return foundTopics.slice(0, limit); } catch (error) { console.error('Topic extraction error:', error); return []; } } /** * Calculates the difficulty score of the text based on word length, sentence complexity, etc. * @param text The text to analyze * @returns A difficulty score between 0 (easy) and 1 (difficult) */ function calculateDifficulty(text) { try { if (!text || text.trim().length === 0) { return 0.5; // Default to medium difficulty } // Calculate average word length const words = text.split(/\s+/).filter(Boolean); const avgWordLength = words.reduce((sum, word) => sum + word.length, 0) / words.length; // Calculate average sentence length const sentences = text.split(/[.!?]+/).filter(Boolean); const avgSentenceLength = words.length / sentences.length; // Calculate complexity metrics const complexWords = words.filter(word => word.length > 6).length; const complexityRatio = complexWords / words.length; // Calculate overall difficulty score (normalized between 0 and 1) let difficultyScore = ((avgWordLength / 10) * 0.3 + (avgSentenceLength / 30) * 0.3 + complexityRatio * 0.4); // Ensure the score is between 0 and 1 return Math.max(0, Math.min(1, difficultyScore)); } catch (error) { console.error('Difficulty calculation error:', error); return 0.5; // Default to medium difficulty on error } }