UNPKG

classification.js

Version:

A powerful text classification library using the Damerau-Levenshtein distance algorithm

257 lines (218 loc) • 10.5 kB

JavaScript

import fs from "fs";
import path from 'path';

/**
 * @typedef {'Mini' | 'Core' | 'Pro' | 'Ultra'} AlgorithmLevel
 */

/**
 * @class Classifier
 * @classdesc Matches input texts against a labelled dataset by edit distance
 * and reports, for each input, the label of the closest dataset entry.
 */
class Classifier {
  /**
   * @constructor
   * @param {Array} dataset - The pre-loaded dataset array ({ text, label } items) for a specific language.
   * @param {Object} [options={}] - Optional settings for the classifier.
   * @param {boolean} [options.normalize=false] - Whether to normalize the text.
   * @param {AlgorithmLevel} [options.algorithmLevel='Pro'] - The algorithm level ('Mini', 'Core', 'Pro', 'Ultra').
   * @param {boolean} [options.keepLogToFile=false] - Whether to save classification results to a log file.
   * @param {number} [options.truncateLength=2048] - The length to truncate texts for similarity calculation.
   * @param {string} [options.language='unknown'] - Language code (for logging purposes).
   * @param {number} [options.max_steps] - Max steps (calculated from algorithmLevel or provided).
   * @throws {Error} If `dataset` is not an array.
   */
  constructor(dataset, options = {}) {
    if (!Array.isArray(dataset)) {
      throw new Error("Classifier requires a dataset array.");
    }

    this.dataset = dataset;
    // Shallow copy so the caller's options object is never mutated below.
    this.options = { ...options };
    this.language = this.options.language || 'unknown';

    // Edit-distance budget associated with each algorithm level.
    const algorithmLevels = {
      'Mini': 10,
      'Core': 50,
      'Pro': 200,
      'Ultra': Infinity
    };

    this.options.algorithmLevel = this.options.algorithmLevel || 'Pro';

    // An explicit max_steps wins over the level; an unrecognised level means "no limit".
    this.max_steps = this.options.max_steps
      || algorithmLevels[this.options.algorithmLevel]
      || Infinity;
    this.truncateLength = this.options.truncateLength || 2048;

    // Pre-compute the comparison text of every dataset entry once.
    this.normalizedDataset = this.dataset.map(item => ({
      ...item,
      normalizedText: this.options.normalize ? this.normalizeText(item.text) : item.text
    }));

    // Ensure options reflect the actual state being used.
    this.options.max_steps = this.max_steps;
    this.options.truncateLength = this.truncateLength;
  }

  /**
   * Loads a dataset from a JSON file named `datasets/datas_<language>.json`,
   * resolved against the current working directory.
   * Static, so it can be used before a Classifier instance exists.
   * @param {string} language - The language code to load (e.g., 'tur', 'eng').
   * @returns {Array | null} The dataset as { text, label } items, an empty array
   * when the file does not exist, or null when reading/parsing fails.
   */
  static loadDatasetFromFile(language) {
    const filePath = path.resolve('datasets', `datas_${language}.json`);
    try {
      if (!fs.existsSync(filePath)) {
        console.warn(`Warning: Dataset file not found for language '${language}' at ${filePath}. Returning empty dataset.`);
        return []; // A missing file is not fatal; the caller can proceed with no data.
      }
      // The JSON is expected to be an object of the form { "text": "label", ... }.
      const json = JSON.parse(fs.readFileSync(filePath, "utf-8"));
      return Object.entries(json).map(([text, label]) => ({ text, label }));
    } catch (error) {
      console.error(`Error loading dataset from file: ${filePath}`, error);
      return null; // Critical error (e.g. unreadable or malformed JSON).
    }
  }

  /**
   * Calculates the similarity between two texts using the optimal string
   * alignment variant of the Damerau-Levenshtein distance (insertions,
   * deletions, substitutions and swaps of two adjacent characters each cost 1).
   *
   * The computation is bounded by `this.max_steps`: as soon as the distance is
   * known to exceed that budget the search stops and `steps` is reported as
   * `max_steps`. `relative` and `similarity` are then derived from that capped
   * value, not from the true distance.
   *
   * @param {string} text1 - The first text.
   * @param {string} text2 - The second text.
   * @returns {Object} The similarity result ({ steps, relative, similarity }).
   */
  getSimilarity(text1, text2) {
    const thisLength = text1.length;
    const thatLength = text2.length;
    const maxSteps = this.max_steps;

    // The distance is at least the length difference, so this is a safe shortcut.
    if (Math.abs(thisLength - thatLength) > maxSteps) {
      return this.prepareSimilarityResult(maxSteps, thisLength, thatLength);
    }

    // Three rolling rows of the DP matrix: a transposition needs the cell from
    // two rows back, which a two-row buffer has already overwritten.
    let twoRowsBack = new Array(thatLength + 1).fill(0);
    let previousRow = new Array(thatLength + 1).fill(0);
    let currentRow = new Array(thatLength + 1).fill(0);
    for (let j = 0; j <= thatLength; j++) {
      previousRow[j] = j;
    }

    for (let i = 1; i <= thisLength; i++) {
      currentRow[0] = i;
      let rowMinimum = i;

      for (let j = 1; j <= thatLength; j++) {
        const cost = text1[i - 1] === text2[j - 1] ? 0 : 1;
        let cell = Math.min(
          previousRow[j] + 1,        // deletion
          currentRow[j - 1] + 1,     // insertion
          previousRow[j - 1] + cost  // match / substitution
        );

        // Adjacent transposition: "xy" in one text versus "yx" in the other.
        if (i > 1 && j > 1 && text1[i - 1] === text2[j - 2] && text1[i - 2] === text2[j - 1]) {
          cell = Math.min(cell, twoRowsBack[j - 2] + 1);
        }

        currentRow[j] = cell;
        if (cell < rowMinimum) {
          rowMinimum = cell;
        }
      }

      // Row minima never decrease from one row to the next, so once a whole
      // row is over budget the final distance must be too. A single large cell
      // proves nothing: the first columns grow with i even for equal texts.
      if (rowMinimum > maxSteps) {
        return this.prepareSimilarityResult(maxSteps, thisLength, thatLength);
      }

      // Rotate buffers; the oldest row becomes scratch space for the next one.
      [twoRowsBack, previousRow, currentRow] = [previousRow, currentRow, twoRowsBack];
    }

    const distance = Math.min(previousRow[thatLength], maxSteps);
    return this.prepareSimilarityResult(distance, thisLength, thatLength);
  }

  /**
   * Prepares the similarity result.
   * @param {number} steps - The number of edit steps.
   * @param {number} thisLength - The length of the first text.
   * @param {number} thatLength - The length of the second text.
   * @returns {Object} `{ steps, relative, similarity }` where `relative` is
   * steps divided by the longer length (0 when both are empty) and
   * `similarity` is `1 - relative`.
   */
  prepareSimilarityResult(steps, thisLength, thatLength) {
    const length = Math.max(thisLength, thatLength);
    const relative = length === 0 ? 0 : (steps / length);
    const similarity = 1 - relative;
    return { steps, relative, similarity };
  }

  /**
   * Normalizes the text: decomposes it (NFD), strips combining diacritical
   * marks (U+0300 to U+036F) and lower-cases the result.
   * @param {string} text - The text to normalize.
   * @returns {string} The normalized text.
   */
  normalizeText(text) {
    return text.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase();
  }

  /**
   * Classifies the input texts.
   * @param {Array<string>} inputs - The input texts to classify.
   * @returns {Object} The classification results; `{ success: false, error, results: [] }`
   * when `inputs` is not an array.
   */
  classify(inputs) {
    if (!Array.isArray(inputs)) {
      console.error("Error: Input to classify must be an array of strings.");
      return { success: false, error: "Input must be an array of strings.", results: [] };
    }

    const normalizedInputs = this.options.normalize
      ? inputs.map(this.normalizeText.bind(this))
      : inputs;

    // Match on the (possibly normalized) text but report the caller's original text.
    const results = normalizedInputs.map((input, index) => this.findBestMatch(input, inputs[index]));

    if (this.options.keepLogToFile) {
      this.saveResultsToLog(results, this.language);
    }

    return {
      success: true,
      language: this.language,
      algorithmLevel: this.options.algorithmLevel,
      labels: [...new Set(this.dataset.map(item => item.label))],
      inputs,
      normalize: this.options.normalize || false,
      results
    };
  }

  /**
   * Finds the best match for an input text among the dataset entries.
   * Both sides are truncated to `truncateLength` characters before comparison.
   * @param {string} input - The normalized input text.
   * @param {string} originalInput - The original input text.
   * @returns {Object} The best match result. With an empty dataset this is a
   * placeholder with `label: null` and no `input` property.
   */
  findBestMatch(input, originalInput) {
    let bestMatch = {
      label: null,
      accuracy: 0,
      steps: Infinity,
      relative: 1,
      text: originalInput,
      totalSteps: Infinity
    };

    const truncatedInput = input.length > this.truncateLength
      ? input.slice(0, this.truncateLength)
      : input;

    for (const item of this.normalizedDataset) {
      const truncatedText = item.normalizedText.length > this.truncateLength
        ? item.normalizedText.slice(0, this.truncateLength)
        : item.normalizedText;

      const similarityResult = this.getSimilarity(truncatedText, truncatedInput);

      if (this.isBetterMatch(similarityResult, bestMatch)) {
        bestMatch = {
          label: item.label,
          input: originalInput,
          accuracy: similarityResult.similarity,
          steps: similarityResult.steps,
          relative: similarityResult.relative,
          text: item.text,
          totalSteps: similarityResult.steps
        };
      }
    }

    return bestMatch;
  }

  /**
   * Determines if a similarity result is a better match than the current best match.
   * Higher similarity wins; ties are broken by fewer steps, then by `relative`.
   * @param {Object} current - The current similarity result.
   * @param {Object} best - The current best match.
   * @returns {boolean} Whether the current similarity result is a better match.
   */
  isBetterMatch(current, best) {
    return current.similarity > best.accuracy
      || (current.similarity === best.accuracy && current.steps < best.steps)
      || (current.similarity === best.accuracy && current.steps === best.steps && current.relative > best.relative);
  }

  /**
   * Saves the classification results to `plxjs/log_<language>.json` (resolved
   * against the current working directory), merging them into any existing
   * log as `input -> label` entries. Failures are logged, never thrown.
   * @param {Array} results - The classification results.
   * @param {string} language - The language code for the log file name.
   */
  saveResultsToLog(results, language) {
    const logDir = path.resolve('plxjs');
    const logFilePath = path.join(logDir, `log_${language}.json`);

    try {
      if (!fs.existsSync(logDir)) {
        fs.mkdirSync(logDir, { recursive: true });
      }

      const logData = fs.existsSync(logFilePath)
        ? JSON.parse(fs.readFileSync(logFilePath, 'utf-8'))
        : {};

      results.forEach(result => {
        // Results without an input (e.g. the empty-dataset placeholder) are skipped.
        if (result && result.input) {
          logData[result.input] = result.label;
        }
      });

      fs.writeFileSync(logFilePath, JSON.stringify(logData, null, 2), 'utf8');
    } catch (error) {
      console.error(`Error saving results to log for language '${language}':`, error);
    }
  }
}

export default Classifier;