/*
 * classification.js
 * Version:
 * A powerful text classification library using the Damerau-Levenshtein distance algorithm.
 * 257 lines (218 loc) • 10.5 kB
 * JavaScript
 */
import fs from "fs";
import path from 'path';
/**
* @typedef {'Mini' | 'Core' | 'Pro' | 'Ultra'} AlgorithmLevel
*/
/**
* @class Classifier
* @classdesc A classifier that matches input texts to a provided dataset using specified options.
*/
class Classifier {
  /**
   * Builds a classifier over an in-memory dataset.
   *
   * @constructor
   * @param {Array} dataset - The pre-loaded dataset array for a specific language.
   *   Each item is read through its `text` and `label` properties.
   * @param {Object} [options={}] - Optional settings for the classifier.
   * @param {boolean} [options.normalize=false] - Whether to normalize the text.
   * @param {AlgorithmLevel} [options.algorithmLevel='Pro'] - The algorithm level ('Mini', 'Core', 'Pro', 'Ultra').
   * @param {boolean} [options.keepLogToFile=false] - Whether to save classification results to a log file.
   * @param {number} [options.truncateLength=2048] - The length to truncate texts for similarity calculation.
   * @param {string} [options.language='unknown'] - Language code (for logging purposes).
   * @param {number} [options.max_steps] - Max steps (calculated from algorithmLevel or provided).
   * @throws {Error} If `dataset` is not an array.
   */
  constructor(dataset, options = {}) {
    if (!Array.isArray(dataset)) {
      throw new Error("Classifier requires a dataset array.");
    }
    this.dataset = dataset;
    // Shallow copy so the caller's options object is never mutated below.
    this.options = { ...options };
    this.language = this.options.language || 'unknown';

    // Edit-distance cap associated with each algorithm level.
    const algorithmLevels = {
      'Mini': 10,
      'Core': 50,
      'Pro': 200,
      'Ultra': Infinity
    };

    // Ensure algorithmLevel is set.
    this.options.algorithmLevel = this.options.algorithmLevel || 'Pro';

    // An explicit (truthy) max_steps wins; otherwise the level's cap is used.
    // An unrecognised level, or a falsy max_steps such as 0, ends up uncapped
    // or at the level default respectively.
    this.max_steps = this.options.max_steps || algorithmLevels[this.options.algorithmLevel] || Infinity;
    this.truncateLength = this.options.truncateLength || 2048;

    // Pre-compute the text that is actually compared, so normalization is
    // paid once per dataset item instead of once per classification.
    this.normalizedDataset = this.dataset.map(item => ({
      ...item,
      normalizedText: this.options.normalize ? this.normalizeText(item.text) : item.text
    }));

    // Ensure options reflect the actual state being used.
    this.options.max_steps = this.max_steps;
    this.options.truncateLength = this.truncateLength;
  }

  /**
   * Loads a dataset from a JSON file.
   * Static method, usable by index.js before creating a Classifier instance.
   *
   * The file is looked up at `datasets/datas_<language>.json`, resolved
   * against the current working directory, and is expected to hold an object
   * of the form `{ "text": "label", ... }`.
   *
   * @param {string} language - The language code to load (e.g., 'tur', 'eng').
   * @returns {Array | null} The loaded dataset as `{ text, label }` items, an
   *   empty array when the file does not exist, or null when reading or
   *   parsing fails.
   */
  static loadDatasetFromFile(language) {
    const filePath = path.resolve('datasets', `datas_${language}.json`);
    try {
      if (!fs.existsSync(filePath)) {
        console.warn(`Warning: Dataset file not found for language '${language}' at ${filePath}. Returning empty dataset.`);
        return []; // Return empty array if not found, model creation can proceed
      }
      const json = JSON.parse(fs.readFileSync(filePath, "utf-8"));
      // Assuming the JSON structure is { "text": "label", ... }
      return Object.entries(json).map(([text, label]) => ({ text, label }));
    } catch (error) {
      console.error(`Error loading dataset from file: ${filePath}`, error);
      return null; // Return null on critical error (like parse error)
    }
  }

  /**
   * Calculates the similarity between two texts.
   *
   * The distance is the optimal-string-alignment form of the
   * Damerau-Levenshtein distance: insertions, deletions, substitutions and
   * swaps of two adjacent characters each cost one step. Characters are
   * compared by string index (UTF-16 code units).
   *
   * The reported step count never exceeds `this.max_steps`: as soon as the
   * distance is known to be above the cap, the cap itself is reported.
   *
   * @param {string} text1 - The first text.
   * @param {string} text2 - The second text.
   * @returns {Object} The similarity result (`steps`, `relative`, `similarity`).
   */
  getSimilarity(text1, text2) {
    const thisLength = text1.length;
    const thatLength = text2.length;
    const maxSteps = this.max_steps;

    // The distance is at least the length difference, so this is a safe shortcut.
    if (Math.abs(thisLength - thatLength) > maxSteps) {
      return this.prepareSimilarityResult(maxSteps, thisLength, thatLength);
    }

    // Three rolling rows of the DP table: rows i-2, i-1 and i. The third row
    // is required because a transposition looks two rows back.
    let twoRowsBack = new Array(thatLength + 1).fill(0);
    let previousRow = Array.from({ length: thatLength + 1 }, (_, j) => j);
    let currentRow = new Array(thatLength + 1).fill(0);

    for (let i = 1; i <= thisLength; i++) {
      currentRow[0] = i;
      let rowMinimum = i;

      for (let j = 1; j <= thatLength; j++) {
        const cost = text1[i - 1] === text2[j - 1] ? 0 : 1;
        let best = Math.min(
          previousRow[j] + 1,        // deletion
          currentRow[j - 1] + 1,     // insertion
          previousRow[j - 1] + cost  // match / substitution
        );
        if (i > 1 && j > 1 && text1[i - 1] === text2[j - 2] && text1[i - 2] === text2[j - 1]) {
          // Adjacent transposition: extends the alignment from cell (i-2, j-2).
          best = Math.min(best, twoRowsBack[j - 2] + 1);
        }
        currentRow[j] = best;
        if (best < rowMinimum) {
          rowMinimum = best;
        }
      }

      // Row minima never decrease from one row to the next, so once a whole
      // row is above the cap the final distance must be above it too.
      // (A single large cell proves nothing: cells far from the diagonal are
      // large even when the two texts are identical.)
      if (rowMinimum > maxSteps) {
        return this.prepareSimilarityResult(maxSteps, thisLength, thatLength);
      }

      // Rotate the rows; the oldest one is recycled as the next current row.
      [twoRowsBack, previousRow, currentRow] = [previousRow, currentRow, twoRowsBack];
    }

    // After the final rotation the last computed row lives in previousRow.
    const steps = Math.min(previousRow[thatLength], maxSteps);
    return this.prepareSimilarityResult(steps, thisLength, thatLength);
  }

  /**
   * Prepares the similarity result.
   *
   * @param {number} steps - The number of steps.
   * @param {number} thisLength - The length of the first text.
   * @param {number} thatLength - The length of the second text.
   * @returns {Object} The prepared similarity result: `steps`, `relative`
   *   (steps divided by the longer length, or 0 when both texts are empty)
   *   and `similarity` (1 - relative).
   */
  prepareSimilarityResult(steps, thisLength, thatLength) {
    const length = Math.max(thisLength, thatLength);
    const relative = length === 0 ? 0 : (steps / length);
    const similarity = 1 - relative;
    return { steps, relative, similarity };
  }

  /**
   * Normalizes the text: decomposes it (NFD), removes the combining marks in
   * U+0300-U+036F, and lower-cases the result.
   *
   * @param {string} text - The text to normalize.
   * @returns {string} The normalized text.
   */
  normalizeText(text) {
    return text.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase();
  }

  /**
   * Classifies the input texts.
   *
   * @param {Array<string>} inputs - The input texts to classify.
   * @returns {Object} The classification results. On invalid input (not an
   *   array, or an array holding a non-string) an object with
   *   `success: false`, an `error` message and empty `results` is returned
   *   instead of throwing.
   */
  classify(inputs) {
    // Ensure inputs is an array of strings; anything else cannot be compared.
    if (!Array.isArray(inputs) || !inputs.every((input) => typeof input === 'string')) {
      console.error("Error: Input to classify must be an array of strings.");
      return { success: false, error: "Input must be an array of strings.", results: [] };
    }

    const normalizedInputs = this.options.normalize
      ? inputs.map((input) => this.normalizeText(input))
      : inputs;
    // Matching uses the normalized text, the result reports the original one.
    const results = normalizedInputs.map((input, index) => this.findBestMatch(input, inputs[index]));

    if (this.options.keepLogToFile) {
      // Pass language explicitly if needed for logging
      this.saveResultsToLog(results, this.language);
    }

    return {
      success: true,
      language: this.language, // Return language used by this instance
      algorithmLevel: this.options.algorithmLevel,
      labels: [...new Set(this.dataset.map(item => item.label))],
      inputs,
      normalize: this.options.normalize || false,
      results
    };
  }

  /**
   * Finds the best match for an input text.
   *
   * Both the input and every dataset text are cut to `truncateLength`
   * characters before being compared.
   *
   * @param {string} input - The normalized input text.
   * @param {string} originalInput - The original input text.
   * @returns {Object} The best match result. When the dataset is empty the
   *   placeholder below is returned: `label` is null, `text` echoes the
   *   original input and there is no `input` property.
   */
  findBestMatch(input, originalInput) {
    let bestMatch = { label: null, accuracy: 0, steps: Infinity, relative: 1, text: originalInput, totalSteps: Infinity };
    const truncatedInput = input.length > this.truncateLength ? input.slice(0, this.truncateLength) : input;

    for (const item of this.normalizedDataset) {
      const truncatedText = item.normalizedText.length > this.truncateLength
        ? item.normalizedText.slice(0, this.truncateLength)
        : item.normalizedText;
      const similarityResult = this.getSimilarity(truncatedText, truncatedInput);

      if (this.isBetterMatch(similarityResult, bestMatch)) {
        bestMatch = {
          label: item.label,
          input: originalInput,
          accuracy: similarityResult.similarity,
          steps: similarityResult.steps,
          relative: similarityResult.relative,
          text: item.text,
          totalSteps: similarityResult.steps
        };
      }
    }
    return bestMatch;
  }

  /**
   * Determines if a similarity result is a better match than the current best match.
   *
   * Higher similarity wins; ties are broken by fewer steps, then by a larger
   * `relative` value.
   *
   * @param {Object} current - The current similarity result.
   * @param {Object} best - The current best match.
   * @returns {boolean} Whether the current similarity result is a better match.
   */
  isBetterMatch(current, best) {
    return current.similarity > best.accuracy ||
      (current.similarity === best.accuracy && current.steps < best.steps) ||
      (current.similarity === best.accuracy && current.steps === best.steps && current.relative > best.relative);
  }

  /**
   * Saves the classification results to a log file.
   *
   * The log lives at `plxjs/log_<language>.json` (resolved against the
   * current working directory) and maps each input text to its label.
   * Existing entries are kept and overwritten per input. Results without a
   * truthy `input` are skipped. Any failure is reported on the console and
   * not rethrown, so logging never breaks classification.
   *
   * @param {Array} results - The classification results.
   * @param {string} language - The language code for the log file name.
   */
  saveResultsToLog(results, language) {
    const logDir = path.resolve('plxjs');
    const logFilePath = path.join(logDir, `log_${language}.json`); // Use provided language
    try {
      if (!fs.existsSync(logDir)) {
        fs.mkdirSync(logDir, { recursive: true });
      }
      const logData = fs.existsSync(logFilePath) ?
        JSON.parse(fs.readFileSync(logFilePath, 'utf-8')) : {};
      results.forEach(result => {
        if (result && result.input) {
          logData[result.input] = result.label;
        }
      });
      fs.writeFileSync(logFilePath, JSON.stringify(logData, null, 2), 'utf8');
    } catch (error) {
      console.error(`Error saving results to log for language '${language}':`, error);
    }
  }
}
// The Classifier class is this module's default export.
export default Classifier;