@plust/datasleuth
Version:
Build LLM-powered research pipelines and output structured data.
255 lines • 11.1 kB
JavaScript
/**
* Entity classification and clustering step implementation
* This module provides advanced entity recognition, classification and clustering
* capabilities to organize research findings into a coherent knowledge graph.
*/
import { createStep } from '../utils/steps.js';
import { ValidationError, ConfigurationError, LLMError, ProcessingError } from '../types/errors.js';
import { createStepLogger } from '../utils/logging.js';
import { executeWithRetry } from '../utils/retry.js';
/**
* Execute the entity classification step
* @param state - Current research state
* @param options - Classification options
* @returns Updated research state with classification data
*/
async function executeClassifyStep(state, options) {
const stepLogger = createStepLogger('Classification');
const { classifyEntities = true, clusterEntities = true, confidenceThreshold = 0.6, customEntityTypes = [], maxEntities = 50, maxClusters = 10, clusteringInstructions = '', retry = { maxRetries: 2, baseDelay: 1000 }, } = options;
stepLogger.info('Starting entity classification and clustering');
try {
// Validate confidence threshold
if (confidenceThreshold < 0 || confidenceThreshold > 1) {
throw new ValidationError({
message: `Invalid confidence threshold: ${confidenceThreshold}. Must be between 0 and 1.`,
step: 'Classification',
details: { confidenceThreshold },
suggestions: [
'Confidence threshold must be between 0.0 and 1.0',
'Recommended values are between 0.5 and 0.8',
],
});
}
// Ensure we have data to classify
if (!state.data.extractedContent || state.data.extractedContent.length === 0) {
stepLogger.warn('No content available for classification');
return {
...state,
metadata: {
...state.metadata,
warnings: [
...(state.metadata.warnings || []),
'Classification step skipped due to missing content.',
],
},
};
}
stepLogger.debug(`Classifying content with ${clusterEntities ? 'clustering enabled' : 'clustering disabled'}`);
// In a real implementation, this would use an LLM to classify entities
// For now, we'll simulate the classification process with error handling
const classificationData = await executeWithRetry(() => simulateEntityClassification(state.data.extractedContent, // Safe to use ! here since we checked above
state.query, {
classifyEntities,
clusterEntities,
confidenceThreshold,
customEntityTypes,
maxEntities,
maxClusters,
clusteringInstructions,
retry,
}), {
maxRetries: retry.maxRetries ?? 2,
retryDelay: retry.baseDelay ?? 1000,
backoffFactor: 2,
onRetry: (attempt, error, delay) => {
stepLogger.warn(`Retry attempt ${attempt} for classification: ${error instanceof Error ? error.message : 'Unknown error'}. Retrying in ${delay}ms...`);
},
});
stepLogger.info(`Classification completed with ${Object.keys(classificationData.entities).length} entities, ${classificationData.relationships.length} relationships, and ${Object.keys(classificationData.clusters).length} clusters`);
return {
...state,
data: {
...state.data,
classification: classificationData,
},
metadata: {
...state.metadata,
hasClassification: true,
entityCount: Object.keys(classificationData.entities).length,
clusterCount: Object.keys(classificationData.clusters).length,
relationshipCount: classificationData.relationships.length,
},
};
}
catch (error) {
// Handle different error types appropriately
if (error instanceof ValidationError ||
error instanceof LLMError ||
error instanceof ConfigurationError) {
// These are already properly formatted, just throw them
throw error;
}
// Handle generic errors
const errorMessage = error instanceof Error ? error.message : String(error);
stepLogger.error(`Error during classification: ${errorMessage}`);
// Check for specific error patterns
if (errorMessage.includes('JSON') || errorMessage.includes('parse')) {
throw new LLMError({
message: `Failed to parse LLM response during classification: ${errorMessage}`,
step: 'Classification',
details: { error },
retry: true,
suggestions: [
'The LLM response could not be properly parsed',
'Try a different model or temperature setting',
'Check if the prompt is properly formatted for structured output',
],
});
}
else if (errorMessage.includes('rate limit') || errorMessage.includes('quota')) {
throw new LLMError({
message: `LLM rate limit exceeded during classification: ${errorMessage}`,
step: 'Classification',
details: { error },
retry: true,
suggestions: [
'Wait and try again later',
'Consider using a different LLM provider',
'Implement rate limiting in your application',
],
});
}
// Generic processing error
throw new ProcessingError({
message: `Classification failed: ${errorMessage}`,
step: 'Classification',
details: { error, options },
retry: true,
suggestions: [
'Check your classification configuration',
'Try with a smaller set of content',
'Reduce the complexity of clustering requirements',
],
});
}
}
/**
* Simulate entity classification using an LLM
* This will be replaced with an actual implementation using mastra and the ai SDK
*/
async function simulateEntityClassification(extractedContent, // Changed from 'content: string[]' to accept ExtractedContent[]
query, options) {
// Simulate processing time
await new Promise((resolve) => setTimeout(resolve, 1200));
// Extract actual text content from ExtractedContent objects
const textContent = extractedContent.map((item) => item.content || '').filter(Boolean);
// Sample entities based on common topics - will be replaced with real LLM-based extraction
const entities = {};
const relationships = [];
const clusters = {};
// Generate some sample entities based on the query
// In a real implementation, these would be extracted from the content
const queryWords = query.toLowerCase().split(' ');
if (queryWords.includes('space') || queryWords.includes('exploration')) {
entities['nasa'] = {
name: 'NASA',
type: 'organization',
description: "The National Aeronautics and Space Administration is America's civil space program and the global leader in space exploration.",
confidence: 0.95,
mentions: 12,
};
entities['spacex'] = {
name: 'SpaceX',
type: 'company',
description: 'Space Exploration Technologies Corp. is an American spacecraft manufacturer, space launch provider, and satellite communications company.',
confidence: 0.92,
mentions: 8,
};
entities['mars'] = {
name: 'Mars',
type: 'celestial_body',
description: 'The fourth planet from the Sun and the second-smallest planet in the Solar System.',
confidence: 0.89,
mentions: 7,
};
// Add some relationships
relationships.push({
source: 'nasa',
target: 'mars',
relationship: 'explores',
confidence: 0.88,
}, {
source: 'spacex',
target: 'mars',
relationship: 'targets for exploration',
confidence: 0.86,
});
// Add a cluster
clusters['space_exploration'] = {
name: 'Space Exploration',
description: 'Organizations and targets involved in space exploration efforts',
entities: ['nasa', 'spacex', 'mars'],
confidence: 0.9,
};
}
if (queryWords.includes('climate') || queryWords.includes('environment')) {
entities['climate_change'] = {
name: 'Climate Change',
type: 'concept',
description: 'Long-term shifts in temperatures and weather patterns, primarily caused by human activities.',
confidence: 0.93,
mentions: 15,
};
entities['ipcc'] = {
name: 'IPCC',
type: 'organization',
description: 'The Intergovernmental Panel on Climate Change, the United Nations body for assessing the science related to climate change.',
confidence: 0.91,
mentions: 6,
};
// Add relationships
relationships.push({
source: 'ipcc',
target: 'climate_change',
relationship: 'studies',
confidence: 0.92,
});
// Add a cluster
clusters['climate_research'] = {
name: 'Climate Research',
description: 'Organizations and concepts related to climate science',
entities: ['climate_change', 'ipcc'],
confidence: 0.88,
};
}
// Include custom entity types if provided
if (options.customEntityTypes && options.customEntityTypes.length > 0) {
// This would normally extract entities of the specified types from the content
// For now, just add a note in the metadata
console.log(`Would extract custom entity types: ${options.customEntityTypes.join(', ')}`);
}
return {
entities,
relationships,
clusters,
};
}
/**
* Create a classification step
* @param options - Classification options
* @returns A configured classification step
*/
export function classify(options = {}) {
return createStep('Classify',
// Wrapper function that matches the expected signature
async (state, opts) => {
return executeClassifyStep(state, options);
}, options, {
// Mark as retryable by default for the entire step
retryable: true,
maxRetries: options.retry?.maxRetries || 2,
retryDelay: options.retry?.baseDelay || 1000,
backoffFactor: 2,
});
}
//# sourceMappingURL=classify.js.map