UNPKG

@restnfeel/agentc-starter-kit

Version:

한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템

289 lines (249 loc) 6.84 kB
/** Options controlling which steps of the {@link QueryProcessor} pipeline run. */
export interface QueryProcessingConfig {
  /** Accepted and defaulted to `false`; no spell-check step is implemented in this class. */
  enableSpellCheck?: boolean;
  /** Append configured synonyms to the processed query (default `true`). */
  enableSynonymExpansion?: boolean;
  /** Drop stop-word tokens from the processed query (default `true`). */
  enableStopWordRemoval?: boolean;
  /** Minimum accepted length of the trimmed query (default 3). `0` is honored. */
  minQueryLength?: number;
  /** Maximum accepted length of the trimmed query (default 500). */
  maxQueryLength?: number;
}

/** Result of {@link QueryProcessor.processQuery}. */
export interface ProcessedQuery {
  /** The query exactly as passed in. */
  original: string;
  /** Normalized text after optional stop-word removal and synonym expansion. */
  processed: string;
  /** Up to ten tokens longer than two characters, most frequent first. */
  keywords: string[];
  /** Matches formatted as `type:value` (types: email, phone, date, number). */
  entities?: string[];
  /** One of "how-to", "definition", "location", "time", "explanation", "general". */
  intent?: string;
  /** "ko" or "en". */
  language?: string;
}

const DEFAULT_MIN_QUERY_LENGTH = 3;
const DEFAULT_MAX_QUERY_LENGTH = 500;
const MAX_KEYWORDS = 10;

/** Built-in English and Korean stop words. */
const DEFAULT_STOP_WORDS: readonly string[] = [
  // English
  "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
  "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
  "to", "was", "will", "with", "or", "but", "not", "this", "these",
  "they", "their", "we", "you", "your", "have", "had", "can", "could",
  // Korean
  "그", "이", "저", "것", "수", "있", "없", "하", "되", "시",
  "가", "를", "에", "의", "은", "는", "을", "에서", "로", "으로",
  "와", "과", "도", "만", "뿐", "까지", "부터", "이다", "있다", "없다",
];

/** Built-in synonym table: trigger term -> terms appended to the query. */
const DEFAULT_SYNONYMS: ReadonlyArray<readonly [string, readonly string[]]> = [
  ["ai", ["artificial intelligence", "machine learning", "deep learning"]],
  ["인공지능", ["AI", "머신러닝", "딥러닝", "기계학습"]],
  ["개발", ["프로그래밍", "코딩", "소프트웨어 개발"]],
  ["문서", ["파일", "자료", "데이터"]],
  ["검색", ["찾기", "조회", "탐색"]],
];

/** Entity patterns, applied in this order; each match is reported as `type:match`. */
const ENTITY_PATTERNS: ReadonlyArray<readonly [string, RegExp]> = [
  ["email", /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g],
  ["phone", /\b\d{3}-\d{3,4}-\d{4}\b/g],
  ["date", /\b\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\b/g],
  ["number", /\b\d+\b/g],
];

/**
 * Intent rules, checked in order; the first hit wins. English cue words must
 * appear as whole words so that e.g. "show" does not read as "how". Korean cues
 * are matched as substrings because particles attach directly to the word.
 */
const INTENT_RULES: ReadonlyArray<{ intent: string; english: RegExp; korean: string }> = [
  { intent: "how-to", english: /\bhow\b/, korean: "어떻게" },
  { intent: "definition", english: /\bwhat\b/, korean: "무엇" },
  { intent: "location", english: /\bwhere\b/, korean: "어디" },
  { intent: "time", english: /\bwhen\b/, korean: "언제" },
  { intent: "explanation", english: /\bwhy\b/, korean: "왜" },
];

interface SynonymEntry {
  /** Pre-compiled matcher for the trigger term. */
  matcher: RegExp;
  synonyms: string[];
}

/**
 * Builds a matcher for a synonym trigger term. A `\b` is added on each edge of
 * the term that is an ASCII word character, so "ai" matches in "ai tools" and
 * "ai는" but not inside "email". Edges that are not ASCII word characters
 * (e.g. Hangul) get no boundary, which keeps substring semantics there.
 */
function buildTermMatcher(term: string): RegExp {
  const escaped = term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const leading = /^\w/.test(term) ? "\\b" : "";
  const trailing = /\w$/.test(term) ? "\\b" : "";
  return new RegExp(leading + escaped + trailing);
}

/**
 * Rule-based query preprocessor for English/Korean search text: normalizes the
 * query, optionally removes stop words and appends synonyms, and derives
 * keywords, entities, a coarse intent label and a language guess.
 */
export class QueryProcessor {
  private config: QueryProcessingConfig;
  private readonly stopWords: Set<string>;
  private readonly synonyms: Map<string, SynonymEntry>;

  /**
   * @param config Overrides for the defaults (spell check off, synonym
   *   expansion on, stop-word removal on, length limits 3..500).
   */
  constructor(config: QueryProcessingConfig = {}) {
    this.config = {
      enableSpellCheck: false,
      enableSynonymExpansion: true,
      enableStopWordRemoval: true,
      minQueryLength: DEFAULT_MIN_QUERY_LENGTH,
      maxQueryLength: DEFAULT_MAX_QUERY_LENGTH,
      ...config,
    };
    this.stopWords = new Set(DEFAULT_STOP_WORDS);
    this.synonyms = new Map();
    for (const [word, synonyms] of DEFAULT_SYNONYMS) {
      this.addSynonyms(word, [...synonyms]);
    }
  }

  /**
   * Runs the full pipeline on one query.
   *
   * @param query Raw user query.
   * @returns The processed query together with keywords, entities, intent and language.
   * @throws Error if the trimmed query length is outside the configured limits.
   */
  async processQuery(query: string): Promise<ProcessedQuery> {
    if (!this.isValidQuery(query)) {
      throw new Error("Invalid query: too short or too long");
    }

    const trimmed = query.trim();

    // Step 1: normalize text
    let processed = this.normalizeText(trimmed);

    // Step 2: remove stop words (optional)
    if (this.config.enableStopWordRemoval) {
      processed = this.removeStopWords(processed);
    }

    // Step 3: extract keywords (before expansion, so synonyms are not counted)
    const keywords = this.extractKeywords(processed);

    // Step 4: expand with synonyms (optional)
    if (this.config.enableSynonymExpansion) {
      processed = this.expandWithSynonyms(processed);
    }

    // Step 5: detect language
    const language = this.detectLanguage(query);

    // Step 6: extract entities from the un-normalized query. Normalization
    // replaces "@", "-", "." and "/" with spaces, which would make the email,
    // phone and date patterns unmatchable.
    const entities = this.extractEntities(trimmed);

    // Step 7: determine intent
    const intent = this.determineIntent(query);

    return {
      original: query,
      processed: processed.trim(),
      keywords,
      entities,
      intent,
      language,
    };
  }

  /** True when the trimmed query length lies within the configured limits. */
  private isValidQuery(query: string): boolean {
    const length = query.trim().length;
    // `??` rather than `||` so that an explicit limit of 0 is respected.
    const min = this.config.minQueryLength ?? DEFAULT_MIN_QUERY_LENGTH;
    const max = this.config.maxQueryLength ?? DEFAULT_MAX_QUERY_LENGTH;
    return length >= min && length <= max;
  }

  /**
   * Lower-cases the text, replaces every character that is not an ASCII word
   * character, whitespace or a Hangul syllable with a space, and collapses
   * whitespace runs to single spaces.
   */
  private normalizeText(text: string): string {
    return text
      .toLowerCase()
      .replace(/[^\w\s가-힣]/g, " ")
      .replace(/\s+/g, " ")
      .trim();
  }

  /**
   * Removes tokens that exactly equal a stop word.
   * NOTE(review): Korean particles usually attach to the preceding word, so
   * whole-token matching will rarely remove them — confirm whether that is intended.
   */
  private removeStopWords(text: string): string {
    return text
      .split(" ")
      .filter((word) => !this.stopWords.has(word))
      .join(" ");
  }

  /**
   * Returns up to ten distinct tokens longer than two characters, ordered by
   * descending frequency (ties keep first-seen order).
   * NOTE(review): the length filter also drops two-syllable Korean words such
   * as "검색" — confirm whether that is intended.
   */
  private extractKeywords(text: string): string[] {
    const frequency = new Map<string, number>();
    for (const word of text.split(" ")) {
      if (word.length > 2) {
        frequency.set(word, (frequency.get(word) ?? 0) + 1);
      }
    }

    return Array.from(frequency.entries())
      .sort((a, b) => b[1] - a[1])
      .map(([word]) => word)
      .slice(0, MAX_KEYWORDS);
  }

  /**
   * Appends the synonyms of every trigger term found in `text`. Matching is
   * done against the input text only, so appended synonyms never trigger
   * further expansions.
   */
  private expandWithSynonyms(text: string): string {
    const additions: string[] = [];
    for (const { matcher, synonyms } of this.synonyms.values()) {
      if (matcher.test(text)) {
        additions.push(...synonyms);
      }
    }
    return additions.length > 0 ? `${text} ${additions.join(" ")}` : text;
  }

  /** Returns "ko" when more than 10% of the characters are Hangul syllables, else "en". */
  private detectLanguage(text: string): string {
    const koreanMatches = text.match(/[가-힣]/g);
    if (koreanMatches && koreanMatches.length > text.length * 0.1) {
      return "ko";
    }
    return "en";
  }

  /**
   * Collects email, phone, date and number matches as `type:match` strings.
   * The number pattern also matches the digit groups inside phones and dates.
   */
  private extractEntities(text: string): string[] {
    const entities: string[] = [];
    for (const [type, pattern] of ENTITY_PATTERNS) {
      // String.prototype.match with a global regex starts from index 0 on every
      // call, so sharing the module-level patterns is safe.
      const matches = text.match(pattern);
      if (matches) {
        entities.push(...matches.map((match) => `${type}:${match}`));
      }
    }
    return entities;
  }

  /** Classifies the query by the first matching cue word; "general" if none match. */
  private determineIntent(query: string): string {
    const lowerQuery = query.toLowerCase();
    const rule = INTENT_RULES.find(
      ({ english, korean }) => english.test(lowerQuery) || lowerQuery.includes(korean),
    );
    return rule ? rule.intent : "general";
  }

  /**
   * Registers (or replaces) the synonyms for a trigger term.
   *
   * The term is trimmed and lower-cased because queries are lower-cased before
   * matching; an empty term is ignored. The synonym array is copied.
   *
   * @param word Trigger term.
   * @param synonyms Terms appended to the processed query when `word` occurs.
   */
  addSynonyms(word: string, synonyms: string[]): void {
    const key = word.trim().toLowerCase();
    if (key === "") {
      return;
    }
    this.synonyms.set(key, {
      matcher: buildTermMatcher(key),
      synonyms: [...synonyms],
    });
  }

  /**
   * Adds custom stop words (stored lower-cased).
   *
   * @param words Words to treat as stop words.
   */
  addStopWords(words: string[]): void {
    words.forEach((word) => this.stopWords.add(word.toLowerCase()));
  }
}