@restnfeel/agentc-starter-kit
Version:
한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템
289 lines (249 loc) • 6.84 kB
text/typescript
/**
 * Options accepted by the QueryProcessor constructor.
 * Every field is optional; the constructor supplies a default for each
 * field that is omitted.
 */
export interface QueryProcessingConfig {
/** Spell-check toggle (constructor default: false). The QueryProcessor class in this file does not read it. */
enableSpellCheck?: boolean;
/** When true, synonyms of matched words are appended to the processed query (constructor default: true). */
enableSynonymExpansion?: boolean;
/** When true, stop words are removed from the normalized query (constructor default: true). */
enableStopWordRemoval?: boolean;
/** Minimum accepted length of the trimmed query (constructor default: 3). */
minQueryLength?: number;
/** Maximum accepted length of the trimmed query (constructor default: 500). */
maxQueryLength?: number;
}
/**
 * Result object returned by QueryProcessor.processQuery.
 */
export interface ProcessedQuery {
/** The query string exactly as it was passed in. */
original: string;
/** The query after normalization and the optional stop-word / synonym steps. */
processed: string;
/** Keywords taken from the processed text, most frequent first (at most 10). */
keywords: string[];
/** Detected entities, each formatted as "type:value" (types used: email, phone, date, number). */
entities?: string[];
/** Intent label: "how-to", "definition", "location", "time", "explanation" or "general". */
intent?: string;
/** Language code produced by the processor: "ko" or "en". */
language?: string;
}
/**
 * Pre-processes free-text search queries written in English and/or Korean.
 *
 * For each query it produces a normalized string (optionally without stop
 * words and with synonyms appended), a keyword list, simple pattern-based
 * entities, a coarse intent label and a language code.
 */
export class QueryProcessor {
  /**
   * Entity patterns in evaluation order. More specific patterns come first so
   * that their matches can be masked before the generic `number` pattern runs.
   */
  private static readonly ENTITY_PATTERNS: ReadonlyArray<
    readonly [string, RegExp]
  > = [
    ["email", /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g],
    ["phone", /\b\d{3}-\d{3,4}-\d{4}\b/g],
    ["date", /\b\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\b/g],
    ["number", /\b\d+\b/g],
  ];

  /**
   * Intent rules in priority order (first match wins). English cue words are
   * matched as whole words; Korean cues are matched as substrings because
   * Korean attaches particles directly to the word.
   */
  private static readonly INTENT_RULES: ReadonlyArray<{
    intent: string;
    english: RegExp;
    korean: string;
  }> = [
    { intent: "how-to", english: /\bhow\b/, korean: "어떻게" },
    { intent: "definition", english: /\bwhat\b/, korean: "무엇" },
    { intent: "location", english: /\bwhere\b/, korean: "어디" },
    { intent: "time", english: /\bwhen\b/, korean: "언제" },
    { intent: "explanation", english: /\bwhy\b/, korean: "왜" },
  ];

  private readonly config: Required<QueryProcessingConfig>;
  private readonly stopWords: Set<string>;
  private readonly synonyms: Map<string, string[]>;

  /**
   * @param config Optional overrides. A field that is omitted (or explicitly
   *   `undefined`) falls back to its default; falsy values such as `0` or
   *   `false` are honored as given.
   */
  constructor(config: QueryProcessingConfig = {}) {
    this.config = {
      enableSpellCheck: config.enableSpellCheck ?? false,
      enableSynonymExpansion: config.enableSynonymExpansion ?? true,
      enableStopWordRemoval: config.enableStopWordRemoval ?? true,
      minQueryLength: config.minQueryLength ?? 3,
      maxQueryLength: config.maxQueryLength ?? 500,
    };
    this.stopWords = QueryProcessor.defaultStopWords();
    this.synonyms = QueryProcessor.defaultSynonyms();
  }

  /**
   * Runs the full processing pipeline on a query.
   *
   * @param query Raw user query.
   * @returns The original query together with its processed form, keywords,
   *   entities, intent and language.
   * @throws Error when the trimmed query length is outside the configured
   *   `minQueryLength`..`maxQueryLength` range.
   */
  async processQuery(query: string): Promise<ProcessedQuery> {
    if (!this.isValidQuery(query)) {
      throw new Error("Invalid query: too short or too long");
    }

    // Step 1: normalize (lower-case, strip punctuation, collapse whitespace).
    let processed = this.normalizeText(query);

    // Step 2: drop stop words (optional).
    if (this.config.enableStopWordRemoval) {
      processed = this.removeStopWords(processed);
    }

    // Step 3: keywords are taken before synonym expansion so that they only
    // contain words the user actually typed.
    const keywords = this.extractKeywords(processed);

    // Step 4: append synonyms (optional).
    if (this.config.enableSynonymExpansion) {
      processed = this.expandWithSynonyms(processed);
    }

    return {
      original: query,
      processed: processed.trim(),
      keywords,
      // Entities, intent and language are derived from the raw query:
      // normalization removes the punctuation ("@", ".", "-", "/") that the
      // entity patterns depend on.
      entities: this.extractEntities(query),
      intent: this.determineIntent(query),
      language: this.detectLanguage(query),
    };
  }

  /** Checks the trimmed query length against the configured bounds. */
  private isValidQuery(query: string): boolean {
    const length = query.trim().length;
    return (
      length >= this.config.minQueryLength &&
      length <= this.config.maxQueryLength
    );
  }

  /**
   * Lower-cases the text, replaces every character that is not an ASCII word
   * character, whitespace or a Hangul syllable with a space, and collapses
   * runs of whitespace.
   */
  private normalizeText(text: string): string {
    return text
      .toLowerCase()
      .replace(/[^\w\s가-힣]/g, " ")
      .replace(/\s+/g, " ")
      .trim();
  }

  /** Removes every space-separated token that is in the stop-word set. */
  private removeStopWords(text: string): string {
    return text
      .split(" ")
      .filter((token) => !this.stopWords.has(token))
      .join(" ");
  }

  /**
   * Returns up to 10 distinct tokens longer than two characters, ordered by
   * descending frequency (ties keep first-occurrence order).
   *
   * NOTE(review): the `length > 2` cut-off also drops two-syllable Korean
   * words such as "검색"; it is kept unchanged to preserve existing output.
   */
  private extractKeywords(text: string): string[] {
    const frequency = new Map<string, number>();
    for (const token of text.split(" ")) {
      if (token.length > 2) {
        frequency.set(token, (frequency.get(token) ?? 0) + 1);
      }
    }
    return Array.from(frequency.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, 10)
      .map(([token]) => token);
  }

  /**
   * Appends the synonyms of every registered word that occurs in `text`.
   * Matching is done against the input text only, so synonyms appended for
   * one entry never trigger another entry.
   */
  private expandWithSynonyms(text: string): string {
    let expanded = text;
    for (const [word, synonyms] of this.synonyms) {
      if (this.containsTerm(text, word)) {
        expanded += " " + synonyms.join(" ");
      }
    }
    return expanded;
  }

  /**
   * Reports whether `term` occurs in `text`. Where the term starts or ends
   * with an ASCII letter, digit or underscore, the neighbouring character in
   * `text` must not be one as well, so "ai" does not match inside "email".
   * Terms that start/end with other characters (e.g. Hangul) match as plain
   * substrings, which keeps particle-suffixed Korean words matching.
   */
  private containsTerm(text: string, term: string): boolean {
    if (term.length === 0) {
      return false;
    }
    const isWordChar = (ch: string | undefined): boolean =>
      ch !== undefined && /[A-Za-z0-9_]/.test(ch);
    const startsWithWordChar = isWordChar(term[0]);
    const endsWithWordChar = isWordChar(term[term.length - 1]);

    let index = text.indexOf(term);
    while (index !== -1) {
      const before = index > 0 ? text[index - 1] : undefined;
      const after = text[index + term.length];
      const leftOk = !startsWithWordChar || !isWordChar(before);
      const rightOk = !endsWithWordChar || !isWordChar(after);
      if (leftOk && rightOk) {
        return true;
      }
      index = text.indexOf(term, index + 1);
    }
    return false;
  }

  /**
   * Returns "ko" when more than 10% of the characters are Hangul syllables,
   * otherwise "en".
   */
  private detectLanguage(text: string): string {
    const hangulCount = (text.match(/[가-힣]/g) ?? []).length;
    return hangulCount > text.length * 0.1 ? "ko" : "en";
  }

  /**
   * Extracts entities as "type:value" strings, grouped by type in the order
   * email, phone, date, number. Each match is blanked out before the next
   * pattern runs so the digits of a phone number or date are not reported a
   * second time as standalone numbers.
   */
  private extractEntities(text: string): string[] {
    const entities: string[] = [];
    let remaining = text;
    for (const [type, pattern] of QueryProcessor.ENTITY_PATTERNS) {
      remaining = remaining.replace(pattern, (match) => {
        entities.push(`${type}:${match}`);
        return " ".repeat(match.length);
      });
    }
    return entities;
  }

  /**
   * Classifies the query by the first matching cue word; returns "general"
   * when no cue is present.
   */
  private determineIntent(query: string): string {
    const lowerQuery = query.toLowerCase();
    for (const rule of QueryProcessor.INTENT_RULES) {
      if (rule.english.test(lowerQuery) || lowerQuery.includes(rule.korean)) {
        return rule.intent;
      }
    }
    return "general";
  }

  /** Builds a fresh set of the built-in English and Korean stop words. */
  private static defaultStopWords(): Set<string> {
    return new Set<string>([
      // English
      "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
      "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
      "to", "was", "will", "with", "or", "but", "not", "this", "these",
      "they", "their", "we", "you", "your", "have", "had", "can", "could",
      // Korean
      "그", "이", "저", "것", "수", "있", "없", "하", "되", "시",
      "가", "를", "에", "의", "은", "는", "을", "에서", "로", "으로",
      "와", "과", "도", "만", "뿐", "까지", "부터", "이다", "있다", "없다",
    ]);
  }

  /** Builds a fresh copy of the built-in synonym table. */
  private static defaultSynonyms(): Map<string, string[]> {
    return new Map<string, string[]>([
      ["ai", ["artificial intelligence", "machine learning", "deep learning"]],
      ["인공지능", ["AI", "머신러닝", "딥러닝", "기계학습"]],
      ["개발", ["프로그래밍", "코딩", "소프트웨어 개발"]],
      ["문서", ["파일", "자료", "데이터"]],
      ["검색", ["찾기", "조회", "탐색"]],
    ]);
  }

  /**
   * Registers (or replaces) the synonyms for a word.
   *
   * @param word Word to expand. It is trimmed and lower-cased because queries
   *   are lower-cased before synonym matching.
   * @param synonyms Terms appended to the processed query when `word` occurs.
   */
  addSynonyms(word: string, synonyms: string[]): void {
    this.synonyms.set(word.trim().toLowerCase(), [...synonyms]);
  }

  /**
   * Adds custom stop words; each word is stored lower-cased.
   *
   * @param words Words to remove from queries when stop-word removal is on.
   */
  addStopWords(words: string[]): void {
    for (const word of words) {
      this.stopWords.add(word.toLowerCase());
    }
  }
}