@restnfeel/agentc-starter-kit
Version:
한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템
233 lines (231 loc) • 7.12 kB
JavaScript
/**
 * Pre-processes free-text search queries written in English and/or Korean.
 *
 * A query is validated, normalized, optionally stripped of stop words,
 * reduced to a keyword list, optionally expanded with synonyms, and annotated
 * with a detected language, simple entities and a coarse intent label.
 */
class QueryProcessor {
  /**
   * @param {object} [config] - Overrides for the default settings.
   * @param {boolean} [config.enableSpellCheck=false] - Stored on the config;
   *   not consulted by any method of this class.
   * @param {boolean} [config.enableSynonymExpansion=true] - Append synonyms
   *   of known terms to the processed query.
   * @param {boolean} [config.enableStopWordRemoval=true] - Drop stop words
   *   from the processed query.
   * @param {number} [config.minQueryLength=3] - Minimum trimmed length.
   * @param {number} [config.maxQueryLength=500] - Maximum trimmed length.
   */
  constructor(config = {}) {
    this.config = {
      enableSpellCheck: false,
      enableSynonymExpansion: true,
      enableStopWordRemoval: true,
      minQueryLength: 3,
      maxQueryLength: 500,
      ...config,
    };
    this.initializeStopWords();
    this.initializeSynonyms();
  }

  /**
   * Runs the full processing pipeline on a query.
   *
   * @param {string} query - Raw user query.
   * @returns {Promise<{original: string, processed: string, keywords: string[],
   *   entities: string[], intent: string, language: string}>}
   * @throws {Error} When the query is not a string or its trimmed length is
   *   outside the configured bounds.
   */
  async processQuery(query) {
    if (!this.isValidQuery(query)) {
      throw new Error("Invalid query: too short or too long");
    }

    // Step 1: Normalize text
    let processed = this.normalizeText(query.trim());

    // Step 2: Remove stop words (optional)
    if (this.config.enableStopWordRemoval) {
      processed = this.removeStopWords(processed);
    }

    // Step 3: Extract keywords (before expansion, so synonyms never show up
    // as keywords of the user's own query)
    const keywords = this.extractKeywords(processed);

    // Step 4: Expand with synonyms (optional)
    if (this.config.enableSynonymExpansion) {
      processed = this.expandWithSynonyms(processed);
    }

    // Step 5: Detect language
    const language = this.detectLanguage(query);

    // Step 6: Extract entities. This must look at the ORIGINAL query:
    // normalization replaces "@", ".", "-" and "/" with spaces, which makes
    // the email, phone and date patterns impossible to match afterwards.
    const entities = this.extractEntities(query);

    // Step 7: Determine intent (basic implementation)
    const intent = this.determineIntent(query);

    return {
      original: query,
      processed: processed.trim(),
      keywords,
      entities,
      intent,
      language,
    };
  }

  /**
   * Checks that the query is a string whose trimmed length lies within
   * [minQueryLength, maxQueryLength].
   *
   * @param {*} query - Candidate query.
   * @returns {boolean} False for non-strings instead of throwing.
   */
  isValidQuery(query) {
    if (typeof query !== "string") {
      return false;
    }
    // Fall back to the defaults only when the setting is null/undefined, so an
    // explicit 0 is honored.
    const { minQueryLength, maxQueryLength } = this.config;
    const min = minQueryLength == null ? 3 : minQueryLength;
    const max = maxQueryLength == null ? 500 : maxQueryLength;
    const length = query.trim().length;
    return length >= min && length <= max;
  }

  /**
   * Lowercases the text, replaces every character that is not an ASCII word
   * character, whitespace or a Hangul syllable with a space, and collapses
   * runs of whitespace.
   *
   * @param {string} text
   * @returns {string}
   */
  normalizeText(text) {
    return text
      .toLowerCase()
      .replace(/[^\w\s가-힣]/g, " ") // Keep alphanumeric and Korean characters
      .replace(/\s+/g, " ")
      .trim();
  }

  /**
   * Removes every space-separated token that is in the stop-word set.
   *
   * @param {string} text - Space-separated text.
   * @returns {string}
   */
  removeStopWords(text) {
    return text
      .split(" ")
      .filter((word) => !this.stopWords.has(word))
      .join(" ");
  }

  /**
   * Returns up to ten distinct tokens ordered by descending frequency.
   *
   * @param {string} text - Space-separated text.
   * @returns {string[]}
   */
  extractKeywords(text) {
    // NOTE(review): the length filter also drops two-syllable Korean words
    // such as "검색"; confirm whether that is intended.
    const words = text.split(" ").filter((word) => word.length > 2);

    // Simple keyword extraction - can be enhanced with TF-IDF or other methods
    const wordFreq = new Map();
    for (const word of words) {
      wordFreq.set(word, (wordFreq.get(word) || 0) + 1);
    }

    return Array.from(wordFreq.entries())
      .sort((a, b) => b[1] - a[1])
      .map(([word]) => word)
      .slice(0, 10); // Top 10 keywords
  }

  /**
   * Appends the synonyms of every known term found in the text.
   *
   * Terms are looked up in the text as passed in, not in the growing result,
   * so appended synonyms cannot trigger further expansions.
   *
   * @param {string} text
   * @returns {string} The text followed by the space-separated synonyms, or
   *   the text unchanged when nothing matched.
   */
  expandWithSynonyms(text) {
    const additions = [];
    for (const [word, synonyms] of this.synonyms) {
      if (this.containsTerm(text, word)) {
        additions.push(...synonyms);
      }
    }
    return additions.length > 0 ? `${text} ${additions.join(" ")}` : text;
  }

  /**
   * Tests whether a term occurs in the text, case-insensitively.
   *
   * An edge of the term that is an ASCII word character must sit on a word
   * boundary, so "ai" does not match inside "email". Edges that are not ASCII
   * word characters (e.g. Hangul) get no boundary requirement, so Korean terms
   * still match when a particle is attached ("인공지능을").
   *
   * @param {string} text
   * @param {string} term
   * @returns {boolean}
   */
  containsTerm(text, term) {
    const needle = String(term);
    if (needle.length === 0) {
      return false;
    }
    const escaped = needle.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const prefix = /^\w/.test(needle) ? "\\b" : "";
    const suffix = /\w$/.test(needle) ? "\\b" : "";
    return new RegExp(prefix + escaped + suffix, "i").test(text);
  }

  /**
   * Returns "ko" when Hangul syllables make up more than 10% of the text's
   * characters, otherwise "en".
   *
   * @param {string} text
   * @returns {"ko"|"en"}
   */
  detectLanguage(text) {
    // Simple language detection
    const koreanMatches = text.match(/[가-힣]/g);
    if (koreanMatches && koreanMatches.length > text.length * 0.1) {
      return "ko";
    }
    return "en";
  }

  /**
   * Finds emails, phone numbers, dates and plain numbers in the text.
   *
   * Patterns are applied independently, so digits that belong to a phone
   * number or date are also reported as separate number entities.
   *
   * @param {string} text
   * @returns {string[]} Entries formatted as "type:value", grouped by type in
   *   the order email, phone, date, number.
   */
  extractEntities(text) {
    const entities = [];
    // Simple entity extraction patterns
    const patterns = {
      // The TLD class is [A-Za-z]; a "|" inside a character class is a literal.
      email: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g,
      phone: /\b\d{3}-\d{3,4}-\d{4}\b/g,
      date: /\b\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\b/g,
      number: /\b\d+\b/g,
    };
    for (const [type, pattern] of Object.entries(patterns)) {
      const matches = text.match(pattern);
      if (matches) {
        entities.push(...matches.map((match) => `${type}:${match}`));
      }
    }
    return entities;
  }

  /**
   * Classifies the query by the first matching question cue.
   *
   * English cues must be whole words (so "show" is not "how"); Korean cues
   * are matched as substrings.
   *
   * @param {string} query
   * @returns {"how-to"|"definition"|"location"|"time"|"explanation"|"general"}
   */
  determineIntent(query) {
    const lowerQuery = query.toLowerCase();
    // Simple intent classification; the first matching rule wins.
    const rules = [
      ["how-to", /\bhow\b/, "어떻게"],
      ["definition", /\bwhat\b/, "무엇"],
      ["location", /\bwhere\b/, "어디"],
      ["time", /\bwhen\b/, "언제"],
      ["explanation", /\bwhy\b/, "왜"],
    ];
    for (const [intent, englishCue, koreanCue] of rules) {
      if (englishCue.test(lowerQuery) || lowerQuery.includes(koreanCue)) {
        return intent;
      }
    }
    return "general";
  }

  /**
   * Populates `this.stopWords` with the built-in English and Korean stop words.
   */
  initializeStopWords() {
    // Combined English and Korean stop words
    const stopWordsList = [
      // English
      "a",
      "an",
      "and",
      "are",
      "as",
      "at",
      "be",
      "by",
      "for",
      "from",
      "has",
      "he",
      "in",
      "is",
      "it",
      "its",
      "of",
      "on",
      "that",
      "the",
      "to",
      "was",
      "will",
      "with",
      "or",
      "but",
      "not",
      "this",
      "these",
      "they",
      "their",
      "we",
      "you",
      "your",
      "have",
      "had",
      "can",
      "could",
      // Korean
      "그",
      "이",
      "저",
      "것",
      "수",
      "있",
      "없",
      "하",
      "되",
      "시",
      "가",
      "를",
      "에",
      "의",
      "은",
      "는",
      "을",
      "에서",
      "로",
      "으로",
      "와",
      "과",
      "도",
      "만",
      "뿐",
      "까지",
      "부터",
      "이다",
      "있다",
      "없다",
    ];
    this.stopWords = new Set(stopWordsList);
  }

  /**
   * Populates `this.synonyms` with the built-in term-to-synonyms map.
   */
  initializeSynonyms() {
    // Basic synonym mapping
    this.synonyms = new Map([
      ["ai", ["artificial intelligence", "machine learning", "deep learning"]],
      ["인공지능", ["AI", "머신러닝", "딥러닝", "기계학습"]],
      ["개발", ["프로그래밍", "코딩", "소프트웨어 개발"]],
      ["문서", ["파일", "자료", "데이터"]],
      ["검색", ["찾기", "조회", "탐색"]],
    ]);
  }

  /**
   * Registers (or replaces) the synonyms of a term.
   *
   * @param {string} word - Term to look for; matched case-insensitively.
   * @param {string[]} synonyms - Strings appended when the term is found.
   */
  addSynonyms(word, synonyms) {
    this.synonyms.set(word, synonyms);
  }

  /**
   * Adds custom stop words; each is lowercased before being stored.
   *
   * @param {string[]} words
   */
  addStopWords(words) {
    words.forEach((word) => this.stopWords.add(word.toLowerCase()));
  }
}
export { QueryProcessor };
//# sourceMappingURL=query-processor.js.map