autosnippet
Version:
Extract code patterns into a knowledge base for AI coding assistants
319 lines (318 loc) • 11.2 kB
JavaScript
/**
* FieldWeightedScorer — 加权字段匹配评分器
*
* 替代 BM25Scorer 作为结构化知识库的默认搜索评分引擎。
*
* 设计动机:
* - BM25 将所有字段拼接为文本做统计评分,tokenize 去重导致 TF 恒为 1,BM25F boost 失效
* - 对于 ~50–500 条结构化知识条目,BM25 的大规模语料假设不成立
* - FieldWeightedScorer 对每个字段独立打分并加权合并,精确匹配 > token 重叠 > IDF 加权
*
* 字段权重:
* trigger (5.0) > title (3.0) > tags (2.0) > description (1.5) > content (1.0) > facets (0.5)
*
* @module FieldWeightedScorer
*/
import { tokenize } from './tokenizer.js';
// ── 字段权重常量(可调) ──
const TRIGGER_WEIGHT = 5.0;
const TITLE_WEIGHT = 3.0;
const TAG_WEIGHT = 2.0;
const DESCRIPTION_WEIGHT = 1.5;
const CONTENT_WEIGHT = 1.0;
const FACET_WEIGHT = 0.5;
/**
* FieldWeightedScorer — 加权字段匹配评分器
*
* 接口与 BM25Scorer 完全兼容(实现 Scorer 接口),可作为 drop-in 替换。
*/
export class FieldWeightedScorer {
avgLength;
docFreq;
documents;
totalDocs;
_idIndex;
_totalLength;
constructor() {
this.documents = [];
this.totalDocs = 0;
this.docFreq = {};
this._idIndex = new Map();
this._totalLength = 0;
this.avgLength = 0;
}
/** 添加文档到索引 */
addDocument(id, text, meta = {}) {
if (this._idIndex.has(id)) {
this.removeDocument(id);
}
// 从 meta 提取结构化字段
const trigger = meta.trigger || '';
const title = meta.title || '';
const description = meta.description || '';
const tags = Array.isArray(meta.tags) ? meta.tags : [];
const language = meta.language || '';
const category = meta.category || '';
const knowledgeType = meta.knowledgeType || '';
const contentText = meta.contentText || '';
// 独立分词每个字段
const triggerTokens = tokenize(trigger);
const titleTokens = tokenize(title);
const descTokens = tokenize(description);
// contentText 优先;若 meta 无 contentText 则用拼接文本 text 作为回退
const contentTokens = tokenize(contentText || text);
// 合并所有唯一 token 用于 DF 计算
const allUnique = new Set();
for (const t of triggerTokens) {
allUnique.add(t);
}
for (const t of titleTokens) {
allUnique.add(t);
}
for (const t of descTokens) {
allUnique.add(t);
}
for (const t of contentTokens) {
allUnique.add(t);
}
for (const tag of tags) {
for (const t of tokenize(tag)) {
allUnique.add(t);
}
}
const doc = {
id,
fields: { trigger, title, description, tags, language, category, knowledgeType },
tokenizedFields: {
trigger: triggerTokens,
title: titleTokens,
description: descTokens,
content: contentTokens,
allUnique,
},
meta,
};
const idx = this.documents.length;
this.documents.push(doc);
this._idIndex.set(id, idx);
for (const token of allUnique) {
this.docFreq[token] = (this.docFreq[token] || 0) + 1;
}
this.totalDocs = this._idIndex.size;
this._totalLength += allUnique.size;
this.avgLength = this.totalDocs > 0 ? this._totalLength / this.totalDocs : 0;
}
/**
* 移除文档(tombstone + 懒压缩)
* @returns 是否成功移除
*/
removeDocument(id) {
const idx = this._idIndex.get(id);
if (idx === undefined) {
return false;
}
const doc = this.documents[idx];
if (!doc) {
return false;
}
for (const token of doc.tokenizedFields.allUnique) {
if (this.docFreq[token]) {
this.docFreq[token]--;
if (this.docFreq[token] <= 0) {
delete this.docFreq[token];
}
}
}
this._totalLength -= doc.tokenizedFields.allUnique.size;
this.documents[idx] = null;
this._idIndex.delete(id);
this.totalDocs = this._idIndex.size;
this.avgLength = this.totalDocs > 0 ? this._totalLength / this.totalDocs : 0;
const nullCount = this.documents.length - this.totalDocs;
if (this.documents.length > 100 && nullCount / this.documents.length > 0.3) {
this._compact();
}
return true;
}
/** 更新文档(remove + add) */
updateDocument(id, text, meta = {}) {
this.removeDocument(id);
this.addDocument(id, text, meta);
}
/** 检查文档是否存在 */
hasDocument(id) {
return this._idIndex.has(id);
}
/** 清空索引 */
clear() {
this.documents = [];
this.docFreq = {};
this.totalDocs = 0;
this._totalLength = 0;
this.avgLength = 0;
this._idIndex.clear();
}
/** 压缩 documents 数组,清除 tombstone 空洞 */
_compact() {
const alive = this.documents.filter((d) => d !== null);
this.documents = alive;
this._idIndex.clear();
for (let i = 0; i < alive.length; i++) {
this._idIndex.set(alive[i].id, i);
}
}
/** 搜索:对每个文档按字段加权评分,返回降序结果 */
search(query, limit = 20) {
const queryTokens = tokenize(query);
if (queryTokens.length === 0) {
return [];
}
const scores = [];
for (const doc of this.documents) {
if (!doc) {
continue;
}
let totalScore = 0;
// 1. Trigger 评分 — 最高权重,精确标识
const triggerString = this._stringMatchScore(query, doc.fields.trigger);
const triggerToken = this._tokenOverlap(queryTokens, doc.tokenizedFields.trigger);
totalScore += TRIGGER_WEIGHT * Math.max(triggerString, triggerToken);
// 2. Title 评分 — 主要描述性字段
const titleString = this._stringMatchScore(query, doc.fields.title);
const titleToken = this._tokenOverlap(queryTokens, doc.tokenizedFields.title);
totalScore += TITLE_WEIGHT * Math.max(titleString, titleToken);
// 3. Tags 评分 — 分类标记
totalScore += TAG_WEIGHT * this._tagScore(queryTokens, doc.fields.tags);
// 4. Description 评分 — IDF 加权 token overlap
totalScore +=
DESCRIPTION_WEIGHT * this._idfWeightedOverlap(queryTokens, doc.tokenizedFields.description);
// 5. Content 评分 — IDF 加权 token overlap
totalScore +=
CONTENT_WEIGHT * this._idfWeightedOverlap(queryTokens, doc.tokenizedFields.content);
// 6. Facet 评分 — language/category/knowledgeType 精确匹配
totalScore += FACET_WEIGHT * this._facetScore(queryTokens, doc.fields);
if (totalScore > 0) {
scores.push({ id: doc.id, score: totalScore, meta: doc.meta });
}
}
scores.sort((a, b) => b.score - a.score);
return scores.slice(0, limit);
}
// ── 内部评分方法 ──
/** 字符串级别匹配评分(用于 trigger / title) */
_stringMatchScore(query, field) {
if (!field) {
return 0;
}
const q = query.toLowerCase();
const f = field.toLowerCase();
if (f === q) {
return 1.0;
}
if (f.startsWith(q)) {
return 0.7;
}
if (f.includes(q)) {
return 0.5;
}
if (q.includes(f) && f.length > 3) {
return 0.3;
}
return 0;
}
/** Token 集合重叠率(查询侧召回) */
_tokenOverlap(queryTokens, fieldTokens) {
if (queryTokens.length === 0) {
return 0;
}
const fieldSet = new Set(fieldTokens);
let matched = 0;
for (const qt of queryTokens) {
if (fieldSet.has(qt)) {
matched++;
}
}
return matched / queryTokens.length;
}
/** IDF 加权 token overlap(用于长文本字段) */
_idfWeightedOverlap(queryTokens, fieldTokens) {
if (queryTokens.length === 0) {
return 0;
}
const fieldSet = new Set(fieldTokens);
let matchedIdf = 0;
let totalIdf = 0;
for (const qt of queryTokens) {
const idf = this._idf(qt);
totalIdf += idf;
if (fieldSet.has(qt)) {
matchedIdf += idf;
}
}
return totalIdf > 0 ? matchedIdf / totalIdf : 0;
}
/** Tag 匹配评分 */
_tagScore(queryTokens, tags) {
if (tags.length === 0 || queryTokens.length === 0) {
return 0;
}
let score = 0;
const qtSet = new Set(queryTokens);
for (const tag of tags) {
const lowTag = tag.toLowerCase();
// 精确 token 匹配
if (qtSet.has(lowTag)) {
score += 1.0;
continue;
}
// 部分匹配:query token 包含 tag 或 tag 包含 query token
let partialFound = false;
for (const qt of queryTokens) {
if (lowTag.includes(qt) || qt.includes(lowTag)) {
score += 0.5;
partialFound = true;
break;
}
}
if (!partialFound) {
// 对 tag 分词再匹配
const tagTokens = tokenize(tag);
for (const tt of tagTokens) {
if (qtSet.has(tt)) {
score += 0.3;
break;
}
}
}
}
return Math.min(score / queryTokens.length, 1.0);
}
/** Facet 匹配评分(language / category / knowledgeType) */
_facetScore(queryTokens, fields) {
const facets = [fields.language, fields.category, fields.knowledgeType].filter(Boolean);
if (facets.length === 0) {
return 0;
}
let matched = 0;
const qtSet = new Set(queryTokens);
for (const facet of facets) {
const lower = facet.toLowerCase();
if (qtSet.has(lower)) {
matched++;
continue;
}
for (const ft of tokenize(facet)) {
if (qtSet.has(ft)) {
matched++;
break;
}
}
}
return matched / facets.length;
}
/** 计算 IDF(平滑,始终为正) */
_idf(token) {
const df = this.docFreq[token] || 0;
return Math.log2(1 + this.totalDocs / (df + 1));
}
}