autosnippet
Version:
Extract code patterns into a knowledge base for AI coding assistants
141 lines (140 loc) • 4.96 kB
JavaScript
/**
* BM25Scorer — BM25 full-text search scorer
*
* Standalone module extracted from SearchEngine.ts.
* Supports incremental add/remove/update, tombstone compaction, and O(1) lookup by ID.
*
* @module BM25Scorer
*/
import { BM25_B, BM25_K1, tokenize } from './tokenizer.js';
/**
 * BM25 scorer: an in-memory index that ranks documents against a query
 * using the BM25 formula.
 *
 * Documents can be added, replaced and removed incrementally. Removed
 * documents leave a `null` tombstone in `documents`; the array is compacted
 * once more than 30% of a >100-slot array is tombstones.
 *
 * Token dictionaries (`docFreq` and each document's `tokenFreq`) are
 * null-prototype objects so that tokens such as `constructor`, `toString`
 * or `__proto__` are treated as ordinary keys instead of resolving to
 * members inherited from `Object.prototype`.
 */
export class BM25Scorer {
  /** id → index into `documents`. */
  _idIndex;
  /** Sum of the token counts of all live documents. */
  _totalLength;
  /** Mean token count of live documents (0 when the index is empty). */
  avgLength;
  /** token → number of live documents containing that token. */
  docFreq;
  /** Array of `{ id, tokens, tokenFreq, length, meta }` entries, or `null` for tombstones. */
  documents;
  /** Number of live documents. */
  totalDocs;

  constructor() {
    this.documents = [];
    this.avgLength = 0;
    this.docFreq = Object.create(null);
    this.totalDocs = 0;
    this._totalLength = 0;
    this._idIndex = new Map();
  }

  /**
   * Add a document to the index. If `id` is already indexed, the previous
   * version is removed first, so repeated calls with the same id are safe.
   *
   * @param {*} id - Document identifier (used as a Map key).
   * @param {string} text - Raw text, split into tokens with `tokenize`.
   * @param {object} [meta={}] - Opaque payload returned with search hits.
   */
  addDocument(id, text, meta = {}) {
    if (this._idIndex.has(id)) {
      this.removeDocument(id);
    }

    const tokens = tokenize(text);

    // Per-document term frequencies, computed once so search() can look up TF directly.
    const tokenFreq = Object.create(null);
    for (const token of tokens) {
      tokenFreq[token] = (tokenFreq[token] ?? 0) + 1;
    }

    // Read the slot index only after the removal above, which may have compacted the array.
    const slot = this.documents.length;
    this.documents.push({ id, tokens, tokenFreq, length: tokens.length, meta });
    this._idIndex.set(id, slot);

    // Document frequency counts each distinct token once per document.
    for (const token of new Set(tokens)) {
      this.docFreq[token] = (this.docFreq[token] ?? 0) + 1;
    }

    this._totalLength += tokens.length;
    this._refreshStats();
  }

  /**
   * Remove a document from the index.
   *
   * The slot is replaced by a `null` tombstone; the array is compacted when
   * it has more than 100 slots and over 30% of them are tombstones.
   *
   * @param {*} id - Identifier of the document to remove.
   * @returns {boolean} `true` if a document was removed, `false` if `id` is not indexed.
   */
  removeDocument(id) {
    const slot = this._idIndex.get(id);
    if (slot === undefined) {
      return false;
    }
    const doc = this.documents[slot];
    if (!doc) {
      return false; // slot already holds a tombstone
    }

    // Undo this document's contribution to the document frequencies.
    for (const token of new Set(doc.tokens)) {
      const remaining = (this.docFreq[token] ?? 0) - 1;
      if (remaining > 0) {
        this.docFreq[token] = remaining;
      } else {
        delete this.docFreq[token];
      }
    }

    this._totalLength -= doc.length;
    this.documents[slot] = null;
    this._idIndex.delete(id);
    this._refreshStats();

    const tombstones = this.documents.length - this.totalDocs;
    if (this.documents.length > 100 && tombstones / this.documents.length > 0.3) {
      this._compact();
    }
    return true;
  }

  /**
   * Replace the indexed content of a document (or add it if it is missing).
   *
   * @param {*} id - Document identifier.
   * @param {string} text - New raw text.
   * @param {object} [meta={}] - New payload.
   */
  updateDocument(id, text, meta = {}) {
    // addDocument() already removes any existing version of `id` first.
    this.addDocument(id, text, meta);
  }

  /**
   * @param {*} id - Document identifier.
   * @returns {boolean} Whether `id` is currently indexed.
   */
  hasDocument(id) {
    return this._idIndex.has(id);
  }

  /** Recompute `totalDocs` and `avgLength` from `_idIndex` and `_totalLength`. */
  _refreshStats() {
    this.totalDocs = this._idIndex.size;
    this.avgLength = this.totalDocs > 0 ? this._totalLength / this.totalDocs : 0;
  }

  /** Drop tombstones from `documents` and rebuild the id → index map. */
  _compact() {
    const alive = this.documents.filter((doc) => doc !== null);
    this.documents = alive;
    this._idIndex.clear();
    alive.forEach((doc, i) => {
      this._idIndex.set(doc.id, i);
    });
  }

  /**
   * Score every live document against `query` and return the best matches.
   *
   * A repeated query token contributes once per occurrence. Documents with
   * a score of 0 (no query token present) are omitted.
   *
   * @param {string} query - Raw query text, split into tokens with `tokenize`.
   * @param {number} [limit=20] - Maximum number of hits to return.
   * @returns {Array<{id: *, score: number, meta: object}>} Hits sorted by descending score.
   */
  search(query, limit = 20) {
    const queryTokens = tokenize(query);
    if (queryTokens.length === 0) {
      return [];
    }

    // IDF depends only on the token and the corpus, so compute it once per query token.
    const terms = queryTokens.map((token) => {
      const df = this.docFreq[token] ?? 0;
      const idf = Math.log((this.totalDocs - df + 0.5) / (df + 0.5) + 1);
      return { token, idf };
    });

    const hits = [];
    for (const doc of this.documents) {
      if (!doc) {
        continue; // tombstone
      }
      // Length normalisation is the same for every term of this document.
      const lengthNorm = BM25_K1 * (1 - BM25_B + BM25_B * (doc.length / this.avgLength));
      let score = 0;
      for (const { token, idf } of terms) {
        const tf = doc.tokenFreq[token] ?? 0;
        if (tf === 0) {
          continue;
        }
        score += idf * ((tf * (BM25_K1 + 1)) / (tf + lengthNorm));
      }
      if (score > 0) {
        hits.push({ id: doc.id, score, meta: doc.meta });
      }
    }

    hits.sort((a, b) => b.score - a.score);
    return hits.slice(0, limit);
  }

  /** Remove every document and reset all statistics. */
  clear() {
    this.documents = [];
    this.docFreq = Object.create(null);
    this.totalDocs = 0;
    this.avgLength = 0;
    this._totalLength = 0;
    this._idIndex.clear();
  }
}