/*
 * @awesome-fe/translate
 * Version:
 * Translation utils
 * 83 lines • 3.42 kB
 * JavaScript
 */
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.VectorizerEngine = void 0;
const translation_engine_1 = require("./translation-engine");
const sentence_formatter_1 = require("./sentence-formatter");
const fs_1 = require("fs");
const path_1 = require("path");
const get_new_filename_for_1 = require("./get-new-filename-for");
const get_distance_1 = require("../utils/get-distance");
const common_1 = require("../dom/common");
const create_embedding_1 = require("../utils/create-embedding");
/**
 * Translation engine that leaves the sentences untouched and instead records,
 * for every already-translated sentence pair, the embedding vectors of both
 * sides and the distance between them. The collected entries are written to a
 * `.vector.json` file in `tearDown()` when a `dict` option is configured.
 */
class VectorizerEngine extends translation_engine_1.TranslationEngine {
    /** Engine options; this class reads `dict` and `cwd` from it. */
    options;
    /**
     * @param {object} [options={}] Engine options (`dict`, `cwd`).
     */
    constructor(options = {}) {
        super();
        this.options = options;
    }
    /** Entries collected by `batchTranslate()` since the last `setup()`. */
    _entries = [];
    /**
     * @returns {Array<object>} The entries collected so far (the live array, not a copy).
     */
    get entries() {
        return this._entries;
    }
    /**
     * Computes the name of the vector dictionary file for the current file.
     *
     * @returns {string|undefined} The result of `getNewFilenameFor(...)` with the
     *   `.vector.json` extension, or `undefined` when no `dict` option is set.
     */
    getDictFilename() {
        const dict = this.options.dict;
        if (!dict) {
            return undefined;
        }
        // NOTE(review): `currentFile` is not assigned in this class; presumably the
        // base class sets it in `setup()` — confirm.
        return (0, get_new_filename_for_1.getNewFilenameFor)(this.currentFile, this.options.cwd, dict, '.vector.json');
    }
    /**
     * Records embedding vectors for the translated sentence pairs in `sentences`.
     *
     * @param {Array<[string, string]>} sentences Pairs of `[original, translation]`.
     * @param {*} format Forwarded as-is to `SentenceFormatter.toMarkdown`.
     * @returns {Promise<Array<[string, string]>>} The same `sentences` array, unmodified.
     */
    async batchTranslate(sentences, format) {
        // Skip the work when the dictionary file already exists. Only query the
        // file system when a dictionary filename is actually configured, so that
        // `existsSync` is never called with `undefined`.
        const dictFilename = this.getDictFilename();
        if (dictFilename && (0, fs_1.existsSync)(dictFilename)) {
            return sentences;
        }
        // Only sentences that already have a translation result are vectorized.
        const sentencePairs = sentences
            .filter(([original, translation]) => !!translation
            && (0, common_1.containsChinese)(translation)
            && !original.includes('<div className="breadcrumb-container">'))
            .map(([original, translation]) => [
            sentence_formatter_1.SentenceFormatter.toMarkdown(original.trim(), format),
            sentence_formatter_1.SentenceFormatter.toMarkdown(translation.trim(), format),
        ]);
        if (!sentencePairs.length) {
            return sentences;
        }
        const originalVectors = await this.getVectors(sentencePairs.map(([english]) => english));
        const translatedVectors = await this.getVectors(sentencePairs.map(([, chinese]) => chinese));
        sentencePairs.forEach(([english, chinese], index) => {
            const englishVector = originalVectors[index];
            const chineseVector = translatedVectors[index];
            const distance = (0, get_distance_1.getDistance)(englishVector, chineseVector);
            // Report pairs whose vectors are further apart than 0.2.
            if (distance > 0.2) {
                console.warn(`distance: ${distance}, english: ${english}, chinese: ${chinese}`);
            }
            this._entries.push({
                english,
                englishVector,
                chinese,
                chineseVector,
                distance,
            });
        });
        // Return the input untouched: this engine must not change the translation result.
        return sentences;
    }
    /**
     * Runs the base-class setup, then clears the collected entries.
     *
     * @param {*} currentFile Forwarded to `super.setup()`.
     */
    async setup(currentFile) {
        await super.setup(currentFile);
        this._entries = [];
    }
    /**
     * Writes the collected entries as JSON to the dictionary file, creating its
     * directory if needed. Does nothing when no dictionary filename is configured
     * or no entries were collected.
     */
    async tearDown() {
        const dictFilename = this.getDictFilename();
        if (!dictFilename || !this.entries.length) {
            return;
        }
        (0, fs_1.mkdirSync)((0, path_1.dirname)(dictFilename), { recursive: true });
        (0, fs_1.writeFileSync)(dictFilename, JSON.stringify(this.entries), 'utf8');
    }
    /**
     * Creates embeddings for the given texts by delegating to `createEmbedding`.
     *
     * @param {string[]} texts Texts to embed.
     * @returns {Promise<*>} Whatever `createEmbedding(texts)` resolves to; it is
     *   indexed per text by `batchTranslate()`.
     */
    async getVectors(texts) {
        return (0, create_embedding_1.createEmbedding)(texts);
    }
}
exports.VectorizerEngine = VectorizerEngine;
//# sourceMappingURL=vectorizer-engine.js.map