/*
 * i18n-ai-translate
 * Version:
 * AI-powered localization CLI, Node library, and GitHub Action. Translate i18next JSON, Gettext PO, Java .properties, and iOS .strings with ChatGPT, Claude, Gemini, or local Ollama models.
 * 524 lines (454 loc) • 16.6 kB
 * text/typescript
 */
import {
DEFAULT_BATCH_SIZE,
DEFAULT_REQUEST_TOKENS,
DEFAULT_TEMPLATED_STRING_PREFIX,
DEFAULT_TEMPLATED_STRING_SUFFIX,
FLATTEN_DELIMITER,
} from "./constants";
import { distance } from "fastest-levenshtein";
import { flatten, unflatten } from "flat";
import { getCachedTranslation, setCachedTranslation } from "./cache";
import {
isValidLanguageCode,
printExecutionTime,
printInfo,
resolveLanguageCode,
} from "./utils";
import ChatPool from "./chat_pool";
import GenerateTranslationJSON from "./generate_json/generate";
import PromptMode from "./enums/prompt_mode";
import RateLimiter from "./rate_limiter";
import translateCSV from "./generate_csv/generate";
import type { TranslationStats, TranslationStatsItem } from "./types";
import type TranslateDiffOptions from "./interfaces/translate_diff_options";
import type TranslateOptions from "./interfaces/translate_options";
import type TranslationContext from "./interfaces/translation_context";
/**
 * Resolve the chat pool to use for a translation run.
 *
 * A pool supplied by the caller on `options.pool` is returned as-is.
 * Otherwise a new pool is built with `ChatPool.create`, using
 * `options.rateLimiter` when present or a newly constructed `RateLimiter`.
 *
 * @param options - The translation options
 * @returns The caller's pool, or a newly created one
 */
function getPool(options: TranslateOptions): ChatPool {
    // Callers running several languages in parallel (typically
    // cli_translate.ts) pass one pool for all of them so that the TPM
    // budget is genuinely shared; building a fresh pool here would hand
    // each language its own limiter and defeat the cap.
    const { pool: callerPool } = options;
    if (callerPool) {
        return callerPool;
    }

    const { apiKey, chatParams, engine, host, model } = options;

    const limiter =
        options.rateLimiter ??
        new RateLimiter(
            options.rateLimitMs,
            options.verbose as boolean,
            options.tokensPerMinute,
        );

    // A missing, zero or negative concurrency still yields one worker.
    const workerCount = Math.max(1, options.concurrency ?? 1);

    return ChatPool.create({
        apiKey,
        chatParams,
        concurrency: workerCount,
        engine,
        host,
        model,
        rateLimiter: limiter,
    });
}
/**
 * Replace every newline in the flattened values with a `NEWLINE` token
 * wrapped in the templated-string prefix and suffix. The map is mutated
 * in place.
 *
 * @param templatedStringPrefix - Text placed before `NEWLINE` in the token
 * @param templatedStringSuffix - Text placed after `NEWLINE` in the token
 * @param flatInput - Flattened key/value map to rewrite
 */
function replaceNewlinesWithPlaceholder(
    templatedStringPrefix: string,
    templatedStringSuffix: string,
    flatInput: { [key: string]: string },
): void {
    const newlineToken = `${templatedStringPrefix}NEWLINE${templatedStringSuffix}`;

    for (const key of Object.keys(flatInput)) {
        flatInput[key] = flatInput[key].replaceAll("\n", newlineToken);
    }
}
/**
 * Replace every `NEWLINE` token (wrapped in the templated-string prefix
 * and suffix) in the values with a real newline. The map is mutated in
 * place.
 *
 * @param templatedStringPrefix - Text that precedes `NEWLINE` in the token
 * @param templatedStringSuffix - Text that follows `NEWLINE` in the token
 * @param sortedOutput - Flattened key/value map to rewrite
 */
function replacePlaceholderWithNewLines(
    templatedStringPrefix: string,
    templatedStringSuffix: string,
    sortedOutput: { [key: string]: string },
): void {
    const newlineToken = `${templatedStringPrefix}NEWLINE${templatedStringSuffix}`;

    for (const key of Object.keys(sortedOutput)) {
        sortedOutput[key] = sortedOutput[key].replaceAll(newlineToken, "\n");
    }
}
/**
 * Cluster entries whose values are close in edit distance, shuffle the
 * clusters, and rebuild the flat map in the shuffled cluster order.
 *
 * An entry joins the first existing group that contains at least one
 * value whose Levenshtein distance to it, divided by the longer of the
 * two lengths, is below 0.3; otherwise it starts a new group.
 *
 * @param flatInput - Flattened key/value map to group
 * @returns The rebuilt flat map and the shuffled list of groups
 */
function groupSimilarValues(flatInput: { [key: string]: string }): {
    flatInput: { [key: string]: string };
    groups: Array<{ [key: string]: string }>;
} {
    const isClose = (candidate: string, member: string): boolean =>
        distance(candidate, member) /
            Math.max(candidate.length, member.length) <
        0.3;

    const groups: Array<{ [key: string]: string }> = [];
    for (const key of Object.keys(flatInput)) {
        const value = flatInput[key];

        let home: { [key: string]: string } | undefined;
        for (const group of groups) {
            if (Object.values(group).some((member) => isClose(value, member))) {
                home = group;
                break;
            }
        }

        if (home) {
            home[key] = value;
        } else {
            groups.push({ [key]: value });
        }
    }

    // Fisher-Yates shuffle of the groups, walking from the tail.
    for (let last = groups.length - 1; last >= 1; last -= 1) {
        const pick = Math.floor(Math.random() * (last + 1));
        const held = groups[last];
        groups[last] = groups[pick];
        groups[pick] = held;
    }

    // Rebuild the flat map so key order follows the shuffled groups.
    const regrouped: { [key: string]: string } = {};
    for (const group of groups) {
        Object.assign(regrouped, group);
    }

    return { flatInput: regrouped, groups };
}
/**
 * Create a statistics record with every counter, and the batch start
 * time, set to zero.
 *
 * @returns A new zeroed statistics item
 */
function startTranslationStatsItem(): TranslationStatsItem {
    const zeroed = {
        batchStartTime: 0,
        enqueuedItems: 0,
        processedItems: 0,
        processedTokens: 0,
        totalItems: 0,
        totalTokens: 0,
    };
    return zeroed as TranslationStatsItem;
}
/**
 * Create the statistics container for a run: one item for the
 * `translate` phase and a separate one for the `verify` phase, each
 * produced by `startTranslationStatsItem`.
 *
 * @returns A new statistics container
 */
function startTranslationStats(): TranslationStats {
    const translate = startTranslationStatsItem();
    const verify = startTranslationStatsItem();
    return { translate, verify } as TranslationStats;
}
/**
 * Run the translation using the strategy selected by
 * `ctx.options.promptMode`.
 *
 * @param ctx - The translation context (input, groups, options, pool, stats)
 * @returns The translated key/value map produced by the selected strategy
 * @throws Error if the prompt mode is neither JSON nor CSV
 */
async function getTranslation(
    ctx: TranslationContext,
): Promise<{ [key: string]: string }> {
    const { promptMode, verbose } = ctx.options;

    if (verbose) {
        printInfo(`Translation prompting mode: ${promptMode}\n`);
    }

    if (promptMode === PromptMode.JSON) {
        return new GenerateTranslationJSON(ctx.options).translateJSON(ctx);
    }

    if (promptMode === PromptMode.CSV) {
        return translateCSV(ctx);
    }

    throw new Error("Prompt mode is not set");
}
/**
 * Fill in default values on the options object, in place.
 *
 * Every option except `continueOnError` is defaulted whenever its
 * current value is falsy (so `0` and `""` are replaced as well).
 * `continueOnError` is defaulted to `true` only when it is exactly
 * `undefined`, which keeps an explicit `false` intact.
 *
 * @param options - The translation options to complete
 */
function setDefaults(options: TranslateOptions): void {
    options.templatedStringPrefix ||= DEFAULT_TEMPLATED_STRING_PREFIX;
    options.templatedStringSuffix ||= DEFAULT_TEMPLATED_STRING_SUFFIX;
    options.batchMaxTokens ||= DEFAULT_REQUEST_TOKENS;
    options.batchSize ||= DEFAULT_BATCH_SIZE;

    options.verbose ||= false;
    options.ensureChangedTranslation ||= false;
    options.skipTranslationVerification ||= false;
    options.skipStylingVerification ||= false;

    if (options.continueOnError === undefined) {
        options.continueOnError = true;
    }
}
/**
 * Translate the input JSON to the given language.
 *
 * The input is flattened, newlines are swapped for a placeholder token,
 * identical source strings are collapsed onto one canonical key and,
 * when a cache is supplied, strings already in the cache are removed
 * from the work set. The remainder is passed to `getTranslation`; its
 * result is merged with the cache hits, copied back to the
 * de-duplicated keys, sorted by key, and unflattened.
 *
 * @param options - The options for the translation. Mutated in place:
 * defaults are filled in and language names are replaced by the codes
 * they resolve to.
 * @returns The translated content, unflattened, with keys in sorted order
 * @throws Error if the input or output language code is invalid
 */
export async function translate(options: TranslateOptions): Promise<Object> {
    setDefaults(options);

    // Accept both codes and English language names. If a user passed
    // "English" we normalise to "en" and note the substitution so they
    // know it happened.
    const resolvedInput = resolveLanguageCode(options.inputLanguageCode);
    if (resolvedInput !== options.inputLanguageCode) {
        if (options.verbose) {
            printInfo(
                `Interpreted '${options.inputLanguageCode}' as '${resolvedInput}'`,
            );
        }

        options.inputLanguageCode = resolvedInput;
    }

    const resolvedOutput = resolveLanguageCode(options.outputLanguageCode);
    if (resolvedOutput !== options.outputLanguageCode) {
        if (options.verbose) {
            printInfo(
                `Interpreted '${options.outputLanguageCode}' as '${resolvedOutput}'`,
            );
        }

        options.outputLanguageCode = resolvedOutput;
    }

    // Validate the input and output languages are valid
    if (!isValidLanguageCode(options.inputLanguageCode)) {
        throw new Error(
            `Invalid input language code: ${options.inputLanguageCode}`,
        );
    }

    if (!isValidLanguageCode(options.outputLanguageCode)) {
        throw new Error(
            `Invalid output language code: ${options.outputLanguageCode}`,
        );
    }

    if (options.verbose) {
        printInfo(
            `Translating from ${options.inputLanguageCode} to ${options.outputLanguageCode}...`,
        );
    }

    const pool = getPool(options);

    let flatInput = flatten(options.inputJSON, {
        delimiter: FLATTEN_DELIMITER,
    }) as {
        [key: string]: string;
    };

    replaceNewlinesWithPlaceholder(
        options.templatedStringPrefix as string,
        options.templatedStringSuffix as string,
        flatInput,
    );

    // In-file de-duplication: bucket keys by their source string and keep
    // only the first key of each bucket in the work set.
    //
    // Maps are used instead of plain objects because the bucket keys are
    // arbitrary user text. On a plain object a source string such as
    // "constructor", "toString" or "__proto__" resolves to an inherited
    // Object.prototype member, so the bucket is never created and the
    // push throws.
    const keysBySource = new Map<string, string[]>();
    for (const [k, v] of Object.entries(flatInput)) {
        const bucket = keysBySource.get(v);
        if (bucket) {
            bucket.push(k);
        } else {
            keysBySource.set(v, [k]);
        }
    }

    const canonicalToDupes = new Map<string, string[]>();
    for (const keys of keysBySource.values()) {
        if (keys.length > 1) {
            const [canonical, ...dupes] = keys;
            canonicalToDupes.set(canonical, dupes);
            for (const k of dupes) {
                delete flatInput[k];
            }
        }
    }

    if (options.verbose) {
        for (const [canonical, dupes] of canonicalToDupes) {
            printInfo(
                `De-duplicating ${canonical}\n=>\n${dupes.join("\n")}\n\n`,
            );
        }
    }

    // Translation memory: pull any source string already in the cache
    // out of the work set so only misses reach the model. This extends
    // the in-file de-duplication above across runs and files. Hits are
    // merged back into the output below; misses are recorded after.
    const { cache } = options;
    const cachedOutput: Record<string, string> = {};
    const missSourceByKey: Record<string, string> = {};
    if (cache) {
        for (const [key, source] of Object.entries(flatInput)) {
            const hit = getCachedTranslation(
                cache,
                options.inputLanguageCode,
                options.outputLanguageCode,
                options.context ?? "",
                source,
            );
            if (hit !== undefined) {
                cachedOutput[key] = hit;
                delete flatInput[key];
            } else {
                missSourceByKey[key] = source;
            }
        }

        if (options.verbose) {
            printInfo(
                `Cache: ${Object.keys(cachedOutput).length} hit(s), ${
                    Object.keys(missSourceByKey).length
                } miss(es)`,
            );
        }
    }

    const grouped = groupSimilarValues(flatInput);
    flatInput = grouped.flatInput;

    const translationStats = startTranslationStats();

    const translated = await getTranslation({
        flatInput,
        groups: grouped.groups,
        options,
        pool,
        stats: translationStats,
    });

    // Record freshly translated strings so the next run can reuse them.
    // A key may be absent from `translated`, hence the undefined check.
    if (cache) {
        for (const [key, source] of Object.entries(missSourceByKey)) {
            const value = translated[key];
            if (value !== undefined) {
                setCachedTranslation(
                    cache,
                    options.inputLanguageCode,
                    options.outputLanguageCode,
                    options.context ?? "",
                    source,
                    value,
                );
            }
        }
    }

    const output = { ...cachedOutput, ...translated };

    // Copy each canonical translation back to the keys that were
    // de-duplicated onto it. If the canonical key has no translation the
    // duplicates are left out too: assigning `undefined` would make the
    // placeholder restoration below throw and discard the whole result.
    for (const [canonical, dupes] of canonicalToDupes) {
        const canonicalTranslation = output[canonical];
        if (canonicalTranslation !== undefined) {
            for (const k of dupes) {
                output[k] = canonicalTranslation;
            }
        }
    }

    const sortedOutput: Record<string, string> = {};
    for (const key of Object.keys(output).sort()) {
        sortedOutput[key] = output[key];
    }

    replacePlaceholderWithNewLines(
        options.templatedStringPrefix as string,
        options.templatedStringSuffix as string,
        sortedOutput,
    );

    const unflattenedOutput = unflatten(sortedOutput, {
        delimiter: FLATTEN_DELIMITER,
    });

    if (options.verbose) {
        printExecutionTime(
            translationStats.translate.batchStartTime,
            "Total execution time: ",
        );
    }

    return unflattenedOutput as Object;
}
/**
 * Translate the difference of an input JSON to the given languages.
 *
 * The before/after source files are flattened and compared to find
 * added, modified and deleted keys. For every language in
 * `options.toUpdateJSONs`, deleted keys are removed, added and modified
 * keys are sent through `translate`, and the result is merged over the
 * existing translations and sorted by key. Languages are processed one
 * after another.
 *
 * @param options - The options for the translation
 * @returns The updated, unflattened translations keyed by language
 */
export async function translateDiff(
    options: TranslateDiffOptions,
): Promise<{ [language: string]: Object }> {
    type FlatStrings = { [key: string]: string };

    const flattenStrings = (json: unknown): FlatStrings =>
        flatten(json, { delimiter: FLATTEN_DELIMITER }) as FlatStrings;

    const sourceBefore = flattenStrings(options.inputJSONBefore);
    const sourceAfter = flattenStrings(options.inputJSONAfter);

    const existingByLanguage: { [language: string]: FlatStrings } = {};
    for (const lang in options.toUpdateJSONs) {
        if (Object.prototype.hasOwnProperty.call(options.toUpdateJSONs, lang)) {
            existingByLanguage[lang] = flattenStrings(
                options.toUpdateJSONs[lang],
            );
        }
    }

    // Classify every key of the old source as modified or deleted ...
    const modifiedKeys: string[] = [];
    const deletedKeys: string[] = [];
    for (const key of Object.keys(sourceBefore)) {
        const unchanged = sourceBefore[key] === sourceAfter[key];
        if (!unchanged) {
            const bucket =
                sourceAfter[key] === undefined ? deletedKeys : modifiedKeys;
            bucket.push(key);
        }
    }

    // ... and every key that only the new source defines as added.
    const addedKeys: string[] = Object.keys(sourceAfter).filter(
        (key) => sourceBefore[key] === undefined,
    );

    if (options.verbose) {
        printInfo(`Added keys: ${addedKeys.join("\n")}\n`);
        printInfo(`Modified keys: ${modifiedKeys.join("\n")}\n`);
        printInfo(`Deleted keys: ${deletedKeys.join("\n")}\n`);
    }

    for (const key of deletedKeys) {
        for (const lang of Object.keys(existingByLanguage)) {
            delete existingByLanguage[lang][key];
        }
    }

    const translatedJSONs: { [language: string]: FlatStrings } = {};
    for (const languageCode of Object.keys(existingByLanguage)) {
        // Start from the existing per-language map (already stripped of
        // deleted keys) so untouched translations survive. Holding only
        // the delta here would wipe every pre-existing key when the
        // result is written to disk.
        const merged: FlatStrings = { ...existingByLanguage[languageCode] };

        // Source strings that need a fresh translation: added keys
        // first, then modified keys.
        const pending: FlatStrings = {};
        for (const key of [...addedKeys, ...modifiedKeys]) {
            pending[key] = sourceAfter[key];
        }

        // eslint-disable-next-line no-await-in-loop
        const translated = await translate({
            ...options,
            inputJSON: pending,
            outputLanguageCode: languageCode,
        });

        // New translations win over whatever the language had before.
        Object.assign(merged, flattenStrings(translated));

        // Sort the keys
        const sorted: FlatStrings = {};
        for (const key of Object.keys(merged).sort()) {
            sorted[key] = merged[key];
        }

        translatedJSONs[languageCode] = sorted;

        if (options.onLanguageComplete) {
            const unflattened = unflatten(sorted, {
                delimiter: FLATTEN_DELIMITER,
            }) as Object;
            options.onLanguageComplete(languageCode, unflattened, sorted);
        }
    }

    const unflatToUpdateJSONs: { [language: string]: Object } = {};
    for (const [lang, flatJSON] of Object.entries(translatedJSONs)) {
        unflatToUpdateJSONs[lang] = unflatten(flatJSON, {
            delimiter: FLATTEN_DELIMITER,
        });
    }

    return unflatToUpdateJSONs;
}