UNPKG

i18n-ai-translate

Version:

AI-powered localization CLI, Node library, and GitHub Action. Translate i18next JSON, Gettext PO, Java .properties, and iOS .strings with ChatGPT, Claude, Gemini, or local Ollama models.

524 lines (454 loc) 16.6 kB
import {
    DEFAULT_BATCH_SIZE,
    DEFAULT_REQUEST_TOKENS,
    DEFAULT_TEMPLATED_STRING_PREFIX,
    DEFAULT_TEMPLATED_STRING_SUFFIX,
    FLATTEN_DELIMITER,
} from "./constants";
import { distance } from "fastest-levenshtein";
import { flatten, unflatten } from "flat";
import { getCachedTranslation, setCachedTranslation } from "./cache";
import {
    isValidLanguageCode,
    printExecutionTime,
    printInfo,
    resolveLanguageCode,
} from "./utils";
import ChatPool from "./chat_pool";
import GenerateTranslationJSON from "./generate_json/generate";
import PromptMode from "./enums/prompt_mode";
import RateLimiter from "./rate_limiter";
import translateCSV from "./generate_csv/generate";
import type { TranslationStats, TranslationStatsItem } from "./types";
import type TranslateDiffOptions from "./interfaces/translate_diff_options";
import type TranslateOptions from "./interfaces/translate_options";
import type TranslationContext from "./interfaces/translation_context";

/**
 * Return the chat pool to use for this translation run.
 *
 * A caller-supplied `options.pool` is returned as-is; otherwise a new pool is
 * created from the engine/model/credentials in `options`.
 * @param options - The options for the translation
 * @returns The pool supplied by the caller, or a freshly created one
 */
function getPool(options: TranslateOptions): ChatPool {
    // When the caller (typically cli_translate.ts in language-concurrent
    // mode) supplies its own pool, reuse it. This is what makes the
    // shared TPM budget actually shared across parallel languages — a
    // fresh pool here would give each language its own limiter and
    // defeat the cap.
    if (options.pool) return options.pool;

    // Prefer a caller-supplied limiter; otherwise build one from the options.
    const rateLimiter =
        options.rateLimiter ??
        new RateLimiter(
            options.rateLimitMs,
            options.verbose as boolean,
            options.tokensPerMinute,
        );

    return ChatPool.create({
        apiKey: options.apiKey,
        chatParams: options.chatParams,
        // Never create a pool with fewer than one worker.
        concurrency: Math.max(1, options.concurrency ?? 1),
        engine: options.engine,
        host: options.host,
        model: options.model,
        rateLimiter,
    });
}

/**
 * Replace every "\n" in each value of `flatInput` with the token
 * `<prefix>NEWLINE<suffix>`. Mutates `flatInput` in place.
 * @param templatedStringPrefix - Text placed before `NEWLINE` in the token
 * @param templatedStringSuffix - Text placed after `NEWLINE` in the token
 * @param flatInput - Flattened key/value map to rewrite in place
 */
function replaceNewlinesWithPlaceholder(
    templatedStringPrefix: string,
    templatedStringSuffix: string,
    flatInput: { [key: string]: string },
): void {
    const placeholder = `${templatedStringPrefix}NEWLINE${templatedStringSuffix}`;
    for (const key of Object.keys(flatInput)) {
        flatInput[key] = flatInput[key].replaceAll("\n", placeholder);
    }
}

/**
 * Inverse of `replaceNewlinesWithPlaceholder`: turn every
 * `<prefix>NEWLINE<suffix>` token in each value back into "\n".
 * Mutates `sortedOutput` in place.
 * @param templatedStringPrefix - Text placed before `NEWLINE` in the token
 * @param templatedStringSuffix - Text placed after `NEWLINE` in the token
 * @param sortedOutput - Flattened key/value map to rewrite in place
 */
function replacePlaceholderWithNewLines(
    templatedStringPrefix: string,
    templatedStringSuffix: string,
    sortedOutput: { [key: string]: string },
): void {
    const placeholder = `${templatedStringPrefix}NEWLINE${templatedStringSuffix}`;
    for (const key of Object.keys(sortedOutput)) {
        sortedOutput[key] = sortedOutput[key].replaceAll(placeholder, "\n");
    }
}

/**
 * Cluster entries whose values are textually similar, shuffle the clusters,
 * and rebuild the flat map in the shuffled cluster order.
 *
 * An entry joins the first existing group containing a value whose
 * Levenshtein distance to it, divided by the longer of the two lengths, is
 * below 0.3; otherwise it starts a new group.
 * @param flatInput - Flattened key/value map to group (not mutated)
 * @returns The re-ordered flat map and the list of groups
 */
function groupSimilarValues(flatInput: { [key: string]: string }): {
    flatInput: { [key: string]: string };
    groups: Array<{ [key: string]: string }>;
} {
    const groups: Array<{ [key: string]: string }> = [];

    for (const [key, val] of Object.entries(flatInput)) {
        const existingGroup = groups.find((group) =>
            Object.values(group).some((entry) => {
                // Two empty strings give 0/0 = NaN, which compares false
                // and therefore never groups.
                const distPercent =
                    distance(val, entry) / Math.max(val.length, entry.length);
                return distPercent < 0.3;
            }),
        );

        if (existingGroup) {
            existingGroup[key] = val;
        } else {
            groups.push({ [key]: val });
        }
    }

    // Fisher–Yates shuffle of the group order.
    for (let i = groups.length - 1; i > 0; i--) {
        const j = Math.floor(Math.random() * (i + 1));
        [groups[i], groups[j]] = [groups[j], groups[i]];
    }

    // Rebuild the flat map so insertion order follows the shuffled groups.
    const regrouped: { [key: string]: string } = {};
    for (const groupObj of groups) {
        for (const [k, v] of Object.entries(groupObj)) {
            regrouped[k] = v;
        }
    }

    return { flatInput: regrouped, groups };
}

/**
 * Create a statistics record with every counter set to zero.
 * @returns A zeroed statistics item
 */
function startTranslationStatsItem(): TranslationStatsItem {
    return {
        batchStartTime: 0,
        enqueuedItems: 0,
        processedItems: 0,
        processedTokens: 0,
        totalItems: 0,
        totalTokens: 0,
    } as TranslationStatsItem;
}

/**
 * Create the statistics container with zeroed `translate` and `verify`
 * records.
 * @returns A fresh statistics container
 */
function startTranslationStats(): TranslationStats {
    return {
        translate: startTranslationStatsItem(),
        verify: startTranslationStatsItem(),
    } as TranslationStats;
}

/**
 * Dispatch the translation to the generator selected by
 * `ctx.options.promptMode`.
 * @param ctx - The translation context (input, groups, options, pool, stats)
 * @returns The flat key/value map produced by the selected generator
 * @throws Error if the prompt mode is neither JSON nor CSV
 */
async function getTranslation(
    ctx: TranslationContext,
): Promise<{ [key: string]: string }> {
    if (ctx.options.verbose) {
        printInfo(`Translation prompting mode: ${ctx.options.promptMode}\n`);
    }

    switch (ctx.options.promptMode) {
        case PromptMode.JSON: {
            const generateTranslationJSON = new GenerateTranslationJSON(
                ctx.options,
            );
            return generateTranslationJSON.translateJSON(ctx);
        }

        case PromptMode.CSV:
            return translateCSV(ctx);

        default:
            throw new Error("Prompt mode is not set");
    }
}

/**
 * Fill in defaults for unset options. Mutates `options` in place.
 *
 * Falsy values (including `0` and `""`) are treated as unset for every field
 * except `continueOnError`, which defaults to `true` only when `undefined`.
 * @param options - The options for the translation
 */
function setDefaults(options: TranslateOptions): void {
    if (!options.templatedStringPrefix)
        options.templatedStringPrefix = DEFAULT_TEMPLATED_STRING_PREFIX;
    if (!options.templatedStringSuffix)
        options.templatedStringSuffix = DEFAULT_TEMPLATED_STRING_SUFFIX;
    if (!options.batchMaxTokens)
        options.batchMaxTokens = DEFAULT_REQUEST_TOKENS;
    if (!options.batchSize) options.batchSize = DEFAULT_BATCH_SIZE;
    if (!options.verbose) options.verbose = false;
    if (!options.ensureChangedTranslation)
        options.ensureChangedTranslation = false;
    if (!options.skipTranslationVerification)
        options.skipTranslationVerification = false;
    if (!options.skipStylingVerification)
        options.skipStylingVerification = false;
    // An explicit `false` must survive, so only default when undefined.
    if (options.continueOnError === undefined) options.continueOnError = true;
}

/**
 * Collapse entries that share the same source string so each distinct string
 * is translated only once.
 *
 * The first key seen for a value becomes its canonical key; every later key
 * with the same value is deleted from `flatInput` (mutated in place).
 * @param flatInput - Flattened key/value map to de-duplicate in place
 * @returns Map from canonical key to the duplicate keys that were removed
 */
function collapseDuplicateValues(flatInput: {
    [key: string]: string;
}): Map<string, string[]> {
    // A Map is required here: the bucket key is arbitrary user text, and a
    // plain object would resolve strings such as "constructor" or "toString"
    // to inherited Object.prototype members instead of a fresh bucket.
    const keysByValue = new Map<string, string[]>();
    for (const [key, value] of Object.entries(flatInput)) {
        const bucket = keysByValue.get(value);
        if (bucket) {
            bucket.push(key);
        } else {
            keysByValue.set(value, [key]);
        }
    }

    const canonicalToDupes = new Map<string, string[]>();
    for (const [canonical, ...dupes] of keysByValue.values()) {
        if (dupes.length > 0) {
            canonicalToDupes.set(canonical, dupes);
            for (const dupe of dupes) {
                delete flatInput[dupe];
            }
        }
    }

    return canonicalToDupes;
}

/**
 * Return a copy of `map` whose keys are inserted in sorted order.
 * @param map - Flat key/value map to copy
 * @returns A new object with the same entries, keys sorted ascending
 */
function sortByKey(map: { [key: string]: string }): {
    [key: string]: string;
} {
    const sorted: { [key: string]: string } = {};
    for (const key of Object.keys(map).sort()) {
        sorted[key] = map[key];
    }

    return sorted;
}

/**
 * Translate the input JSON to the given language
 * @param options - The options for the translation
 * @returns The translated JSON, unflattened, with keys sorted
 * @throws Error if the input or output language code is invalid
 */
export async function translate(options: TranslateOptions): Promise<Object> {
    setDefaults(options);

    // Accept both codes and English language names. If a user passed
    // "English" we normalise to "en" and note the substitution so they
    // know it happened.
    const resolvedInput = resolveLanguageCode(options.inputLanguageCode);
    if (resolvedInput !== options.inputLanguageCode) {
        if (options.verbose) {
            printInfo(
                `Interpreted '${options.inputLanguageCode}' as '${resolvedInput}'`,
            );
        }

        options.inputLanguageCode = resolvedInput;
    }

    const resolvedOutput = resolveLanguageCode(options.outputLanguageCode);
    if (resolvedOutput !== options.outputLanguageCode) {
        if (options.verbose) {
            printInfo(
                `Interpreted '${options.outputLanguageCode}' as '${resolvedOutput}'`,
            );
        }

        options.outputLanguageCode = resolvedOutput;
    }

    // Validate the input and output languages are valid
    if (!isValidLanguageCode(options.inputLanguageCode)) {
        throw new Error(
            `Invalid input language code: ${options.inputLanguageCode}`,
        );
    }

    if (!isValidLanguageCode(options.outputLanguageCode)) {
        throw new Error(
            `Invalid output language code: ${options.outputLanguageCode}`,
        );
    }

    if (options.verbose) {
        printInfo(
            `Translating from ${options.inputLanguageCode} to ${options.outputLanguageCode}...`,
        );
    }

    const pool = getPool(options);

    let flatInput = flatten(options.inputJSON, {
        delimiter: FLATTEN_DELIMITER,
    }) as {
        [key: string]: string;
    };

    replaceNewlinesWithPlaceholder(
        options.templatedStringPrefix as string,
        options.templatedStringSuffix as string,
        flatInput,
    );

    // In-file de-duplication: identical source strings are translated once
    // and the result is copied back to the duplicate keys further down.
    const canonicalToDupes = collapseDuplicateValues(flatInput);

    if (options.verbose) {
        for (const [canonical, dupes] of canonicalToDupes) {
            printInfo(
                `De-duplicating ${canonical}\n=>\n${dupes.join("\n")}\n\n`,
            );
        }
    }

    // Translation memory: pull any source string already in the cache
    // out of the work set so only misses reach the model. This extends
    // the in-file de-duplication above across runs and files. Hits are
    // merged back into the output below; misses are recorded after.
    const { cache } = options;
    const cachedOutput: Record<string, string> = {};
    const missSourceByKey: Record<string, string> = {};
    if (cache) {
        for (const [key, source] of Object.entries(flatInput)) {
            const hit = getCachedTranslation(
                cache,
                options.inputLanguageCode,
                options.outputLanguageCode,
                options.context ?? "",
                source,
            );
            if (hit !== undefined) {
                cachedOutput[key] = hit;
                delete flatInput[key];
            } else {
                missSourceByKey[key] = source;
            }
        }

        if (options.verbose) {
            printInfo(
                `Cache: ${Object.keys(cachedOutput).length} hit(s), ${
                    Object.keys(missSourceByKey).length
                } miss(es)`,
            );
        }
    }

    const grouped = groupSimilarValues(flatInput);
    flatInput = grouped.flatInput;

    const translationStats = startTranslationStats();

    const translated = await getTranslation({
        flatInput,
        groups: grouped.groups,
        options,
        pool,
        stats: translationStats,
    });

    // Record freshly translated strings so the next run can reuse them.
    if (cache) {
        for (const [key, source] of Object.entries(missSourceByKey)) {
            const value = translated[key];
            if (value !== undefined) {
                setCachedTranslation(
                    cache,
                    options.inputLanguageCode,
                    options.outputLanguageCode,
                    options.context ?? "",
                    source,
                    value,
                );
            }
        }
    }

    const output: { [key: string]: string } = {
        ...cachedOutput,
        ...translated,
    };

    // Copy each canonical translation back onto its duplicate keys.
    for (const [canonical, dupes] of canonicalToDupes) {
        const canonicalTranslation = output[canonical];
        // The generator may not return every key (the cache-recording step
        // above guards for the same case). Leave the duplicates out as well
        // rather than storing `undefined`, which would make the placeholder
        // restoration below throw on `.replaceAll`.
        if (canonicalTranslation === undefined) continue;

        for (const k of dupes) {
            output[k] = canonicalTranslation;
        }
    }

    const sortedOutput = sortByKey(output);

    replacePlaceholderWithNewLines(
        options.templatedStringPrefix as string,
        options.templatedStringSuffix as string,
        sortedOutput,
    );

    const unflattenedOutput = unflatten(sortedOutput, {
        delimiter: FLATTEN_DELIMITER,
    });

    if (options.verbose) {
        printExecutionTime(
            translationStats.translate.batchStartTime,
            "Total execution time: ",
        );
    }

    return unflattenedOutput as Object;
}

/**
 * Translate the difference of an input JSON to the given languages
 *
 * Keys present only in `inputJSONAfter` are "added", keys whose value changed
 * are "modified", and keys present only in `inputJSONBefore` are "deleted".
 * Deleted keys are removed from every language in `toUpdateJSONs`; added and
 * modified keys are re-translated per language and merged over the existing
 * translations.
 * @param options - The options for the translation
 * @returns Updated, unflattened JSON for each language in `toUpdateJSONs`
 */
export async function translateDiff(
    options: TranslateDiffOptions,
): Promise<{ [language: string]: Object }> {
    const flatInputBefore = flatten(options.inputJSONBefore, {
        delimiter: FLATTEN_DELIMITER,
    }) as {
        [key: string]: string;
    };

    const flatInputAfter = flatten(options.inputJSONAfter, {
        delimiter: FLATTEN_DELIMITER,
    }) as {
        [key: string]: string;
    };

    const flatToUpdateJSONs: { [language: string]: { [key: string]: string } } =
        {};

    for (const lang in options.toUpdateJSONs) {
        if (Object.prototype.hasOwnProperty.call(options.toUpdateJSONs, lang)) {
            const flatToUpdateJSON = flatten(options.toUpdateJSONs[lang], {
                delimiter: FLATTEN_DELIMITER,
            }) as {
                [key: string]: string;
            };

            flatToUpdateJSONs[lang] = flatToUpdateJSON;
        }
    }

    const addedKeys: string[] = [];
    const modifiedKeys: string[] = [];
    const deletedKeys: string[] = [];

    for (const key in flatInputBefore) {
        if (flatInputBefore[key] !== flatInputAfter[key]) {
            if (flatInputAfter[key] === undefined) {
                deletedKeys.push(key);
            } else {
                modifiedKeys.push(key);
            }
        }
    }

    for (const key in flatInputAfter) {
        if (flatInputBefore[key] === undefined) {
            addedKeys.push(key);
        }
    }

    if (options.verbose) {
        printInfo(`Added keys: ${addedKeys.join("\n")}\n`);
        printInfo(`Modified keys: ${modifiedKeys.join("\n")}\n`);
        printInfo(`Deleted keys: ${deletedKeys.join("\n")}\n`);
    }

    for (const key of deletedKeys) {
        for (const lang in flatToUpdateJSONs) {
            if (Object.prototype.hasOwnProperty.call(flatToUpdateJSONs, lang)) {
                delete flatToUpdateJSONs[lang][key];
            }
        }
    }

    // The set of strings to (re-)translate is the same for every language,
    // so build it once rather than once per loop iteration.
    const addedAndModifiedTranslations: { [key: string]: string } = {};
    for (const key of addedKeys) {
        addedAndModifiedTranslations[key] = flatInputAfter[key];
    }

    for (const key of modifiedKeys) {
        addedAndModifiedTranslations[key] = flatInputAfter[key];
    }

    const translatedJSONs: { [language: string]: { [key: string]: string } } =
        {};

    for (const languageCode in flatToUpdateJSONs) {
        if (
            Object.prototype.hasOwnProperty.call(
                flatToUpdateJSONs,
                languageCode,
            )
        ) {
            // Seed with the existing per-language map (minus the keys
            // deleted upstream) so unchanged translations are preserved.
            // Without this the accumulator would hold only the delta and
            // writing it to disk would wipe every pre-existing key.
            const merged: { [key: string]: string } = {
                ...flatToUpdateJSONs[languageCode],
            };

            // Languages are processed one after another on purpose.
            // eslint-disable-next-line no-await-in-loop
            const translated = await translate({
                ...options,
                // Hand each call its own copy of the shared work set.
                inputJSON: { ...addedAndModifiedTranslations },
                outputLanguageCode: languageCode,
            });

            const flatTranslated = flatten(translated, {
                delimiter: FLATTEN_DELIMITER,
            }) as {
                [key: string]: string;
            };

            for (const key in flatTranslated) {
                if (Object.prototype.hasOwnProperty.call(flatTranslated, key)) {
                    merged[key] = flatTranslated[key];
                }
            }

            // Sort the keys
            translatedJSONs[languageCode] = sortByKey(merged);

            if (options.onLanguageComplete) {
                const unflattened = unflatten(translatedJSONs[languageCode], {
                    delimiter: FLATTEN_DELIMITER,
                }) as Object;
                options.onLanguageComplete(
                    languageCode,
                    unflattened,
                    translatedJSONs[languageCode],
                );
            }
        }
    }

    const unflatToUpdateJSONs: { [language: string]: Object } = {};
    for (const lang in translatedJSONs) {
        if (Object.prototype.hasOwnProperty.call(translatedJSONs, lang)) {
            unflatToUpdateJSONs[lang] = unflatten(translatedJSONs[lang], {
                delimiter: FLATTEN_DELIMITER,
            });
        }
    }

    return unflatToUpdateJSONs;
}