UNPKG

i18n-ai-translate

Version:

AI-powered localization CLI, Node library, and GitHub Action. Translate i18next JSON, Gettext PO, Java .properties, and iOS .strings with ChatGPT, Claude, Gemini, or local Ollama models.

352 lines (315 loc) 13.6 kB
import { DEFAULT_RULE, getPluralRule } from "./po_plural_rules";
import { po } from "gettext-parser";
import type { GetTextComment, GetTextTranslations } from "gettext-parser";
import type FormatAdapter from "./format_adapter";

/**
 * ASCII Record Separator — used to join msgctxt, msgid, and optional
 * plural-suffix segments into a single flat key. Chosen for the same
 * reason `DIRECTORY_KEY_DELIMITER` was picked elsewhere: no legal PO
 * content can contain a control character, so round-trip is unambiguous.
 */
const KEY_DELIMITER = "\x1e";

/**
 * printf-style placeholder: `%s`, `%d`, `%1$s`, `%2$.2f`, … Capture
 * groups: (1) optional `N$` positional index.
 *
 * Deliberately tolerant — the adapter's contract is "whatever we strip
 * on read, we restore on write"; we don't need to fully validate the
 * printf spec.
 */
const PRINTF_REGEX =
    /%(?:(\d+)\$)?[-+ #0]*\d*(?:\.\d+)?(?:hh|h|ll|l|j|z|t|L)?[sdifFeEgGxXoucpn%]/g;

type PlaceholderMap = {
    /** Ordered list of native tokens, one per arg slot. */
    tokens: string[];
    /** Whether the entry used positional (`%1$s`) or bare (`%s`) tokens. */
    positional: boolean;
};

type POEntryMeta = {
    msgctxt?: string;
    msgid: string;
    isPlural: boolean;
    /** Original comments; preserved on write. */
    comments?: GetTextComment;
    placeholders?: PlaceholderMap;
};

/**
 * Everything we need to reconstruct the PO file after the flat map
 * round-trips through the pipeline. The parsed table is kept so header
 * fields, ordering, obsolete entries, and unreferenced comments all
 * survive a write.
 */
type POSidecar = {
    kind: "po";
    parsed: GetTextTranslations;
    /** Keyed by the same flat key used by the pipeline. */
    metaByKey: Record<string, POEntryMeta>;
    /** Source language's plural category list, captured at read time. */
    sourceCategories: readonly string[];
};

/**
 * Build the flat pipeline key for a PO entry.
 * @param msgctxt - the entry's context; a missing context is keyed as ""
 * @param msgid - the entry's msgid
 * @param suffix - optional plural suffix (`_one` / `_other`)
 * @returns the segments joined with `KEY_DELIMITER`
 */
function makeKey(
    msgctxt: string | undefined,
    msgid: string,
    suffix?: string,
): string {
    const ctx = msgctxt ?? "";
    return suffix
        ? `${ctx}${KEY_DELIMITER}${msgid}${KEY_DELIMITER}${suffix}`
        : `${ctx}${KEY_DELIMITER}${msgid}`;
}

/**
 * Replace every printf-style token in `text` with a `{{argN}}`
 * placeholder and record the native token so it can be restored later.
 * @param text - the raw msgid / msgid_plural text
 * @returns the normalized text plus the map needed to undo the change
 */
function stripPlaceholders(text: string): {
    normalized: string;
    map: PlaceholderMap;
} {
    const tokens: string[] = [];
    let positional = false;
    let autoIndex = 0;

    const normalized = text.replace(
        PRINTF_REGEX,
        (match: string, posIdx: string | undefined) => {
            // An escaped percent sign is not an argument slot.
            if (match === "%%") return match;

            let index: number;
            if (posIdx) {
                index = Number(posIdx);
                // Positional indices are 1-based. `%0$s` would be stored
                // at tokens[-1], which is a plain property rather than an
                // array element: `tokens.length` would stay 0 and the
                // token could never be restored. Leave it untouched.
                if (index < 1) return match;
                positional = true;
            } else {
                autoIndex++;
                index = autoIndex;
            }

            // Array positions are 0-based but arg indices are 1-based; keep
            // them aligned so a re-read matches on the same index.
            tokens[index - 1] = match;
            return `{{arg${index}}}`;
        },
    );

    return { map: { positional, tokens }, normalized };
}

/**
 * Inverse of `stripPlaceholders`: swap each `{{argN}}` back to the
 * native token recorded in `map`.
 * @param text - translated text containing `{{argN}}` placeholders
 * @param map - the placeholder map captured when the source was read
 * @returns the text with native tokens restored; unchanged when there
 * is no map or the map holds no tokens
 */
function restorePlaceholders(text: string, map?: PlaceholderMap): string {
    if (!map || map.tokens.length === 0) return text;

    return text.replace(/\{\{arg(\d+)\}\}/g, (_match, idx) => {
        const original = map.tokens[Number(idx) - 1];
        // If the model invented an extra arg reference, leave the
        // placeholder literal — surfacing it is better than silently
        // deleting it. The verification step already guards this.
        return original ?? `{{arg${idx}}}`;
    });
}

/**
 * Resolve the source language's plural category list from the PO
 * header. Falls back to `DEFAULT_RULE` when the header carries no
 * language code.
 *
 * The gettext `Language` header may be `ll`, `ll_CC`, `ll@variant`, or
 * `ll_CC@variant`, so the bare language code is whatever precedes the
 * first `-`, `_`, `@`, or `.`.
 * @param headers - the parsed PO header map
 * @returns the source language's ordered plural category list
 */
function inferSourceCategories(
    headers: Record<string, string>,
): readonly string[] {
    const lang = headers["Language"] ?? headers["language"] ?? "";
    const code = lang.trim().toLowerCase().split(/[-_@.]/)[0];
    if (!code) return DEFAULT_RULE.categories;
    return getPluralRule(code).categories;
}

/**
 * Format adapter for gettext `.po` catalogues. `read` exposes source
 * msgids for translation, `readTranslated` exposes existing msgstr
 * values, and `write` rebuilds a PO file for the target language.
 */
const POAdapter: FormatAdapter<POSidecar> = {
    extensions: [".po"] as const,

    name: "po",

    /**
     * Parse a source PO file into a flat key → text map.
     * @param raw - the PO file contents
     * @returns the flat map (placeholders normalized to `{{argN}}`) and
     * the sidecar needed by `write`
     */
    read(raw: string): { flat: Record<string, string>; sidecar: POSidecar } {
        const parsed = po.parse(raw);
        const sourceCategories = inferSourceCategories(parsed.headers);

        const flat: Record<string, string> = {};
        const metaByKey: Record<string, POEntryMeta> = {};

        for (const ctx of Object.keys(parsed.translations)) {
            const bucket = parsed.translations[ctx];
            for (const msgid of Object.keys(bucket)) {
                const entry = bucket[msgid];

                // The PO header is stored as the empty-msgid entry in
                // the empty-context bucket; skip it from the flat map.
                if (ctx === "" && msgid === "") continue;

                if (entry.msgid_plural) {
                    const msgids = [entry.msgid, entry.msgid_plural];
                    for (let i = 0; i < msgids.length; i++) {
                        // Source-language PO files typically only have
                        // msgstr[0] / msgstr[1] populated when acting
                        // as the *source*; we actually want to expose
                        // the English msgid / msgid_plural for the
                        // pipeline to translate, since msgstr is the
                        // target-language slot. For a pristine source
                        // PO (msgstr empty) this is the only sensible
                        // shape anyway.
                        const text = msgids[i];
                        const suffix = i === 0 ? "_one" : "_other";
                        const { normalized, map } = stripPlaceholders(text);
                        const key = makeKey(entry.msgctxt, entry.msgid, suffix);
                        flat[key] = normalized;
                        metaByKey[key] = {
                            comments: entry.comments,
                            isPlural: true,
                            msgctxt: entry.msgctxt,
                            msgid: entry.msgid,
                            placeholders: map,
                        };
                    }
                } else {
                    const { normalized, map } = stripPlaceholders(entry.msgid);
                    const key = makeKey(entry.msgctxt, entry.msgid);
                    flat[key] = normalized;
                    metaByKey[key] = {
                        comments: entry.comments,
                        isPlural: false,
                        msgctxt: entry.msgctxt,
                        msgid: entry.msgid,
                        placeholders: map,
                    };
                }
            }
        }

        return {
            flat,
            sidecar: {
                kind: "po",
                metaByKey,
                parsed,
                sourceCategories,
            },
        };
    },

    /**
     * Parse an existing target PO file, exposing its `msgstr` values
     * under the same keys `read` produces for the source.
     * @param raw - the PO file contents
     * @returns the flat map of existing translations and its sidecar
     */
    readTranslated(raw: string): {
        flat: Record<string, string>;
        sidecar: POSidecar;
    } {
        // Diff mode: read an existing *target* catalogue, exposing the
        // translated `msgstr` values (not `msgid`) keyed exactly as
        // `read` keys the source, so unchanged keys line up and survive.
        // Values are kept verbatim (placeholders not normalized): on
        // write they pass through restorePlaceholders, which is a no-op
        // when no `{{argN}}` tokens are present.
        const parsed = po.parse(raw);
        const sourceCategories = inferSourceCategories(parsed.headers);

        // msgstr[] slot holding the "one" form (slot 0 when the language
        // has no "one" category) and the first slot that is not "one".
        const oneIndex = Math.max(0, sourceCategories.indexOf("one"));
        const otherIndex = sourceCategories.findIndex((c) => c !== "one");

        const flat: Record<string, string> = {};
        const metaByKey: Record<string, POEntryMeta> = {};

        for (const ctx of Object.keys(parsed.translations)) {
            const bucket = parsed.translations[ctx];
            for (const msgid of Object.keys(bucket)) {
                const entry = bucket[msgid];

                // Skip the PO header entry.
                if (ctx === "" && msgid === "") continue;

                if (entry.msgid_plural) {
                    const oneKey = makeKey(entry.msgctxt, entry.msgid, "_one");
                    const otherKey = makeKey(
                        entry.msgctxt,
                        entry.msgid,
                        "_other",
                    );
                    flat[oneKey] = entry.msgstr[oneIndex] ?? "";
                    flat[otherKey] =
                        entry.msgstr[otherIndex >= 0 ? otherIndex : 0] ?? "";
                    metaByKey[oneKey] = {
                        comments: entry.comments,
                        isPlural: true,
                        msgctxt: entry.msgctxt,
                        msgid: entry.msgid,
                    };
                    // Both plural slots share one metadata object.
                    metaByKey[otherKey] = metaByKey[oneKey];
                } else {
                    const key = makeKey(entry.msgctxt, entry.msgid);
                    flat[key] = entry.msgstr[0] ?? "";
                    metaByKey[key] = {
                        comments: entry.comments,
                        isPlural: false,
                        msgctxt: entry.msgctxt,
                        msgid: entry.msgid,
                    };
                }
            }
        }

        return {
            flat,
            sidecar: { kind: "po", metaByKey, parsed, sourceCategories },
        };
    },

    /**
     * Rebuild a PO file for the target language from the translated
     * flat map and the sidecar captured at read time.
     * @param translated - flat key → translated text
     * @param sidecar - the sidecar returned by `read` / `readTranslated`
     * @param _inputLanguageCode - unused
     * @param outputLanguageCode - target language; selects the plural
     * rule and is written to the `Language` header
     * @returns the compiled PO file as a UTF-8 string
     */
    write(
        translated: Record<string, string>,
        sidecar: POSidecar,
        _inputLanguageCode: string,
        outputLanguageCode: string,
    ): string {
        const targetRule = getPluralRule(outputLanguageCode);

        // Rebuild the table with fresh `headers` / `translations`
        // containers and a fresh object per entry, so we don't mutate
        // the sidecar held by the caller (translateDiff may reuse it).
        // This is a shallow rebuild: `obsolete` and nested entry fields
        // are shared by reference and are never written to here.
        const out: GetTextTranslations = {
            charset: sidecar.parsed.charset,
            headers: { ...sidecar.parsed.headers },
            obsolete: sidecar.parsed.obsolete,
            translations: {},
        };

        // Update the header to reflect the target language's plurals.
        out.headers["Plural-Forms"] = targetRule.forms;
        out.headers["Language"] = outputLanguageCode;

        for (const ctx of Object.keys(sidecar.parsed.translations)) {
            out.translations[ctx] = {};
            const bucket = sidecar.parsed.translations[ctx];

            for (const msgid of Object.keys(bucket)) {
                const original = bucket[msgid];

                if (ctx === "" && msgid === "") {
                    // Preserve the PO header entry verbatim.
                    out.translations[ctx][msgid] = {
                        ...original,
                        // gettext-parser expects msgstr to be an array.
                        msgstr: original.msgstr,
                    };
                    continue;
                }

                if (original.msgid_plural) {
                    const oneKey = makeKey(
                        original.msgctxt,
                        original.msgid,
                        "_one",
                    );
                    const otherKey = makeKey(
                        original.msgctxt,
                        original.msgid,
                        "_other",
                    );
                    const oneMeta = sidecar.metaByKey[oneKey];
                    const otherMeta = sidecar.metaByKey[otherKey];
                    const translatedOne = translated[oneKey];
                    const translatedOther = translated[otherKey];

                    // Fan-in: re-expand the two i18next plural slots
                    // (`_one` / `_other`) into the target language's
                    // full msgstr[] array. For a language with >2
                    // forms we clone the `_other` slot into every
                    // non-`one` category; it's the honest v1 behavior
                    // given i18next's own two-form plural marking.
                    // Each source slot carries its own placeholder map
                    // (the singular and plural forms can differ), so
                    // restore each pick with the matching one.
                    const msgstr: string[] = [];
                    for (let i = 0; i < targetRule.nplurals; i++) {
                        const isOne = targetRule.categories[i] === "one";
                        const pick = isOne ? translatedOne : translatedOther;
                        const map = isOne
                            ? oneMeta?.placeholders
                            : otherMeta?.placeholders;
                        msgstr.push(restorePlaceholders(pick ?? "", map));
                    }

                    out.translations[ctx][msgid] = {
                        ...original,
                        msgstr,
                    };
                } else {
                    const key = makeKey(original.msgctxt, original.msgid);
                    const meta = sidecar.metaByKey[key];
                    // A key missing from `translated` is written as an
                    // empty msgstr.
                    const translatedText = translated[key] ?? "";
                    out.translations[ctx][msgid] = {
                        ...original,
                        msgstr: [
                            restorePlaceholders(
                                translatedText,
                                meta?.placeholders,
                            ),
                        ],
                    };
                }
            }
        }

        return po.compile(out).toString("utf-8");
    },
};

export default POAdapter;