i18n-ai-translate
Version:
AI-powered localization CLI, Node library, and GitHub Action. Translate i18next JSON, Gettext PO, Java .properties, and iOS .strings with ChatGPT, Claude, Gemini, or local Ollama models.
352 lines (315 loc) • 13.6 kB
text/typescript
import { DEFAULT_RULE, getPluralRule } from "./po_plural_rules";
import { po } from "gettext-parser";
import type { GetTextComment, GetTextTranslations } from "gettext-parser";
import type FormatAdapter from "./format_adapter";
/**
 * ASCII Record Separator (U+001E) — used to join msgctxt, msgid, and the
 * optional plural-suffix segment into a single flat key.
 *
 * A control character is used so that the delimiter is very unlikely to
 * collide with real msgctxt / msgid text, which keeps keys built from
 * different (msgctxt, msgid, suffix) tuples distinct. Per the original
 * note, this mirrors the reasoning behind `DIRECTORY_KEY_DELIMITER`
 * (defined outside this file).
 *
 * NOTE(review): PO strings may legally contain other control characters
 * (e.g. `\n`, `\t`); the scheme only relies on U+001E itself not
 * appearing in catalogue text.
 */
const KEY_DELIMITER = "\x1e";
/**
 * printf-style placeholder: `%s`, `%d`, `%1$s`, `%2$.2f`, `%-5d`, `%ld`, …
 * plus the literal-percent escape `%%`. Capture groups: (1) optional `N$`
 * positional index (never set for `%%`).
 *
 * Deliberately tolerant — the adapter's contract is "whatever we strip
 * on read, we restore on write"; we don't need to fully validate the
 * printf spec.
 *
 * Two guards keep ordinary prose from being mistaken for a conversion:
 *  - the space flag is NOT accepted, so "50% discount" / "10% off" are not
 *    read as `% d` / `% o`;
 *  - `%` is only accepted as a conversion in the bare `%%` form, so a
 *    range such as "5%-10%" is not swallowed as `%-10%`.
 */
const PRINTF_REGEX =
    /%%|%(?:(\d+)\$)?[-+#0]*\d*(?:\.\d+)?(?:hh|h|ll|l|j|z|t|L)?[sdifFeEgGxXoucpn]/g;
/**
 * Native printf tokens captured from one message while its placeholders
 * were swapped for `{{argN}}` markers.
 */
type PlaceholderMap = {
    /** `true` when the entry used positional (`%1$s`) rather than bare (`%s`) tokens. */
    positional: boolean;
    /** Native tokens in argument order: slot N is stored at index N - 1. */
    tokens: string[];
};
/**
 * Per-key bookkeeping recorded when a PO entry is flattened, so the
 * entry can be located and rebuilt when the catalogue is written back.
 */
type POEntryMeta = {
    /** Entry's msgid (the singular msgid for plural entries). */
    msgid: string;
    /** Entry's msgctxt, when it has one. */
    msgctxt?: string;
    /** `true` when the entry carries a `msgid_plural`. */
    isPlural: boolean;
    /** Native printf tokens stripped from this key's text, if any were recorded. */
    placeholders?: PlaceholderMap;
    /** Original comments; preserved on write. */
    comments?: GetTextComment;
};
/**
 * Side-channel state that travels with the flat key/value map so the PO
 * file can be rebuilt once the map has been through the pipeline. The
 * full parsed table is retained so that header fields, ordering,
 * obsolete entries, and unreferenced comments all survive a write.
 */
type POSidecar = {
    /** Discriminator identifying this sidecar as belonging to the PO adapter. */
    kind: "po";
    /** Plural category list inferred from the catalogue header at read time. */
    sourceCategories: readonly string[];
    /** Entry metadata, keyed by the same flat key used by the pipeline. */
    metaByKey: Record<string, POEntryMeta>;
    /** The gettext-parser table exactly as parsed from the input. */
    parsed: GetTextTranslations;
};
/**
 * Build the flat pipeline key for a PO entry by joining its context,
 * msgid and (optionally) a plural suffix with `KEY_DELIMITER`.
 * @param msgctxt - the entry's context; a missing context contributes an empty segment
 * @param msgid - the entry's msgid
 * @param suffix - optional plural suffix (e.g. `_one`); omitted or empty adds no segment
 * @returns the delimiter-joined flat key
 */
function makeKey(
    msgctxt: string | undefined,
    msgid: string,
    suffix?: string,
): string {
    let key = `${msgctxt ?? ""}${KEY_DELIMITER}${msgid}`;
    if (suffix) {
        key += `${KEY_DELIMITER}${suffix}`;
    }
    return key;
}
/**
 * Replace every `PRINTF_REGEX` match in `text` with a neutral
 * `{{argN}}` marker and remember the native token for each slot.
 *
 * Positional tokens (`%2$s`) keep their own index; bare tokens (`%s`)
 * are numbered 1, 2, 3… in order of appearance. A literal `%%` is left
 * untouched and consumes no slot.
 * @param text - the raw message text
 * @returns the marker-substituted text plus the token map needed to undo it
 */
function stripPlaceholders(text: string): {
    normalized: string;
    map: PlaceholderMap;
} {
    const map: PlaceholderMap = { positional: false, tokens: [] };
    let bareCount = 0;

    const normalized = text.replace(
        PRINTF_REGEX,
        (token: string, positionalIndex?: string) => {
            if (token === "%%") {
                return token;
            }

            let slot: number;
            if (positionalIndex) {
                map.positional = true;
                slot = Number(positionalIndex);
            } else {
                bareCount += 1;
                slot = bareCount;
            }

            // Slots are 1-based, the token array is 0-based: slot N is
            // stored at N - 1 so restoring looks up the same position.
            map.tokens[slot - 1] = token;
            return `{{arg${slot}}}`;
        },
    );

    return { map, normalized };
}
/**
 * Swap `{{argN}}` markers in `text` back to the native tokens recorded
 * in `map`.
 * @param text - text containing zero or more `{{argN}}` markers
 * @param map - token map for this text; when absent or empty, `text` is returned unchanged
 * @returns the text with every known marker replaced by its native token
 */
function restorePlaceholders(text: string, map?: PlaceholderMap): string {
    if (!map) {
        return text;
    }
    const { tokens } = map;
    if (tokens.length === 0) {
        return text;
    }

    return text.replace(
        /\{\{arg(\d+)\}\}/g,
        (marker: string, slot: string) =>
            // A marker with no recorded token (e.g. one the model made
            // up) is kept as-is: a visible stray marker is better than
            // a silent deletion.
            tokens[Number(slot) - 1] ?? marker,
    );
}
/**
 * Determine the plural category list for a catalogue from its PO
 * header. The `Language` header (or lowercase `language`) is reduced to
 * its primary subtag — lowercased, cut at the first `-` or `_` — and
 * looked up with `getPluralRule`. When the header is absent or empty,
 * `DEFAULT_RULE`'s categories are returned instead.
 * @param headers - the parsed PO header map
 * @returns the ordered plural category list for the header's language
 */
function inferSourceCategories(
    headers: Record<string, string>,
): readonly string[] {
    const language = headers["Language"] ?? headers["language"] ?? "";
    const [primarySubtag] = language.toLowerCase().split(/[-_]/);
    return primarySubtag
        ? getPluralRule(primarySubtag).categories
        : DEFAULT_RULE.categories;
}
/**
 * Gettext PO format adapter.
 *
 * - `read` flattens a source catalogue into `key -> msgid text` (with
 *   printf tokens normalized to `{{argN}}`).
 * - `readTranslated` flattens an existing target catalogue into
 *   `key -> msgstr text`, using the same keys as `read`.
 * - `write` rebuilds a PO file for the output language from a flat map
 *   of translations plus the sidecar captured at read time.
 *
 * Plural entries are exposed as two flat keys, suffixed `_one` and
 * `_other`.
 */
const POAdapter: FormatAdapter<POSidecar> = {
    extensions: [".po"] as const,
    name: "po",

    /**
     * Parse a source PO file and expose its translatable text.
     * @param raw - PO file contents
     * @returns the flat key/text map and the sidecar needed by `write`
     */
    read(raw: string): { flat: Record<string, string>; sidecar: POSidecar } {
        const parsed = po.parse(raw);
        const sourceCategories = inferSourceCategories(parsed.headers);

        const flat: Record<string, string> = {};
        const metaByKey: Record<string, POEntryMeta> = {};

        for (const ctx of Object.keys(parsed.translations)) {
            const bucket = parsed.translations[ctx];
            for (const msgid of Object.keys(bucket)) {
                // The PO header is stored as the empty-msgid entry in
                // the empty-context bucket; keep it out of the flat map.
                if (ctx === "" && msgid === "") continue;

                const entry = bucket[msgid];

                if (entry.msgid_plural) {
                    // Expose msgid / msgid_plural (not msgstr) for the
                    // pipeline to translate: msgstr is the
                    // target-language slot, and for a pristine source
                    // PO it is empty anyway.
                    const forms: [string, string][] = [
                        ["_one", entry.msgid],
                        ["_other", entry.msgid_plural],
                    ];
                    for (const [suffix, text] of forms) {
                        const { normalized, map } = stripPlaceholders(text);
                        const key = makeKey(entry.msgctxt, entry.msgid, suffix);
                        flat[key] = normalized;
                        metaByKey[key] = {
                            comments: entry.comments,
                            isPlural: true,
                            msgctxt: entry.msgctxt,
                            msgid: entry.msgid,
                            placeholders: map,
                        };
                    }
                } else {
                    const { normalized, map } = stripPlaceholders(entry.msgid);
                    const key = makeKey(entry.msgctxt, entry.msgid);
                    flat[key] = normalized;
                    metaByKey[key] = {
                        comments: entry.comments,
                        isPlural: false,
                        msgctxt: entry.msgctxt,
                        msgid: entry.msgid,
                        placeholders: map,
                    };
                }
            }
        }

        return {
            flat,
            sidecar: {
                kind: "po",
                metaByKey,
                parsed,
                sourceCategories,
            },
        };
    },

    /**
     * Diff mode: parse an existing *target* catalogue and expose its
     * translated `msgstr` values (not `msgid`), keyed exactly as `read`
     * keys the source so unchanged keys line up.
     *
     * Values are kept verbatim (placeholders are not normalized and no
     * placeholder map is recorded); on write they pass through
     * `restorePlaceholders`, which leaves text without `{{argN}}`
     * markers unchanged.
     * @param raw - PO file contents of the existing translation
     * @returns the flat key/msgstr map and a sidecar for this catalogue
     */
    readTranslated(raw: string): {
        flat: Record<string, string>;
        sidecar: POSidecar;
    } {
        const parsed = po.parse(raw);
        // Category list inferred from THIS catalogue's own header.
        const sourceCategories = inferSourceCategories(parsed.headers);

        const oneIndex = Math.max(0, sourceCategories.indexOf("one"));
        // The `_other` slot should come from the category actually named
        // "other". Taking the first non-"one" category would pick e.g. a
        // "zero" or "few" form for catalogues with more than two forms.
        // The first non-"one" index is kept as a fallback for category
        // lists without "other" and for entries with fewer msgstr forms.
        const firstNonOne = sourceCategories.findIndex((c) => c !== "one");
        const fallbackOtherIndex = Math.max(0, firstNonOne);
        const namedOther = sourceCategories.indexOf("other");
        const otherIndex = namedOther >= 0 ? namedOther : fallbackOtherIndex;

        const flat: Record<string, string> = {};
        const metaByKey: Record<string, POEntryMeta> = {};

        for (const ctx of Object.keys(parsed.translations)) {
            const bucket = parsed.translations[ctx];
            for (const msgid of Object.keys(bucket)) {
                // Skip the PO header entry.
                if (ctx === "" && msgid === "") continue;

                const entry = bucket[msgid];

                if (entry.msgid_plural) {
                    const oneKey = makeKey(entry.msgctxt, entry.msgid, "_one");
                    const otherKey = makeKey(
                        entry.msgctxt,
                        entry.msgid,
                        "_other",
                    );
                    flat[oneKey] = entry.msgstr[oneIndex] ?? "";
                    flat[otherKey] =
                        entry.msgstr[otherIndex] ??
                        entry.msgstr[fallbackOtherIndex] ??
                        "";
                    // One metadata object per key (as `read` does), so a
                    // later change to one slot's metadata cannot leak
                    // into the other.
                    metaByKey[oneKey] = {
                        comments: entry.comments,
                        isPlural: true,
                        msgctxt: entry.msgctxt,
                        msgid: entry.msgid,
                    };
                    metaByKey[otherKey] = {
                        comments: entry.comments,
                        isPlural: true,
                        msgctxt: entry.msgctxt,
                        msgid: entry.msgid,
                    };
                } else {
                    const key = makeKey(entry.msgctxt, entry.msgid);
                    flat[key] = entry.msgstr[0] ?? "";
                    metaByKey[key] = {
                        comments: entry.comments,
                        isPlural: false,
                        msgctxt: entry.msgctxt,
                        msgid: entry.msgid,
                    };
                }
            }
        }

        return {
            flat,
            sidecar: { kind: "po", metaByKey, parsed, sourceCategories },
        };
    },

    /**
     * Rebuild a PO file for the output language.
     *
     * Every entry of the parsed table held in the sidecar is emitted in
     * its original position; its `msgstr` is replaced with the matching
     * value from `translated` (empty string when the key is missing),
     * with `{{argN}}` markers restored to native printf tokens.
     * @param translated - flat key/translation map
     * @param sidecar - sidecar produced by `read` / `readTranslated`
     * @param _inputLanguageCode - unused
     * @param outputLanguageCode - language code written to the header and used to pick the plural rule
     * @returns the compiled PO file as a UTF-8 string
     */
    write(
        translated: Record<string, string>,
        sidecar: POSidecar,
        _inputLanguageCode: string,
        outputLanguageCode: string,
    ): string {
        const targetRule = getPluralRule(outputLanguageCode);

        // Build a fresh table rather than editing the sidecar's parsed
        // table in place (the original note says translateDiff may
        // reuse it). This is a shallow rebuild: headers and each entry
        // object are copied, while `obsolete` and nested values such as
        // comments are shared by reference and are not modified here.
        const out: GetTextTranslations = {
            charset: sidecar.parsed.charset,
            headers: { ...sidecar.parsed.headers },
            obsolete: sidecar.parsed.obsolete,
            translations: {},
        };

        // Update the header to reflect the target language's plurals.
        out.headers["Plural-Forms"] = targetRule.forms;
        out.headers["Language"] = outputLanguageCode;

        for (const ctx of Object.keys(sidecar.parsed.translations)) {
            out.translations[ctx] = {};
            const bucket = sidecar.parsed.translations[ctx];

            for (const msgid of Object.keys(bucket)) {
                const original = bucket[msgid];

                if (ctx === "" && msgid === "") {
                    // Carry the PO header entry over unchanged.
                    out.translations[ctx][msgid] = {
                        ...original,
                        // gettext-parser expects msgstr to be an array.
                        msgstr: original.msgstr,
                    };
                    continue;
                }

                if (original.msgid_plural) {
                    const oneKey = makeKey(
                        original.msgctxt,
                        original.msgid,
                        "_one",
                    );
                    const otherKey = makeKey(
                        original.msgctxt,
                        original.msgid,
                        "_other",
                    );
                    const oneMeta = sidecar.metaByKey[oneKey];
                    const otherMeta = sidecar.metaByKey[otherKey];
                    const translatedOne = translated[oneKey];
                    const translatedOther = translated[otherKey];

                    // Fan-in: re-expand the two plural slots (`_one` /
                    // `_other`) into the target language's full
                    // msgstr[] array. For a language with more than two
                    // forms the `_other` text is cloned into every
                    // non-`one` category. Each slot carries its own
                    // placeholder map (singular and plural text can
                    // differ), so each pick is restored with its own.
                    const msgstr: string[] = [];
                    for (let i = 0; i < targetRule.nplurals; i++) {
                        const isOne = targetRule.categories[i] === "one";
                        const pick = isOne ? translatedOne : translatedOther;
                        const map = isOne
                            ? oneMeta?.placeholders
                            : otherMeta?.placeholders;
                        msgstr.push(restorePlaceholders(pick ?? "", map));
                    }

                    out.translations[ctx][msgid] = {
                        ...original,
                        msgstr,
                    };
                } else {
                    const key = makeKey(original.msgctxt, original.msgid);
                    const meta = sidecar.metaByKey[key];
                    const translatedText = translated[key] ?? "";
                    out.translations[ctx][msgid] = {
                        ...original,
                        msgstr: [
                            restorePlaceholders(
                                translatedText,
                                meta?.placeholders,
                            ),
                        ],
                    };
                }
            }
        }

        return po.compile(out).toString("utf-8");
    },
};

export default POAdapter;