// crowdin-context-harvester — Crowdin AI Context Harvester CLI
//@ts-check
import chalk from 'chalk';
import cliWidth from 'cli-width';
import fs from 'fs';
import { Parser } from 'json2csv';
import ora from 'ora';
import { table } from 'table';
import {
getCrowdin,
getUserId,
getCrowdinStrings,
validateAiProviderFields,
getTokenizer,
getPrompt,
getStringsChunks,
getModelLimits, getAiClient, stringifyStrings
} from './utils.js';
import {generateText, tool} from "ai";
import {z} from 'zod';
// Tool (function-call) schema sent to the AI model when using the Crowdin
// proxy provider. Forcing the model to answer via tool calls makes its output
// machine-parseable and more predictable than free-form text.
// NOTE(review): "strict" presumably maps to OpenAI's structured-output strict
// mode — confirm against the Crowdin proxy API before changing it.
const AI_TOOLS = [{
    "name": "gradeStringContext",
    "description": "Use this function to grade string context.",
    "strict": false,
    "parameters": {
        "type": "object",
        "properties": {
            // Crowdin string id, echoed back so results can be matched to strings.
            "id": {
                "type": "number",
                "description": "This is the ID of the string"
            },
            "sufficient": {
                "type": "boolean",
                "description": "Property that indicate if string context is sufficient for high quality translation"
            },
            // Free-form explanation; downstream code only records results whose
            // `error` is non-empty (see appendCheckResults).
            "error": {
                "type": "string",
                "description": "Error that describe problems with provided context"
            }
        },
        "required": [
            "id",
            "sufficient",
            "error"
        ]
    }
}];
// Default grading prompt. The %targetLanguages% and %strings% placeholders are
// substituted in buildMessages() before the prompt is sent to the model.
const DEFAULT_PROMPT = `You should help the translator to grade the context of strings.
To be sufficient, the context should:
- describe exact meaning of the string's text, without ambiguity;
- help the translator to provide high quality translation of the string's text for each project target language.
For each string:
- analyze string text and context;
- grade string context.
Project target languages: %targetLanguages%.
Strings (serialised as JSON):
%strings%
`;
// Shared terminal spinner for progress reporting across this module.
const spinner = ora();
/**
 * Prints a table of strings whose context was graded insufficient.
 * Nothing is written anywhere — this is the terminal (dry-run) output mode.
 *
 * @param {Array<object>} strings - Crowdin strings, possibly annotated with an `errors` array
 */
function dryRunPrint(strings) {
    const flagged = strings.filter((s) => s.errors);
    if (flagged.length < 1) {
        console.log(`\nNo strings with insufficient context found.\n`);
        return;
    }
    const terminalWidth = cliWidth();
    // Column widths as fractions of the terminal: id is narrower, the
    // text/context/error columns share the rest evenly.
    const columns = [0.13, 0.26, 0.26, 0.26].map((fraction) => ({
        width: Math.ceil(terminalWidth * fraction),
        wrapWord: true
    }));
    const config = {
        header: {
            alignment: 'center',
            content: 'Strings with errors'
        },
        columns
    };
    const rows = flagged.map((s) => [s.identifier, s.text, s.context, s.errors.join('\n')]);
    console.log('\n');
    //@ts-ignore
    console.log(table(rows, config));
    console.log(`\n${flagged.length} strings have context errors. Please be aware that an LLM model may return different results for the same input next time you run the tool.\n`);
}
/**
 * Saves strings that were graded as having insufficient context to a CSV file.
 *
 * @param {object} options - CLI options; `csvFile` is the destination path
 * @param {Array<object>} strings - Crowdin strings, possibly annotated with an `errors` array
 */
function writeCsv(options, strings) {
    const rows = strings
        .filter((s) => s.errors)
        .map((s) => ({
            id: s.id,
            key: s.identifier,
            text: s.text,
            context: s.context,
            errors: s.errors.join('\n'),
        }));
    if (rows.length < 1) {
        console.log(`\nNo strings with insufficient context found.\n`);
        return;
    }
    try {
        const parser = new Parser({ fields: ['id', 'key', 'text', 'context', 'errors'] });
        fs.writeFileSync(options.csvFile, parser.parse(rows));
        console.log(`\n${rows.length} strings saved to ${chalk.green(options.csvFile)}\n`);
    } catch (err) {
        // Best-effort: report and continue so the CLI can still exit cleanly.
        console.error(`Error writing CSV file: ${err}`);
    }
}
/**
 * Merges AI grading results into the string objects in place.
 * Each result carrying a non-empty `error` is appended to the matching
 * string's `errors` array (created on first use). Results without an error
 * (context graded sufficient) and results with no matching string are skipped.
 *
 * @param {Array<object>} strings - Crowdin strings (mutated in place)
 * @param {object} [checkResults] - `{ errors: [{ id, sufficient, error }] }` collected from the model
 */
async function appendCheckResults(strings, checkResults) {
    for (const result of checkResults?.errors || []) {
        // Skip early: no point searching for the string if there is no error to record.
        if (!result?.error) {
            continue;
        }
        const string = strings.find((s) => s.id === result.id);
        if (string) {
            if (!string.errors) {
                string.errors = [];
            }
            string.errors.push(result.error);
        }
    }
}
/**
 * Chunks the strings to fit the model's token limits and sends each chunk to
 * the AI model for context grading.
 *
 * @param {object} param0
 * @param {object} param0.apiClient - authenticated Crowdin API client
 * @param {object} param0.options - CLI options (project, ai provider, model, ...)
 * @param {Array<object>} param0.crowdinStrings - strings to grade
 * @returns {Promise<{errors: Array<object>}>} collected grading results
 */
async function checkStringsContext({ apiClient, options, crowdinStrings }) {
    // if there are no strings left after screening, we return an empty context
    if (!crowdinStrings.length) {
        console.log(`${chalk.gray(' No strings found.')}`)
        return { errors: [] };
    }
    // The project and supported-languages lookups are independent — fetch them in parallel.
    const [project, languages] = await Promise.all([
        apiClient.projectsGroupsApi.getProject(options.project).then((res) => res.data),
        apiClient.languagesApi.withFetchAll().listSupportedLanguages().then((res) => res.data),
    ]);
    const targetLanguageNames = project.targetLanguageIds
        .map((id) => languages.find(({ data }) => data.id === id)?.data?.name)
        .filter((a) => !!a);
    const tokenizer = getTokenizer(options.ai, options.model);
    const prompt = getPrompt({ options, defaultPrompt: DEFAULT_PROMPT });
    const modelLimits = getModelLimits(options);
    // We assume that context will be longer than strings, so only a quarter of
    // the output budget is spent on the strings themselves.
    const stringsChunkLimit = modelLimits.output / 4;
    const stringsChunks = getStringsChunks({
        crowdinStrings,
        tokenizer,
        chunkLimit: stringsChunkLimit
    });
    let chunkNumber = 1;
    const errors = [];
    for (const stringsChunk of stringsChunks) {
        spinner.start(`Processing chunk ${chunkNumber} of ${stringsChunks.length}`);
        try {
            const messages = buildMessages({ prompt, strings: stringsChunk, targetLanguageNames });
            const response = await executePrompt({
                apiClient,
                messages,
                options,
            });
            errors.push(...(response?.errors || []));
            spinner.succeed();
        } catch (e) {
            // One failed chunk should not abort the whole run — report and continue.
            spinner.fail();
            console.log(`\n${e?.response?.data?.error?.message || e}`);
        }
        chunkNumber++;
    }
    return { errors };
}
/**
 * Builds the chat messages for the AI model by substituting the %strings% and
 * %targetLanguages% placeholders in the prompt.
 *
 * Replacement values are supplied via callbacks so that `$`-sequences inside
 * the serialized strings (e.g. "$&", "$'") are inserted literally instead of
 * being interpreted as String.prototype.replace() substitution patterns.
 *
 * @param {object} param0
 * @param {string} param0.prompt - prompt template with placeholders
 * @param {Array<object>} param0.strings - strings chunk to serialize into the prompt
 * @param {Array<string>} param0.targetLanguageNames - project target language names
 * @returns {Array<{role: string, content: string}>} system + user message pair
 */
function buildMessages({ prompt, strings, targetLanguageNames }) {
    const builtPrompt = prompt
        .replace('%strings%', () => stringifyStrings({ strings }))
        .replace('%targetLanguages%', () => targetLanguageNames.join(', '));
    return [
        {
            role: 'system',
            content: 'You are helpful translator\'s assistant.',
        },
        {
            role: 'user',
            content: builtPrompt,
        }
    ];
}
/**
 * Picks a preferred AI provider and executes the prompt
 * Returns an array of objects, every object is a string id and extracted context
 *
 * @param {object} param0
 * @param {object} param0.apiClient
 * @param {object} param0.options
 * @param {Array<object>} param0.messages - [system, user] pair from buildMessages()
 * @returns {Promise<{errors: Array<object>}>} parsed tool-call arguments, one per graded string
 */
async function executePrompt({ apiClient, options, messages }) {
    // Crowdin-proxied completions take the raw JSON tool schema (AI_TOOLS);
    // every other provider goes through the Vercel AI SDK branch below.
    if (options.ai === 'crowdin') {
        let aiResponse;
        // Organization-scoped clients use the organization proxy endpoint;
        // otherwise the user-scoped endpoint is used (which needs the user id).
        if (apiClient.aiApi.organization) {
            aiResponse = (await apiClient.aiApi.createAiOrganizationProxyChatCompletion(options.crowdinAiId, {
                model: options.model,
                messages,
                tools: AI_TOOLS
            }));
        } else {
            aiResponse = (await apiClient.aiApi.createAiUserProxyChatCompletion(await getUserId(apiClient), options.crowdinAiId, {
                model: options.model,
                messages,
                tools: AI_TOOLS
            }));
        }
        const errors = [];
        // Each tool call carries one grading result as a JSON string; parse and
        // collect them all (the caller filters on the `error` property later).
        (aiResponse?.data?.choices?.[0]?.message?.tool_calls || []).forEach(toolCall => {
            const args = toolCall?.function?.arguments;
            if (args) {
                errors.push(JSON.parse(args));
            }
        })
        return { errors };
    }
    let client;
    try {
        client = getAiClient(options);
    } catch(e) {
        // Unknown/unsupported provider is unrecoverable for this run.
        console.error('\n\nInvalid AI provider');
        console.error(e);
        process.exit(1);
    }
    // Azure addresses models by deployment name rather than model id.
    const result = await generateText({
        model: client(options.ai === 'azure' ? options.azureDeploymentName : options.model),
        // Same gradeStringContext contract as AI_TOOLS, expressed as a zod schema.
        tools: {
            gradeStringContext: tool({
                description: 'Use this function to grade string context.',
                parameters: z.object({
                    id: z.number().describe('This is the ID of the string'),
                    sufficient: z.boolean().describe('Property that indicate if string context is sufficient for high quality translation'),
                    error: z.string().describe('Error that describe problems with provided context'),
                }),
            }),
        },
        system: messages[0].content,
        messages: [messages[1]],
    });
    // The SDK parses tool-call arguments into objects already — no JSON.parse needed.
    let errors = [];
    (result?.toolCalls || []).forEach(toolCall => {
        errors.push(
            toolCall.args
        );
    })
    return { errors };
}
/**
 * Entry point for the `check` command: fetches project strings, asks the AI
 * model to grade their context, and reports strings with insufficient context
 * either to the terminal or to a CSV file.
 *
 * @param {string} _name - command name (unused)
 * @param {object} commandOptions - commander options holder
 * @param {object} _command - commander command object (unused)
 */
async function check(_name, commandOptions, _command) {
    try {
        const options = commandOptions.opts();
        if (!['terminal', 'csv'].includes(options.output)) {
            // Message now matches the values actually accepted above.
            console.error('Wrong value provided for --output option. terminal and csv values are available.');
            process.exit(1); // non-zero exit: invalid CLI usage
        }
        validateAiProviderFields(options);
        const apiClient = await getCrowdin(options);
        const strings = await getCrowdinStrings({ options, apiClient, spinner });
        let checkResults = {};
        try {
            checkResults = await checkStringsContext({
                apiClient,
                crowdinStrings: strings,
                options,
            });
        } catch (e) {
            console.log('\nError during context check');
            console.error(e);
        }
        try {
            await appendCheckResults(strings, checkResults);
        } catch (error) {
            // Distinct message from the grading step so failures are attributable.
            console.log('\nError while appending check results');
            console.error(error);
        }
        if (options.output === 'csv') {
            writeCsv(options, strings);
        } else {
            dryRunPrint(strings);
        }
    } catch (error) {
        console.error('error:', error);
    }
}
export default check;