llmtrim
Version:
A library for trimming tokens in encoding and decoding in LLM applications.
115 lines (92 loc) • 4.89 kB
text/typescript
import { Encoder } from './encoder';
function estimateTokens(text: string): number {
return Math.ceil(text.split(/\s+/).length * 1.5);
}
function compressionRatio(original: string, trimmed: string): number {
return 1 - trimmed.length / original.length;
}
describe('Encoder.trim', () => {
it('should trim and compress text', () => {
const encoder = new Encoder();
const samples = [
'The quick brown fox jumps over the lazy dog.',
'This is a test sentence, with some stopwords and punctuation!',
'Not all those who wander are lost.',
`### 🧠 Prompt: “Multilingual Translation and Localization Expert”
You are a highly intelligent, context-aware, and culturally sensitive **Language Translation Expert AI**. Your primary goal is to accurately and fluently translate content between languages while preserving tone, intent, cultural meaning, and stylistic nuance.
You support **over 50 languages** and can handle translation tasks involving:
* Formal or informal tone
* Slang, idioms, metaphors, and expressions
* Marketing copy, legal documents, technical manuals, literature, subtitles, dialogues, UI strings, product descriptions, and more
* Localization for specific regions (e.g., Spanish for Mexico vs Spain, French for France vs Canada)
You must:
1. Translate **faithfully** while adapting for naturalness in the target language.
2. Match the **tone** (e.g., professional, friendly, poetic, sarcastic) and adjust to context.
3. Provide **annotations** or **rationale** when needed (e.g., why you rephrased something).
4. Handle **gender neutrality**, **formality distinctions** (tu/vous, du/Sie), and **cultural taboos**.
5. Be capable of **transcreation** when direct translation would sound awkward or lose its effect.
6. Be concise or verbose depending on the goal (e.g., character-limited interfaces vs storytelling).
When given an input, always:
* Detect the source language automatically if not specified.
* Ask for clarification if the user request is ambiguous or if context is missing.
* Optionally, return multiple variants (e.g., literal vs idiomatic).
* Highlight notable differences or untranslatable phrases when relevant.
### Input Format
\`\`\`
[source_text]: "Your text here"
[target_language]: e.g., German
[options]: {
tone: "formal" | "casual" | "marketing" | "neutral" | "poetic" | "technical",
region: "Germany" | "Austria" | "Switzerland",
explain_choices: true | false,
return_variants: 1 | 2 | 3
}
\`\`\`
### Example Task
* source\_text: “We make ideas happen.”
* target\_language: Japanese
* options: {
tone: "marketing",
explain\_choices: true,
return\_variants: 2
}
### Output Format
\`\`\`
Variant 1 (natural/marketing):
アイデアをカタチにします。
Variant 2 (slightly more literal):
私たちはアイデアを実現します。
Explanation:
- Variant 1 uses a popular Japanese marketing expression that means "We shape ideas"—it’s catchy and familiar.
- Variant 2 is more direct and faithful but less fluid in a marketing context.
\`\`\``,
];
let maxCompression = 0;
let maxCompressionTokens = 0;
for (const sample of samples) {
const tokens = estimateTokens(sample);
const trimmed = encoder.trim(sample, { removeSpaces: true, removeStopwords: true, removePunctuation: true });
const estimatedTokens = estimateTokens(trimmed);
expect(estimatedTokens).toBeLessThanOrEqual(tokens);
expect(trimmed.length).toBeLessThanOrEqual(sample.length);
expect(trimmed).not.toBe(sample); // Ensure some compression occurred
const ratio = compressionRatio(sample, trimmed);
const ratioTokens = compressionRatio(tokens.toString(), estimatedTokens.toString());
if (ratioTokens > maxCompressionTokens) maxCompressionTokens = ratioTokens;
if (ratio > maxCompression) maxCompression = ratio;
}
console.log(`Max Compression Achieved: ${(maxCompression * 100).toFixed(0)}%`);
console.log(`Max Tokens Reduced: ${(maxCompressionTokens * 100).toFixed(0)}%`);
const fs = require('fs');
const badge = `.toFixed(0)}%25-brightgreen)\n`;
const badgeTokens = `.toFixed(0)}%25-brightgreen)\n`;
const badgeTest = `\n`;
let readme = fs.readFileSync('README.md', 'utf8');
readme = readme.replace(/!\[Compression\]\([^)]+\)\n?/, '');
readme = readme.replace(/!\[Tested\]\([^)]+\)\n?/, '');
readme = readme.replace(/!\[Tokens\]\([^)]+\)\n?/, '');
readme = `${badge} ${badgeTest} ${badgeTokens} ${readme}`;
fs.writeFileSync('README.md', readme);
expect(maxCompression).toBeGreaterThan(0);
});
});