llmtrim
Version:
A library for trimming tokens during encoding and decoding in LLM applications.
55 lines (44 loc) • 1.72 kB
text/typescript
import natural from 'natural';
import { TrimOptions } from './types';
import { PUNCTUATION, ARTICLES_PREPOSITIONS, NEGATION_WORDS } from './types/i18n';
/**
 * Compacts text by tokenizing it, optionally filtering punctuation and
 * stopwords, optionally stemming, and re-joining the surviving tokens.
 */
export class Decoder {
  /**
   * Produce a compacted version of `text`.
   *
   * Steps, in order: apostrophes are deleted, the text is tokenized with
   * `natural.WordTokenizer`, punctuation tokens and stopwords are filtered
   * (when enabled), tokens are stemmed (when a stemmer is named), and the
   * result is joined with either nothing or a single space.
   *
   * @param text - The text to compact.
   * @param options - Processing switches. May be omitted, in which case the
   *   defaults apply: `language = 'english'`, `removeSpaces = true`,
   *   `removeStopwords = true`, `removePunctuation = false`, no stemming.
   * @returns The joined, trimmed token string.
   * @throws Error('Unsupported stemmer') when `options.stemmer` is set to a
   *   value other than `'porter'` or `'lancaster'`.
   */
  detrim(text: string, options: TrimOptions = {}): string {
    const {
      stemmer,
      language = 'english',
      removeSpaces = true,
      removeStopwords = true,
      removePunctuation = false,
    } = options;

    // Merge contractions by deleting straight and curly apostrophes
    // ("don't" becomes "dont") so each contraction stays a single token.
    const merged = text.replace(/['’]/g, '');

    // Tokenize.
    // NOTE(review): WordTokenizer appears to split on non-word characters, so
    // punctuation may already be gone at this point, which would make the
    // punctuation filter and the final re-attachment regex no-ops — confirm.
    let tokens = new natural.WordTokenizer().tokenize(merged);

    // Remove punctuation tokens.
    if (removePunctuation) {
      tokens = tokens.filter((word: string) => !PUNCTUATION.includes(word));
    }

    // Remove stopwords, but never the negation words of the chosen language:
    // they are taken back out of the exclusion set before filtering.
    // NOTE(review): natural.stopwords is applied whatever `language` is; it
    // looks like an English list — confirm this is intended for other languages.
    if (removeStopwords) {
      const wordsToExclude = new Set<string>([
        ...natural.stopwords,
        ...(ARTICLES_PREPOSITIONS[language] ?? []),
      ]);
      for (const negation of NEGATION_WORDS[language] ?? []) {
        wordsToExclude.delete(negation);
      }
      // Matching is case-insensitive on the token side only.
      tokens = tokens.filter((word: string) => !wordsToExclude.has(word.toLowerCase()));
    }

    // Stemming (only when a stemmer name was supplied).
    let words = tokens;
    if (stemmer) {
      if (stemmer !== 'porter' && stemmer !== 'lancaster') {
        throw new Error('Unsupported stemmer');
      }
      const stemmerInstance: natural.Stemmer =
        stemmer === 'porter' ? natural.PorterStemmer : natural.LancasterStemmer;
      words = tokens.map((word: string) => stemmerInstance.stem(word));
    }

    // Join with no separator when spaces are removed, otherwise single spaces.
    const separator = removeSpaces ? '' : ' ';
    let trimmed = words.join(separator).trim();

    // When punctuation is kept, drop the whitespace character that joining
    // placed in front of a closing punctuation mark ("word ." becomes "word.").
    if (!removePunctuation) {
      trimmed = trimmed.replace(/\s([?.!,:;])/g, '$1');
    }
    return trimmed;
  }
}