UNPKG

llmtrim

Version:

A library for trimming tokens in encoding and decoding in LLM applications.

118 lines (101 loc) 4.83 kB
import natural from 'natural';
import { TrimOptions } from './types';
import { PUNCTUATION, ARTICLES_PREPOSITIONS, NEGATION_WORDS } from './types/i18n';

/**
 * Matches the internal markers inserted by {@link Encoder.trim}: protected-block
 * placeholders and the line-break marker. The capturing group makes
 * `String.prototype.split` keep the markers in its result, so they always sit
 * at the odd indices of the split array.
 */
const INTERNAL_MARKER_REGEX = /(__PROTECTED_BLOCK_\d+__|__BREAK__)/;

/** Builds the placeholder that stands in for the protected block at `index`. */
function protectedPlaceholder(index: number): string {
  return `__PROTECTED_BLOCK_${index}__`;
}

/**
 * Maps a stemmer name to the corresponding `natural` stemmer.
 * @throws Error('Unsupported stemmer') for any name other than 'porter' or 'lancaster'.
 */
function resolveStemmer(name: NonNullable<TrimOptions['stemmer']>): natural.Stemmer {
  if (name === 'porter') {
    return natural.PorterStemmer;
  }
  if (name === 'lancaster') {
    return natural.LancasterStemmer;
  }
  throw new Error('Unsupported stemmer');
}

/**
 * Stems a token while leaving any internal markers inside it untouched.
 * Markers are made of word characters, so the tokenizer can fuse them with
 * neighbouring words (e.g. `foo__BREAK__bar`). Stemmers may lowercase or rewrite
 * their input, which would make the markers impossible to restore afterwards,
 * so only the text between markers is passed to the stemmer.
 */
function stemPreservingMarkers(token: string, stemmer: natural.Stemmer): string {
  return token
    .split(INTERNAL_MARKER_REGEX)
    .map((part, index) => (index % 2 === 1 || part === '' ? part : stemmer.stem(part)))
    .join('');
}

/**
 * Encoder class for text processing
 * This class provides methods to trim text by removing stopwords, punctuation, and applying stemming.
 * It also handles special cases like code snippets and JSON blocks to preserve their integrity.
 * @class Encoder
 * @example
 * const encoder = new Encoder();
 * const trimmedText = encoder.trim("The quick brown fox jumps over the lazy dog.", { stemmer: 'porter', removeStopwords: true });
 */
export class Encoder {
  /**
   * Trims the input text by removing stopwords, punctuation, and applying stemming.
   * It also preserves code snippets and JSON blocks by replacing them with placeholders during processing.
   * @param text The input text/prompt to be trimmed.
   * @param options Options for trimming the text.
   * @param options.stemmer The stemming algorithm to use ('porter' or 'lancaster'). No stemming when omitted.
   * @param options.language The language key used to look up extra stopwords and negation words (default is 'english').
   * @param options.removeSpaces Whether to join the remaining words without spaces (default is true).
   * @param options.removeStopwords Whether to remove stopwords (default is true).
   * @param options.removePunctuation Whether to remove punctuation tokens (default is false).
   * @param options.removeNewLines Whether to replace line breaks with a space instead of preserving them (default is false).
   * @returns The trimmed text.
   * @throws Will throw an error if an unsupported stemmer is specified.
   */
  trim(text: string, options: TrimOptions = {}): string {
    const {
      stemmer,
      language = 'english',
      removeSpaces = true,
      removeStopwords = true,
      removePunctuation = false,
      removeNewLines = false
    } = options;

    const protectedBlocks: string[] = [];

    // Match JSON-like or code-like patterns
    const codeJsonRegex = /(```[\s\S]*?```|{[\s\S]*?}|\[[\s\S]*?]|\bfunction\s+\w+\s*\([^)]*\)\s*{[\s\S]*?})/g;
    text = text.replace(codeJsonRegex, (match) => {
      // The block's position in the array doubles as its placeholder index.
      const placeholder = protectedPlaceholder(protectedBlocks.length);
      protectedBlocks.push(match);
      return placeholder;
    });

    // Drop apostrophes so contractions stay a single token.
    text = text.replace(/['’]/g, '');

    if (removeNewLines) {
      text = text.replace(/[\r\n]+/g, ' ');
    } else {
      // Runs of line breaks become a word-character marker that survives tokenization.
      text = text.replace(/[\r\n]+/g, '__BREAK__');
    }

    const tokenizer = new natural.WordTokenizer();
    let tokenized = tokenizer.tokenize(text);

    if (removePunctuation) {
      // `any` is kept on purpose: PUNCTUATION's element type is declared outside
      // this file, and narrowing the parameter could break the `includes` call.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      tokenized = tokenized.filter((word: any) => !PUNCTUATION.includes(word));
    }

    if (removeStopwords) {
      const stopwords = natural.stopwords;
      const wordsToExclude = new Set([...stopwords, ...(ARTICLES_PREPOSITIONS[language] || [])]);
      // Negation words carry meaning, so they are never treated as stopwords.
      NEGATION_WORDS[language]?.forEach((neg) => wordsToExclude.delete(neg));
      tokenized = tokenized.filter((word: string) => !wordsToExclude.has(word.toLowerCase()));
    }

    let words = tokenized;
    if (stemmer) {
      const stemmerInstance = resolveStemmer(stemmer);
      words = tokenized.map((word: string) => stemPreservingMarkers(word, stemmerInstance));
    }

    const joinStr = removeSpaces ? '' : ' ';
    let trimmed = words.join(joinStr).trim();

    if (!removePunctuation) {
      // Pull punctuation back against the preceding word.
      trimmed = trimmed.replace(/\s([?.!,:;])/g, '$1');
    }

    // Restore protected blocks. A replacer function is used so that `$&`, `$1`,
    // `$$` and similar sequences inside a block are inserted literally rather
    // than being interpreted as replacement patterns.
    protectedBlocks.forEach((block, index) => {
      trimmed = trimmed.replace(protectedPlaceholder(index), () => block);
    });

    // Replace the __BREAK__ markers with actual new lines if not removing new lines
    if (!removeNewLines) {
      trimmed = trimmed.replace(/__BREAK__/g, '\n');
    }

    return trimmed;
  }
}

/**
 * Trims the input text by removing stopwords, punctuation, and applying stemming.
 * Convenience wrapper that creates an {@link Encoder} and calls its `trim` method.
 * It also preserves code snippets and JSON blocks by replacing them with placeholders during processing.
 * @param text The input text/prompt to be trimmed.
 * @param options Options for trimming the text.
 * @param options.stemmer The stemming algorithm to use ('porter' or 'lancaster'). No stemming when omitted.
 * @param options.language The language key used to look up extra stopwords and negation words (default is 'english').
 * @param options.removeSpaces Whether to join the remaining words without spaces (default is true).
 * @param options.removeStopwords Whether to remove stopwords (default is true).
 * @param options.removePunctuation Whether to remove punctuation tokens (default is false).
 * @param options.removeNewLines Whether to replace line breaks with a space instead of preserving them (default is false).
 * @returns The trimmed text.
 * @throws Will throw an error if an unsupported stemmer is specified.
 */
export function trim(text: string, options: TrimOptions = {}): string {
  const encoder = new Encoder();
  return encoder.trim(text, options);
}