@dvdagames/pgn-tokenizer
TypeScript version of PGN Tokenizer, a Byte Pair Encoding (BPE) tokenizer for chess Portable Game Notation (PGN).
TypeScript
/**
* Types for PGN Tokenizer Configuration
*/
/**
* Represents a token added to the tokenizer's vocabulary
*/
export interface AddedToken {
  /** Unique identifier for the token */
  id: number;
  /** The actual content of the token */
  content: string;
  /** Whether the token is a single word */
  single_word: boolean;
  /** Whether to strip whitespace from the left */
  lstrip: boolean;
  /** Whether to strip whitespace from the right */
  rstrip: boolean;
  /** Whether the token is normalized */
  normalized: boolean;
  /** Whether this is a special token */
  special: boolean;
}
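/**
 * Illustrative sketch only: what a single AddedToken entry could look like for a
 * special end-of-text marker. The id and content here are assumptions for
 * demonstration, not values taken from the published tokenizer configuration.
 */
const exampleAddedToken: AddedToken = {
  id: 0, // assumed id
  content: "<|endoftext|>", // hypothetical special-token content
  single_word: false,
  lstrip: false,
  rstrip: false,
  normalized: false,
  special: true,
};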
/**
* Configuration for the normalizer
*/
export interface Normalizer {
  /** Type of normalization to apply */
  type: string;
}
/**
* Configuration for the pre-tokenizer
*/
export interface PreTokenizer {
  /** Type of pre-tokenizer */
  type: string;
  /** Pattern configuration */
  pattern: {
    /** Regular expression pattern */
    Regex: string;
  };
  /** Behavior of the pre-tokenizer */
  behavior: string;
  /** Whether to invert the pattern matching */
  invert: boolean;
}
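/**
 * Illustrative sketch only: a pre-tokenizer configuration that splits input on a
 * regex. The pattern and behavior values shown are assumptions for demonstration,
 * not the pattern used by the published tokenizer.
 */
const examplePreTokenizer: PreTokenizer = {
  type: "Split",
  pattern: {
    Regex: "\\s+", // hypothetical: split on runs of whitespace
  },
  behavior: "Removed", // assumed behavior value
  invert: false,
};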
/**
 * Configuration for byte-level processing, used for both the post-processor and the decoder
 */
export interface PostProcessorConfig {
  /** Type of processor */
  type: string;
  /** Whether to add prefix space */
  add_prefix_space: boolean;
  /** Whether to trim offsets */
  trim_offsets: boolean;
  /** Whether to use regex */
  use_regex: boolean;
}
/**
* Configuration for the BPE model
*/
export interface ModelConfig {
  /** Type of the model */
  type: string;
  /** Dropout rate */
  dropout: number | null;
  /** Token to use for unknown tokens */
  unk_token: string;
  /** Prefix for continuing subwords */
  continuing_subword_prefix: string | null;
  /** Suffix for end of words */
  end_of_word_suffix: string | null;
  /** Whether to fuse unknown tokens */
  fuse_unk: boolean;
  /** Whether to use byte fallback */
  byte_fallback: boolean;
  /** Whether to ignore merges */
  ignore_merges: boolean;
  /** Vocabulary mapping */
  vocab: Record<string, number>;
  /** Merge rules */
  merges: string[][];
}
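/**
 * Illustrative sketch only: a minimal BPE model configuration with a tiny made-up
 * vocabulary and merge list. The trained tokenizer ships a far larger vocabulary
 * and merge set; the values below exist only to show the shape of the data.
 */
const exampleModel: ModelConfig = {
  type: "BPE",
  dropout: null,
  unk_token: "<|unk|>", // assumed unknown-token string
  continuing_subword_prefix: null,
  end_of_word_suffix: null,
  fuse_unk: false,
  byte_fallback: false,
  ignore_merges: false,
  vocab: { e: 0, "4": 1, e4: 2 }, // toy entries for illustration
  merges: [["e", "4"]], // merge "e" + "4" into "e4"
};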
/**
* Main configuration type for the PGN tokenizer
*/
export interface PGNTokenizerConfig {
  /** Version of the tokenizer configuration */
  version: string;
  /** Truncation configuration */
  truncation: null;
  /** Padding configuration */
  padding: null;
  /** List of added tokens */
  added_tokens: AddedToken[];
  /** Normalizer configuration */
  normalizer: Normalizer;
  /** Pre-tokenizer configuration */
  pre_tokenizer: PreTokenizer;
  /** Post-processor configuration */
  post_processor: PostProcessorConfig;
  /** Decoder configuration */
  decoder: PostProcessorConfig;
  /** Model configuration */
  model: ModelConfig;
}
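/**
 * Illustrative usage sketch only: treating a serialized tokenizer JSON string as a
 * PGNTokenizerConfig for type-safe access. Where the JSON string comes from is left
 * open here (the ambient declaration below is a placeholder), and loading a config
 * this way is an assumption, not part of this package's documented API.
 */
declare const rawTokenizerJson: string; // assumed: contents of a tokenizer config JSON file
const config = JSON.parse(rawTokenizerJson) as PGNTokenizerConfig;
console.log(`Vocabulary size: ${Object.keys(config.model.vocab).length}`);
console.log(`Added tokens: ${config.added_tokens.map((t) => t.content).join(", ")}`);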