@dvdagames/pgn-tokenizer
TypeScript version of PGN Tokenizer, a Byte Pair Encoding (BPE) tokenizer for chess Portable Game Notation (PGN).
TypeScript
/**
* Types for PGN Tokenizer Configuration
*/
/**
* Represents a token added to the tokenizer's vocabulary
*/
export interface AddedToken {
  /** Unique identifier for the token */
  id: number;
  /** The actual content of the token */
  content: string;
  /** Whether the token is a single word */
  single_word: boolean;
  /** Whether to strip whitespace from the left */
  lstrip: boolean;
  /** Whether to strip whitespace from the right */
  rstrip: boolean;
  /** Whether the token is normalized */
  normalized: boolean;
  /** Whether this is a special token */
  special: boolean;
}
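/**
 * Illustrative sketch only: what a single AddedToken entry could look like for a
 * special end-of-text marker. The id and content here are assumptions for
 * demonstration, not values taken from the published tokenizer configuration.
 */
const exampleAddedToken: AddedToken = {
  id: 0, // assumed id
  content: "<|endoftext|>", // hypothetical special-token content
  single_word: false,
  lstrip: false,
  rstrip: false,
  normalized: false,
  special: true,
};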
/**
* Configuration for the normalizer
*/
export interface Normalizer {
  /** Type of normalization to apply */
  type: string;
}
/**
* Configuration for the pre-tokenizer
*/
export interface PreTokenizer {
  /** Type of pre-tokenizer */
  type: string;
  /** Pattern configuration */
  pattern: {
    /** Regular expression pattern */
    Regex: string;
  };
  /** Behavior of the pre-tokenizer */
  behavior: string;
  /** Whether to invert the pattern matching */
  invert: boolean;
}
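/**
 * Illustrative sketch only: a pre-tokenizer configuration that splits input on a
 * regex. The pattern and behavior values shown are assumptions for demonstration,
 * not the pattern used by the published tokenizer.
 */
const examplePreTokenizer: PreTokenizer = {
  type: "Split",
  pattern: {
    Regex: "\\s+", // hypothetical: split on runs of whitespace
  },
  behavior: "Removed", // assumed behavior value
  invert: false,
};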
/**
 * Configuration for byte-level processing, used for both the post-processor and the decoder
 */
export interface PostProcessorConfig {
  /** Type of processor */
  type: string;
  /** Whether to add prefix space */
  add_prefix_space: boolean;
  /** Whether to trim offsets */
  trim_offsets: boolean;
  /** Whether to use regex */
  use_regex: boolean;
}
/**
* Configuration for the BPE model
*/
export interface ModelConfig {
  /** Type of the model */
  type: string;
  /** Dropout rate */
  dropout: number | null;
  /** Token to use for unknown tokens */
  unk_token: string;
  /** Prefix for continuing subwords */
  continuing_subword_prefix: string | null;
  /** Suffix for end of words */
  end_of_word_suffix: string | null;
  /** Whether to fuse unknown tokens */
  fuse_unk: boolean;
  /** Whether to use byte fallback */
  byte_fallback: boolean;
  /** Whether to ignore merges */
  ignore_merges: boolean;
  /** Vocabulary mapping */
  vocab: Record<string, number>;
  /** Merge rules */
  merges: string[][];
}
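/**
 * Illustrative sketch only: a minimal BPE model configuration with a tiny made-up
 * vocabulary and merge list. The trained tokenizer ships a far larger vocabulary
 * and merge set; the values below exist only to show the shape of the data.
 */
const exampleModel: ModelConfig = {
  type: "BPE",
  dropout: null,
  unk_token: "<|unk|>", // assumed unknown-token string
  continuing_subword_prefix: null,
  end_of_word_suffix: null,
  fuse_unk: false,
  byte_fallback: false,
  ignore_merges: false,
  vocab: { e: 0, "4": 1, e4: 2 }, // toy entries for illustration
  merges: [["e", "4"]], // merge "e" + "4" into "e4"
};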
/**
* Main configuration type for the PGN tokenizer
*/
export interface PGNTokenizerConfig {
  /** Version of the tokenizer configuration */
  version: string;
  /** Truncation configuration */
  truncation: null;
  /** Padding configuration */
  padding: null;
  /** List of added tokens */
  added_tokens: AddedToken[];
  /** Normalizer configuration */
  normalizer: Normalizer;
  /** Pre-tokenizer configuration */
  pre_tokenizer: PreTokenizer;
  /** Post-processor configuration */
  post_processor: PostProcessorConfig;
  /** Decoder configuration */
  decoder: PostProcessorConfig;
  /** Model configuration */
  model: ModelConfig;
}
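/**
 * Illustrative usage sketch only: treating a serialized tokenizer JSON string as a
 * PGNTokenizerConfig for type-safe access. Where the JSON string comes from is left
 * open here (the ambient declaration below is a placeholder), and loading a config
 * this way is an assumption, not part of this package's documented API.
 */
declare const rawTokenizerJson: string; // assumed: contents of a tokenizer config JSON file
const config = JSON.parse(rawTokenizerJson) as PGNTokenizerConfig;
console.log(`Vocabulary size: ${Object.keys(config.model.vocab).length}`);
console.log(`Added tokens: ${config.added_tokens.map((t) => t.content).join(", ")}`);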