@sutton-signwriting/core

A JavaScript package for Node and browsers that supports general processing of the Sutton SignWriting script

import { parse } from './fsw-parse';

/**
 * Default special tokens configuration
 * ```
 * DEFAULT_SPECIAL_TOKENS = [
 *   { index: 0, name: 'UNK', value: '[UNK]' },
 *   { index: 1, name: 'PAD', value: '[PAD]' },
 *   { index: 2, name: 'CLS', value: '[CLS]' },
 *   { index: 3, name: 'SEP', value: '[SEP]' }
 * ];
 * ```
 */
const DEFAULT_SPECIAL_TOKENS = [
  { index: 0, name: 'UNK', value: '[UNK]' },
  { index: 1, name: 'PAD', value: '[PAD]' },
  { index: 2, name: 'CLS', value: '[CLS]' },
  { index: 3, name: 'SEP', value: '[SEP]' }
];

/**
 * Generates an array of all possible tokens for the FSW tokenizer
 * @private
 * @function generateTokens
 * @returns {string[]} Array of all possible tokens
 */
const generateTokens = () => {
  const range = (start, end) => Array.from({ length: end - start }, (_, i) => start + i);
  const hexRange = (start, end) => range(start, end + 1).map(i => i.toString(16));

  const sequence = ["A"];
  const signbox = ["B", "L", "M", "R"];
  const nullToken = ["S000"];
  const baseSymbols = range(0x100, 0x38b + 1).map(i => `S${i.toString(16)}`);
  const rows = hexRange(0, 15).map(i => `r${i}`);
  const cols = hexRange(0, 5).map(i => `c${i}`);
  const positions = range(250, 750).map(i => `p${i}`);

  return [
    ...sequence,
    ...signbox,
    ...nullToken,
    ...baseSymbols,
    ...rows,
    ...cols,
    ...positions
  ];
};

/**
 * Creates mappings for special tokens
 * @private
 * @function createSpecialTokenMappings
 * @param {Array} specialTokens - Array of special token objects
 * @returns {Object} Special token mappings
 */
const createSpecialTokenMappings = (specialTokens) => {
  const byIndex = {};
  const byName = {};
  const byValue = {};
  const indices = new Set();

  specialTokens.forEach(token => {
    if (indices.has(token.index)) {
      throw new Error(`Duplicate token index: ${token.index}`);
    }
    indices.add(token.index);
    byIndex[token.index] = token;
    byName[token.name] = token;
    byValue[token.value] = token;
  });

  return {
    byIndex,
    byName,
    byValue,
    getByIndex: (index) => byIndex[index] || byIndex[specialTokens.find(t => t.name === 'UNK').index],
    getByName: (name) => byName[name] || byName['UNK'],
    getByValue: (value) => byValue[value] || byName['UNK'],
    getAllValues: () => specialTokens.map(t => t.value),
    getAllIndices: () => specialTokens.map(t => t.index)
  };
};

/**
 * Creates index-to-string and string-to-index mappings for tokens
 * @private
 * @function createTokenMappings
 * @param {string[]} tokens - Array of tokens to map
 * @param {Object} specialTokenMappings - Special tokens mapping object
 * @param {number} startingIndex - Starting index for regular tokens
 * @returns {Object} Object containing i2s and s2i mappings
 */
const createTokenMappings = (tokens, specialTokenMappings, startingIndex) => {
  const i2s = {};
  const s2i = {};

  // Add special tokens first
  Object.values(specialTokenMappings.byIndex).forEach(token => {
    i2s[token.index] = token.value;
    s2i[token.value] = token.index;
  });

  // Add regular tokens
  tokens.forEach((token, i) => {
    const index = startingIndex + i;
    i2s[index] = token;
    s2i[token] = index;
  });

  return { i2s, s2i };
};

/**
 * Tokenizes an FSW string into an array of tokens
 * @function fsw.tokenize
 * @param {string} fsw - FSW string to tokenize
 * @param {Object} options - Tokenization options
 * @param {boolean} [options.sequence=true] - Whether to include sequence tokens
 * @param {boolean} [options.signbox=true] - Whether to include signbox tokens
 * @param {string} [options.sep="[SEP]"] - Separator token
 * @returns {string[]} Array of tokens
 * @example
 * fsw.tokenize("AS10e00M507x515S10e00492x485",{sequence:false,sep:null})
 *
 * return [
 *  'M', 'p507', 'p515', 'S10e', 'c0', 'r0', 'p492', 'p485'
 * ]
 */
const tokenize = (fsw, { sequence = true, signbox = true, sep = "[SEP]" } = {}) => {
  const tokenizeSymbol = (symbol) => [
    symbol.slice(0, 4),
    `c${symbol.charAt(4)}`,
    `r${symbol.charAt(5)}`
  ];

  const tokenizeCoord = (coord) => coord.map(p => `p${p}`);

  const segments = parse.text(fsw).map(fswSegment => {
    if (/[BLMR]/.test(fswSegment)) {
      const sign = parse.sign(fswSegment);
      const tokens = [];

      if (sign.sequence && sequence) {
        tokens.push("A", ...sign.sequence.map(seqItem => tokenizeSymbol(seqItem)).flat());
      }

      if (signbox) {
        tokens.push(
          sign.box,
          ...tokenizeCoord(sign.max),
          ...sign.spatials.flatMap(symbol => [
            ...tokenizeSymbol(symbol.symbol),
            ...tokenizeCoord(symbol.coord)
          ])
        );
      }

      return sep ? [...tokens, sep] : tokens;
    } else {
      const parsed = parse.symbol(fswSegment);
      if (!signbox && !sequence) {
        return [];
      }
      let tokens = [];
      if (!signbox && sequence) {
        tokens = [
          "A",
          ...tokenizeSymbol(parsed.symbol)
        ];
      } else {
        tokens = [
          "M",
          ...tokenizeCoord(parsed.coord.map(c => 1000 - c)),
          ...tokenizeSymbol(parsed.symbol),
          ...tokenizeCoord(parsed.coord)
        ];
      }
      return tokens.length > 0 && sep ? [...tokens, sep] : tokens;
    }
  });

  return segments.flatMap(segment => segment);
};

/**
 * Converts an array of tokens back into an FSW string
 * @function fsw.detokenize
 * @param {string[]} tokens - Array of tokens to convert
 * @param {Array} specialTokens - Array of special token objects to filter out
 * @returns {string} FSW string
 * @example
 * fsw.detokenize(['M', 'p507', 'p515', 'S10e', 'c0', 'r0', 'p492', 'p485'])
 *
 * return "M507x515S10e00492x485"
 */
const detokenize = (tokens, specialTokens = DEFAULT_SPECIAL_TOKENS) => {
  const specialValues = new Set(specialTokens.map(t => t.value));
  return tokens
    .filter(t => !specialValues.has(t))
    .join(' ')
    .replace(/\bp(\d{3})\s+p(\d{3})/g, '$1x$2')
    .replace(/ c(\d)\d? r(.)/g, '$1$2')
    .replace(/ c(\d)\d?/g, '$10')
    .replace(/ r(.)/g, '0$1')
    .replace(/ /g, '')
    .replace(/(\d)([BLMR])/g, '$1 $2')
    .replace(/(\d)(AS)/g, '$1 $2')
    .replace(/(A(?:S00000|S[123][0-9a-f]{2}[0-5][0-9a-f])+)( )([BLMR])/g, '$1$3');
};

/**
 * Splits tokens into chunks of specified size while preserving sign boundaries
 * @function fsw.chunkTokens
 * @param {string[]} tokens - Array of tokens to chunk
 * @param {number} chunkSize - Maximum size of each chunk
 * @param {Object} options - Chunking options
 * @param {string} [options.cls="[CLS]"] - CLS token
 * @param {string} [options.sep="[SEP]"] - SEP token
 * @param {string} [options.pad="[PAD]"] - PAD token
 * @returns {string[][]} Array of token chunks
 */
const chunkTokens = (tokens, chunkSize, { cls = "[CLS]", sep = "[SEP]", pad = "[PAD]" } = {}) => {
  if (chunkSize < 60) {
    throw new Error('Chunk size must be at least 60 tokens to accommodate a typical sign');
  }

  const chunks = [];
  let currentChunk = [];
  let tokenIndex = 0;

  while (tokenIndex < tokens.length) {
    currentChunk = [cls];

    while (tokenIndex < tokens.length) {
      const token = tokens[tokenIndex];
      let lookAhead = tokenIndex;
      while (lookAhead < tokens.length && tokens[lookAhead] !== sep) {
        lookAhead++;
      }
      const signSize = lookAhead - tokenIndex + 1;

      if (currentChunk.length + signSize > chunkSize - 1) {
        break;
      }

      while (tokenIndex <= lookAhead) {
        currentChunk.push(tokens[tokenIndex]);
        tokenIndex++;
      }
    }

    while (currentChunk.length < chunkSize) {
      currentChunk.push(pad);
    }

    chunks.push(currentChunk);
  }

  return chunks;
};

/**
 * Creates a tokenizer object with encoding and decoding capabilities
 * @function fsw.createTokenizer
 * @param {Object} [specialTokens] - Special tokens mapping object
 * @param {number} [startingIndex] - Starting index for regular tokens
 * @returns {TokenizerObject} Tokenizer object
 * @example
 * const t = fsw.createTokenizer()
 *
 * t.encode('M507x515S10e00492x485')
 *
 * return [7, 941, 949, 24, 678, 662, 926, 919, 3]
 */
const createTokenizer = (
  specialTokens = DEFAULT_SPECIAL_TOKENS,
  startingIndex = null
) => {
  const specialTokenMappings = createSpecialTokenMappings(specialTokens);
  const calculatedStartingIndex = startingIndex ??
    (specialTokenMappings.getAllIndices().length > 0 ?
      Math.max(...specialTokenMappings.getAllIndices()) + 1 :
      0);

  const tokens = generateTokens();
  const { i2s, s2i } = createTokenMappings(tokens, specialTokenMappings, calculatedStartingIndex);

  const tokenizer = {
    i2s,
    s2i,
    specialTokens: specialTokenMappings,
    length: Object.keys(i2s).length,
    vocab: () => Object.values(i2s),
    encodeTokens: (tokens) => tokens.map(t => {
      return s2i[t] !== undefined ? s2i[t] : specialTokenMappings.getByValue(t).index;
    }),
    decodeTokens: (indices) => indices.map(i => i2s[i] || specialTokenMappings.getByName('UNK').value),
    encode: (text, options = {}) => {
      const tokens = tokenize(text, { ...options, sep: specialTokenMappings.getByName('SEP').value });
      return tokenizer.encodeTokens(tokens);
    },
    decode: (tokens) => {
      if (tokens.length === 0) return "";
      if (Array.isArray(tokens[0])) {
        const decodedChunks = tokens.map(chunk => tokenizer.decodeTokens(chunk));
        return detokenize(decodedChunks.flat(), specialTokens);
      }
      const decodedTokens = tokenizer.decodeTokens(tokens);
      return detokenize(decodedTokens, specialTokens);
    },
    chunk: (tokens, chunkSize) => chunkTokens(tokens, chunkSize, {
      cls: specialTokenMappings.getByName('CLS').value,
      sep: specialTokenMappings.getByName('SEP').value,
      pad: specialTokenMappings.getByName('PAD').value
    })
  };

  return tokenizer;
};

export { tokenize, detokenize, createTokenizer, chunkTokens };
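
A minimal usage sketch (not part of the source file above). The import path is an assumption, since the module's file name is not shown here; within @sutton-signwriting/core these functions are documented under the fsw namespace, so adjust the import to match how the package exposes them. The expected values are taken from the JSDoc examples above.

// Usage sketch; assumption: the import path below is hypothetical, adjust to your setup.
import { tokenize, detokenize, createTokenizer, chunkTokens } from './fsw-tokenize';

// String-token round trip: sign -> tokens -> sign.
const tokens = tokenize('M507x515S10e00492x485');
// -> ['M', 'p507', 'p515', 'S10e', 'c0', 'r0', 'p492', 'p485', '[SEP]']
detokenize(tokens);
// -> 'M507x515S10e00492x485'

// Fixed-size chunks framed with [CLS] and padded with [PAD]; chunkSize must be >= 60.
const chunks = chunkTokens(tokens, 64);
// -> one chunk of 64 entries: ['[CLS]', 'M', ..., '[SEP]', '[PAD]', ...]

// Index-level round trip via a tokenizer instance.
const t = createTokenizer();
const ids = t.encode('M507x515S10e00492x485');
// -> [7, 941, 949, 24, 678, 662, 926, 919, 3]
t.decode(ids);
// -> 'M507x515S10e00492x485'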