UNPKG

bpe-tokenizer-encoding

Version:

A simple JavaScript implementation of Byte Pair Encoding (BPE)

52 lines (41 loc) 1.19 kB
// index.js function getPairs(tokens) { const pairs = {}; for (let i = 0; i < tokens.length - 1; i++) { const pair = `${tokens[i]} ${tokens[i + 1]}`; pairs[pair] = (pairs[pair] || 0) + 1; } return pairs; } function mergeMostFrequent(tokens, pairToMerge) { const [first, second] = pairToMerge.split(" "); const newToken = first + second; const newTokens = []; let i = 0; while (i < tokens.length) { if (i < tokens.length - 1 && tokens[i] === first && tokens[i + 1] === second) { newTokens.push(newToken); i += 2; } else { newTokens.push(tokens[i]); i += 1; } } return newTokens; } function bytePairEncoding(text, numMerges = 10, verbose = false) { let tokens = text.split(""); for (let i = 0; i < numMerges; i++) { const pairs = getPairs(tokens); if (Object.keys(pairs).length === 0) break; const mostFrequent = Object.entries(pairs).sort((a, b) => b[1] - a[1])[0][0]; tokens = mergeMostFrequent(tokens, mostFrequent); if (verbose) { console.log(`Step ${i + 1}: Merged '${mostFrequent}' ->`, tokens.join(" ")); } } return tokens; } module.exports = { bytePairEncoding, };