bpe-tokenizer-encoding
Version:
A simple JavaScript implementation of Byte Pair Encoding (BPE)
52 lines (41 loc) • 1.19 kB
JavaScript
// index.js
/**
 * Tallies every adjacent pair of tokens in a sequence.
 *
 * @param {string[]} tokens - Token sequence to scan.
 * @returns {Object<string, number>} Plain object whose keys are the two
 *   neighbouring tokens joined by a single space, and whose values are the
 *   number of times that pair occurs. Keys appear in first-seen order.
 */
function getPairs(tokens) {
  const counts = {};
  // Walk with `idx` pointing at the right-hand member of each pair.
  let idx = 1;
  while (idx < tokens.length) {
    const key = `${tokens[idx - 1]} ${tokens[idx]}`;
    counts[key] = counts[key] ? counts[key] + 1 : 1;
    idx += 1;
  }
  return counts;
}
/**
 * Merges every non-overlapping occurrence of an adjacent token pair into a
 * single token (the concatenation of the two).
 *
 * The pair is identified by its key: the left and right tokens joined by a
 * single space. Each adjacent pair's key is rebuilt and compared against
 * `pairToMerge` rather than splitting `pairToMerge` on spaces, because a
 * split cannot recover the two halves when a token is itself a space or
 * contains one (e.g. the key for "a" followed by " " is "a  ").
 *
 * @param {string[]} tokens - Current token sequence; not modified.
 * @param {string} pairToMerge - Pair key in "left right" form.
 * @returns {string[]} New token sequence with matching pairs merged,
 *   scanning left to right.
 */
function mergeMostFrequent(tokens, pairToMerge) {
  const merged = [];
  let i = 0;
  while (i < tokens.length) {
    const hasNext = i < tokens.length - 1;
    if (hasNext && `${tokens[i]} ${tokens[i + 1]}` === pairToMerge) {
      merged.push(`${tokens[i]}${tokens[i + 1]}`);
      // Skip both halves so merged occurrences never overlap.
      i += 2;
    } else {
      merged.push(tokens[i]);
      i += 1;
    }
  }
  return merged;
}
/**
 * Runs a simple Byte Pair Encoding pass over a string: starting from
 * individual characters, repeatedly merges the most frequent adjacent pair
 * of tokens into one token.
 *
 * @param {string} text - Input text to tokenize.
 * @param {number} [numMerges=10] - Maximum number of merge steps to perform.
 * @param {boolean} [verbose=false] - When true, logs the merged pair and the
 *   resulting tokens after every step.
 * @returns {string[]} Token sequence after merging stops (either
 *   `numMerges` steps were run or no adjacent pairs remain).
 */
function bytePairEncoding(text, numMerges = 10, verbose = false) {
  // Iterate by code point: split("") works on UTF-16 code units and would
  // tear characters outside the BMP (e.g. emoji) into lone surrogates.
  let tokens = Array.from(text);
  for (let i = 0; i < numMerges; i++) {
    const pairs = getPairs(tokens);

    // Linear scan for the highest count. The strict `>` keeps the first pair
    // seen on ties, matching what a stable descending sort would pick.
    let mostFrequent;
    let bestCount = 0;
    for (const [pair, count] of Object.entries(pairs)) {
      if (count > bestCount) {
        mostFrequent = pair;
        bestCount = count;
      }
    }
    // No pairs at all means fewer than two tokens remain.
    if (mostFrequent === undefined) break;

    tokens = mergeMostFrequent(tokens, mostFrequent);
    if (verbose) {
      console.log(`Step ${i + 1}: Merged '${mostFrequent}' ->`, tokens.join(" "));
    }
  }
  return tokens;
}
// CommonJS export: expose the encoder entry point as this module's API.
module.exports = {
bytePairEncoding,
};