Tokenization explorer

Before a model sees any text, the text is split into tokens. This is a simplified byte-pair-style merge loop that shows the core idea behind real tokenizers, such as the ones GPT and Claude use.

# Sample sentence; splitting on whitespace means each word is tokenized
# on its own, so merges never span two words.
text = "the lowest of the low"
words = text.split()

# Initial state: every word is a list of its individual characters.
tokens = list(map(list, words))

def most_common_pair(tokens):
    """Return the most frequent adjacent token pair and its count.

    Args:
        tokens: A sequence of words, each word being a sequence of tokens.

    Returns:
        A ``((left, right), count)`` tuple for the pair seen most often, or
        ``None`` when no word contains at least two tokens. Pairs are only
        counted inside a word, never across two words. When several pairs
        share the top count, the one encountered first wins.
    """
    from collections import Counter

    # zip(word, word[1:]) yields each token together with its right-hand
    # neighbour; words shorter than two tokens contribute nothing.
    tally = Counter(
        neighbours
        for word in tokens
        for neighbours in zip(word, word[1:])
    )
    if not tally:
        return None
    (winner,) = tally.most_common(1)
    return winner

def merge(tokens, pair):
    """Replace every occurrence of an adjacent token pair with one token.

    Args:
        tokens: A sequence of words, each word being a sequence of string
            tokens.
        pair: A ``(left, right)`` tuple of tokens to fuse together.

    Returns:
        A new list of words (lists of tokens) in which each non-overlapping,
        left-to-right occurrence of ``pair`` is replaced by the concatenation
        of its two tokens. The input words are not modified.
    """
    fused = "".join(pair)

    def _merge_word(symbols):
        # Scan left to right; a match consumes two symbols, so overlapping
        # occurrences (e.g. "a a a" with pair ("a", "a")) merge only once.
        out = []
        last = len(symbols) - 1
        pos = 0
        while pos <= last:
            hit = pos < last and (symbols[pos], symbols[pos + 1]) == pair
            if hit:
                out.append(fused)
                pos += 2
            else:
                out.append(symbols[pos])
                pos += 1
        return out

    return [_merge_word(word) for word in tokens]

# Upper bound on the number of merge rounds to run. Each round fuses the
# currently most frequent adjacent pair into a single token.
NUM_MERGES = 4

print("Start:", tokens)
for step in range(NUM_MERGES):
    best = most_common_pair(tokens)
    # most_common_pair returns None once no word has two tokens left,
    # i.e. every word has collapsed into a single token.
    if best is None:
        break
    pair, count = best
    tokens = merge(tokens, pair)
    print(f"Step {step+1}: merged {pair} ({count}x) ->", tokens)