text = "Sam I Am"
tokens = tokenize_ascii(text)
tokens
assert ngrams(tokens, 1) == [
('Sam',),
('I',),
('Am',),
(PAD,)]
assert ngrams(tokens, 2) == [
(PAD, 'Sam'),
('Sam', 'I',),
('I', 'Am',),
('Am', PAD,)]
Passing in a custom padding token
pad = '_custom_pad_'
assert ngrams(tokens, 3, pad) == [
(pad, pad, 'Sam'),
(pad, 'Sam', 'I'),
('Sam', 'I', 'Am'),
('I', 'Am', pad)
]
Passing pad=None removes the padding
assert ngrams(tokens, 2, pad=None) == [
('Sam', 'I',),
('I', 'Am',),
]
Todo: Non-destructive tokenisation
from collections.abc import Generator
isinstance(range(0, 10), Generator)
from mlzero.data import *
corpus_wine = data_wine_reviews()['description']
vocab_wine = vocab_threshold(corpus_wine, tokenize_ascii, 20)
vocab_wine
vocab_wine.i2v[0]
vocab_wine.i2v[1]
vocab_wine.encode(PAD)
next(iter(vocab_wine))
vocab_wine.encode('afesgsf')
corpus_wine[0]
encoded = vocab_wine.encode(corpus_wine[0])
encoded
' '.join(vocab_wine.decode(encoded))
The basis of an n-gram language model is count and divide. The model needs to store a count for every sequence of n tokens (n-gram) that occurs in the text. These counts are then normalised per row over the last token:
$$ P\left(w_k \vert w_{k-n+1:k-1}\right) = \frac{C\left(w_{k-n+1:k-1}\,w_k\right)}{C\left(w_{k-n+1:k-1}\right)} $$
Note that the denominator is precisely the sum of the numerator over all last tokens $ w $ in the vocabulary $ V $:
$$ C\left(w_{k-n+1:k-1}\right) = \sum_{w \in V} C\left(w_{k-n+1:k-1}\,w\right) $$
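As a concrete sketch of count and divide on a toy bigram table (a plain dict here, not the library's own representation; the names are illustrative):
from collections import Counter, defaultdict
# Hypothetical toy bigram counts: (context token, next token) -> count
toy_counts = Counter({('a', 'b'): 3, ('a', 'c'): 1, ('b', 'a'): 2})
# Sum over the last token to get the context totals C(w_{k-1})
context_totals = defaultdict(int)
for gram, count in toy_counts.items():
    context_totals[gram[:-1]] += count
# Divide each count by its context total to get P(w_k | w_{k-1})
toy_probs = {gram: count / context_totals[gram[:-1]]
             for gram, count in toy_counts.items()}
assert toy_probs[('a', 'b')] == 0.75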
- For calculating a probability or perplexity (recalled just after this list) we need a way to fetch (the log of) $ P\left(w_k \vert w_{k-n+1:k-1}\right) $
- For generating a random sentence we need a way of fetching the full distribution over the next token for a given context
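For reference, the perplexity used below is assumed to be the standard per-token quantity (the exact normalisation over the padded sequence is the library's choice):
$$ \mathrm{PP}\left(w_{1:N}\right) = \exp\left(-\frac{1}{N} \sum_{k=1}^{N} \log P\left(w_k \vert w_{k-n+1:k-1}\right)\right) $$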
There are a number of ways we could represent the counts (a rough size comparison follows the list):
- A mapping from n tokens to a count (size is the number of distinct n-grams)
- A dense array of size |V|**n
- A sparse array
- A hybrid of the two: sparse over contexts with dense rows, like the SparseRowCubeTensor used below
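A rough size comparison of the first two options (a sketch only; the numbers depend on the vocabulary and corpus):
import numpy as np
V = len(vocab_wine)   # vocabulary size
n = 3
# A dense count array needs |V|**n cells whether or not an n-gram ever occurs
dense_bytes = V ** n * np.dtype(np.int64).itemsize
print(f"dense {n}-gram array: ~{dense_bytes / 1e9:.1f} GB")
# A mapping only needs one entry per distinct n-gram observed in the corpus,
# which for natural text is a tiny fraction of |V|**n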
counts = count_ngrams(2, vocab_wine, corpus_wine[:10000])
probs = ngram_counts_to_conditional_probability(counts)
probs[0, 230]
doc = corpus_wine[40000]
doc
tokens = vocab_wine.encode(doc)
tokens[:10]
bigrams = ngrams(tokens, 2, pad=PAD_IDX)
bigrams[:10]
probs[bigrams[0]]
[probs[gram] for gram in bigrams][:10]
product([probs[gram] for gram in bigrams])
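Multiplying many small probabilities underflows for longer documents; summing log-probabilities is the usual workaround (a sketch, not part of the library's API):
import numpy as np
log_prob = np.sum(np.log([probs[gram] for gram in bigrams]))
log_prob, np.exp(log_prob)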
import numpy as np
start = (PAD_IDX,)
n = 2
tokens = []
context = (PAD_IDX,) * (n - 1)   # start from a context of n-1 padding tokens
while True:
    # Conditional distribution over the next token for the current context
    weights = [probs[context + (x,)] for x in range(len(vocab_wine))]
    next_token = np.random.choice(len(weights), p=weights)
    # Sampling the padding token marks the end of the document
    if next_token == PAD_IDX:
        break
    tokens.append(next_token)
    # Slide the context window forward by one token
    context = context[1:] + (next_token,)
tokens[:10]
print(' '.join(vocab_wine.decode(tokens)))
wine_unilm = NaiveNgramLanguageModel(vocab_wine, 1, corpus_wine[:1000])
wine_unilm.top_k(10)
wine_unilm.probability("This is a rich wine.", pad=True)
for _ in range(5):
    print(' '.join(wine_unilm.generate()) + '\n')
wine_bilm = NaiveNgramLanguageModel(vocab_wine, 2, corpus_wine[:1000])
wine_bilm.top_k(10)
wine_bilm.top_k(10, [PAD])
wine_bilm.top_k(10, ['fresh'])
wine_bilm.top_k(10, ['This'])
wine_bilm.probability("This is a rich wine.", pad=False)
wine_bilm.probability("This is a rich wine.", pad=True)
wine_bilm.perplexity("This is a rich wine.")
for _ in range(5):
    print(' '.join(wine_bilm.generate()) + '\n')
wine_trilm = NaiveNgramLanguageModel(vocab_wine, 3, corpus_wine[:1000])
wine_trilm.top_k(10)
wine_trilm.top_k(10, ['This'])
for _ in range(5):
    print(' '.join(wine_trilm.generate()) + '\n')
wine_trilm.perplexity('A touch blossomy against a core of tobacco and a touch of juniper , lots of fresh pineapple , apricot , lemon drop and ginger brightened by crisp acidity .')
Check some examples against hand-calculated results
assert flatten_index([0,0,0], 7) == 0
assert unflatten_index(0, 7, 3) == [0,0,0]
We want blocks that share the same leading indices to be contiguous
assert flatten_index([0,0,1], 7) == 1
assert unflatten_index(1, 7, 3) == [0,0,1]
assert flatten_index([5,3,1], 7) == 267
assert unflatten_index(267, 7, 3) == [5,3,1]
assert flatten_index([1], 7) == 1
assert flatten_index([], 7) == 0
assert unflatten_index(0, 7, 0) == []
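A minimal reference implementation consistent with these asserts (the real flatten_index/unflatten_index may differ in details) treats the indices as digits of a base-size number, most significant first:
def flatten_index_ref(indices, size):
    # The first index is the most significant digit, so indices sharing a
    # prefix end up contiguous in the flattened range
    flat = 0
    for idx in indices:
        flat = flat * size + idx
    return flat

def unflatten_index_ref(flat, size, n):
    # Invert flatten_index_ref by repeatedly peeling off the last digit
    indices = []
    for _ in range(n):
        flat, idx = divmod(flat, size)
        indices.append(idx)
    return indices[::-1]

assert flatten_index_ref([5, 3, 1], 7) == 267
assert unflatten_index_ref(267, 7, 3) == [5, 3, 1]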
size = 7
n = 3
assert [unflatten_index(a, size, n) for a in range(size**n)[flatten_index_range([1,3], size, n)]] == [[1,3,x] for x in range(size)]
size = 7
n = 3
assert [unflatten_index(a, size, n) for a in range(size**n)[flatten_index_range([1], size, n)]] == [[1,x,y] for x in range(size) for y in range(size)]
size = 7
n = 3
assert [unflatten_index(a, size, n) for a in range(size**n)[flatten_index_range([6,2,5], size, n)]] == [[6,2,5]]
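Under that ordering, every prefix of indices maps to a single contiguous slice of the flat range, which is presumably what flatten_index_range returns (a sketch under that assumption):
def flatten_index_range_ref(prefix, size, n):
    # All full index tuples sharing `prefix` occupy one contiguous block of
    # size**(n - len(prefix)) flat positions
    start = 0
    for idx in prefix:
        start = start * size + idx
    block = size ** (n - len(prefix))
    start *= block
    return slice(start, start + block)

assert flatten_index_range_ref([1, 3], 7, 3) == slice(70, 77)
assert flatten_index_range_ref([6, 2, 5], 7, 3) == slice(313, 314)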
ts = SparseRowCubeTensor(wine_trilm.counts, size=len(wine_trilm.vocab), n_dimension=wine_trilm.n, dtype=int)
Getting the top 10 items gives the same result as brute-force sorting
assert ts.top_k(10) == dict(list(sorted(wine_trilm.counts.items(), key=lambda x: x[1], reverse=True))[:10])
Try normalising
t_norm = ts.normalize()
This should be the same as sum and divide
t_norm
assert abs(t_norm[0,0] - ts[0,0] / ts[0,0].sum()).max() < 1e-8
t_log = t_norm.transform(np.log)
assert abs(t_log[0,0,1] - np.log(t_norm[0,0,1])) < 1e-8
%time tri_naive = NaiveNgramLanguageModel(vocab_wine, 3, corpus_wine)
%time tri = NgramLanguageModel(vocab_wine, 3, corpus_wine)
assert tri_naive.top_k(10) == tri.top_k(10)
%time _ = tri_naive.top_k(100)
%time _ = tri.top_k(100)
Generation is much faster
%%time
for _ in range(5):
    print(' '.join(tri.generate()) + '\n')
%%time
for _ in range(5): tri_naive.generate()
sample_sentence = ' '.join(tri_naive.generate())
sample_sentence
assert abs(tri.perplexity(sample_sentence) - tri_naive.perplexity(sample_sentence)) < 1e-8
Computing perplexity this way is actually ~10x slower than the naive model!
%timeit -n 100 tri.perplexity(sample_sentence)
%timeit -n 100 tri_naive.perplexity(sample_sentence)