SET B - NOTEBOOK¶
In [1]:
Copied!
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
In [2]:
Copied!
# read in all the words
words = open('names.txt', 'r').read().splitlines()
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?
X, Y = [], []
for w in words:
#print(w)
context = [0] * block_size
for ch in w + '.':
ix = stoi[ch]
X.append(context)
Y.append(ix)
#print(''.join(itos[i] for i in context), '--->', itos[ix])
context = context[1:] + [ix] # crop and append
X = torch.tensor(X)
Y = torch.tensor(Y)
# read in all the words
words = open('names.txt', 'r').read().splitlines()
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?
X, Y = [], []
for w in words:
#print(w)
context = [0] * block_size
for ch in w + '.':
ix = stoi[ch]
X.append(context)
Y.append(ix)
#print(''.join(itos[i] for i in context), '--->', itos[ix])
context = context[1:] + [ix] # crop and append
X = torch.tensor(X)
Y = torch.tensor(Y)
In [3]:
Copied!
X.shape, X.dtype, Y.shape, Y.dtype
X.shape, X.dtype, Y.shape, Y.dtype
Out[3]:
(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)
In [4]:
Copied!
g = torch.Generator().manual_seed(2147483647) #For consistency ofcourse, to keep the same values as andrej
C = torch.randn((27,2), generator=g)
W1 = torch.rand((6, 100), generator=g)
b1 = torch.rand(100, generator=g)
W2 = torch.rand((100, 27), generator=g)
b2 = torch.rand(27, generator=g)
parameters = [C, W1, b1, W2, b2]
g = torch.Generator().manual_seed(2147483647) #For consistency ofcourse, to keep the same values as andrej
C = torch.randn((27,2), generator=g)
W1 = torch.rand((6, 100), generator=g)
b1 = torch.rand(100, generator=g)
W2 = torch.rand((100, 27), generator=g)
b2 = torch.rand(27, generator=g)
parameters = [C, W1, b1, W2, b2]
In [5]:
Copied!
emb = C[X]
h = torch.tanh(emb.view(-1,6) @ W1 + b1)
logits = h @ W2 + b2
# counts = logits.exp()
# prob = counts / counts.sum(1, keepdims=True)
# loss = - prob[torch.arange(32), Y].log().mean()
loss = F.cross_entropy(logits, Y)
loss
emb = C[X]
h = torch.tanh(emb.view(-1,6) @ W1 + b1)
logits = h @ W2 + b2
# counts = logits.exp()
# prob = counts / counts.sum(1, keepdims=True)
# loss = - prob[torch.arange(32), Y].log().mean()
loss = F.cross_entropy(logits, Y)
loss
Out[5]:
tensor(6.4365)
In [ ]:
Copied!
#Setting up the training of the Neural Net
#Setting up the training of the Neural Net
In [5]:
Copied!
for p in parameters:
p.requires_grad = True #Coz we know PyTorch asks for this parameter, as it is set to false by default
for p in parameters:
p.requires_grad = True #Coz we know PyTorch asks for this parameter, as it is set to false by default
In [ ]:
Copied!
for _ in range(10):
#forward pass
emb = C[X]
h = torch.tanh(emb.view(-1,6) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y)
print(loss)
#backward pass
for p in parameters:
p.grad = None
loss.backward()
#update
for p in parameters:
p.data += -0.1 * p.grad
# print(loss.item())
for _ in range(10):
#forward pass
emb = C[X]
h = torch.tanh(emb.view(-1,6) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y)
print(loss)
#backward pass
for p in parameters:
p.grad = None
loss.backward()
#update
for p in parameters:
p.data += -0.1 * p.grad
# print(loss.item())
tensor(5.9912, grad_fn=<NllLossBackward0>) tensor(4.9723, grad_fn=<NllLossBackward0>) tensor(4.6059, grad_fn=<NllLossBackward0>) tensor(4.3298, grad_fn=<NllLossBackward0>) tensor(4.1185, grad_fn=<NllLossBackward0>) tensor(3.9586, grad_fn=<NllLossBackward0>) tensor(3.8382, grad_fn=<NllLossBackward0>) tensor(3.7435, grad_fn=<NllLossBackward0>) tensor(3.6644, grad_fn=<NllLossBackward0>) tensor(3.5960, grad_fn=<NllLossBackward0>)
Adding mini-batches
In [10]:
Copied!
for _ in range(1000):
#Minibatch
xi = torch.randint(0, X.shape[0], (32,))
#forward pass
emb = C[X[xi]] #added for X
h = torch.tanh(emb.view(-1,6) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y[xi]) #added for Y
#print(loss.item())
#backward pass
for p in parameters:
p.grad = None
loss.backward()
#update
for p in parameters:
p.data += -0.1 * p.grad
print(loss.item())
for _ in range(1000):
#Minibatch
xi = torch.randint(0, X.shape[0], (32,))
#forward pass
emb = C[X[xi]] #added for X
h = torch.tanh(emb.view(-1,6) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y[xi]) #added for Y
#print(loss.item())
#backward pass
for p in parameters:
p.grad = None
loss.backward()
#update
for p in parameters:
p.data += -0.1 * p.grad
print(loss.item())
2.398618459701538
Finding a good learning rate
In [11]:
Copied!
X.shape, X.dtype, Y.shape, Y.dtype
X.shape, X.dtype, Y.shape, Y.dtype
Out[11]:
(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)
In [22]:
Copied!
#Everytime you wanna restart just run this to reset the parameters
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27,2), generator=g)
W1 = torch.rand((6, 100), generator=g)
b1 = torch.rand(100, generator=g)
W2 = torch.rand((100, 27), generator=g)
b2 = torch.rand(27, generator=g)
parameters = [C, W1, b1, W2, b2]
#Everytime you wanna restart just run this to reset the parameters
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27,2), generator=g)
W1 = torch.rand((6, 100), generator=g)
b1 = torch.rand(100, generator=g)
W2 = torch.rand((100, 27), generator=g)
b2 = torch.rand(27, generator=g)
parameters = [C, W1, b1, W2, b2]
In [23]:
Copied!
for p in parameters:
p.requires_grad = True
for p in parameters:
p.requires_grad = True
In [24]:
Copied!
lre = torch.linspace(-3, 0, 1000)
lrs = 10**lre
lre = torch.linspace(-3, 0, 1000)
lrs = 10**lre
In [15]:
Copied!
lri = []
lossi = []
for i in range(1000):
#Minibatch
xi = torch.randint(0, X.shape[0], (32,))
#forward pass
emb = C[X[xi]]
h = torch.tanh(emb.view(-1,6) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y[xi])
#print(loss.item())
#backward pass
for p in parameters:
p.grad = None
loss.backward()
#update
lr = lrs[i]
for p in parameters:
p.data += -0.1 * p.grad
#keeping track
lri.append(lr)
lossi.append(loss.item())
print(loss.item())
lri = []
lossi = []
for i in range(1000):
#Minibatch
xi = torch.randint(0, X.shape[0], (32,))
#forward pass
emb = C[X[xi]]
h = torch.tanh(emb.view(-1,6) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y[xi])
#print(loss.item())
#backward pass
for p in parameters:
p.grad = None
loss.backward()
#update
lr = lrs[i]
for p in parameters:
p.data += -0.1 * p.grad
#keeping track
lri.append(lr)
lossi.append(loss.item())
print(loss.item())
2.419145107269287
In [16]:
Copied!
plt.plot(lri, lossi)
plt.plot(lri, lossi)
Out[16]:
[<matplotlib.lines.Line2D at 0x225085e9b40>]
But we would like to see which exponent value is recommended to use, so we'll update the x-axis
In [20]:
Copied!
#Remember to reset the parameters and only then run this
lri = []
lossi = []
for i in range(1000):
#Minibatch
xi = torch.randint(0, X.shape[0], (32,))
#forward pass
emb = C[X[xi]]
h = torch.tanh(emb.view(-1,6) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y[xi])
#print(loss.item())
#backward pass
for p in parameters:
p.grad = None
loss.backward()
#update
lr = lrs[i]
for p in parameters:
p.data += -0.1 * p.grad
#keeping track
lri.append(lre[i]) #We are taking the exponent of the learning rate for the x-axis
lossi.append(loss.item())
print(loss.item())
#Remember to reset the parameters and only then run this
lri = []
lossi = []
for i in range(1000):
#Minibatch
xi = torch.randint(0, X.shape[0], (32,))
#forward pass
emb = C[X[xi]]
h = torch.tanh(emb.view(-1,6) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y[xi])
#print(loss.item())
#backward pass
for p in parameters:
p.grad = None
loss.backward()
#update
lr = lrs[i]
for p in parameters:
p.data += -0.1 * p.grad
#keeping track
lri.append(lre[i]) #We are taking the exponent of the learning rate for the x-axis
lossi.append(loss.item())
print(loss.item())
2.705171585083008
In [21]:
Copied!
plt.plot(lri, lossi)
plt.plot(lri, lossi)
Out[21]:
[<matplotlib.lines.Line2D at 0x22507dc3850>]
^Here exp of -1
is the closest to where the loss is less, so exponent of -1 is 0.1, which was the actual value we had considered anyway
Just to cross-check we'll directly plot that value and see
In [25]:
Copied!
#Remember to reset the parameters and only then run this
lri = []
lossi = []
for i in range(1000):
#Minibatch
xi = torch.randint(0, X.shape[0], (32,))
#forward pass
emb = C[X[xi]]
h = torch.tanh(emb.view(-1,6) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y[xi])
#print(loss.item())
#backward pass
for p in parameters:
p.grad = None
loss.backward()
#update
lr = lrs[i]
for p in parameters:
p.data += -0.1 * p.grad
#keeping track
lri.append(lrs[i]) #We are taking the exponent of the learning rate for the x-axis
lossi.append(loss.item())
print(loss.item())
#Remember to reset the parameters and only then run this
lri = []
lossi = []
for i in range(1000):
#Minibatch
xi = torch.randint(0, X.shape[0], (32,))
#forward pass
emb = C[X[xi]]
h = torch.tanh(emb.view(-1,6) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y[xi])
#print(loss.item())
#backward pass
for p in parameters:
p.grad = None
loss.backward()
#update
lr = lrs[i]
for p in parameters:
p.data += -0.1 * p.grad
#keeping track
lri.append(lrs[i]) #We are taking the exponent of the learning rate for the x-axis
lossi.append(loss.item())
print(loss.item())
2.7444117069244385
In [26]:
Copied!
plt.plot(lri, lossi)
plt.plot(lri, lossi)
Out[26]:
[<matplotlib.lines.Line2D at 0x22507e03700>]
Yeah 0.1
seems fair I guess lol