SET C - NOTEBOOK
In [1]:
words = open('names.txt', 'r').read().splitlines()
In [2]:
import torch
N = torch.zeros((27, 27), dtype=torch.int32)
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
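As a quick sanity check (a sketch, not part of the original notebook): stoi maps each character to an integer index, with '.' reserved for 0, and itos inverts that mapping.

# sketch: round-tripping characters through the two lookup tables
print(stoi['a'])        # 1 ('a' is the first letter; indices are offset by 1 to leave 0 for '.')
print(stoi['.'])        # 0
print(itos[stoi['e']])  # 'e'
print(len(stoi))        # 27 (26 letters + the '.' start/end token)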
In [3]:
P = N.float()
P /= P.sum(1, keepdim=True)
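A small aside on the keepdim=True above (a sketch on a random matrix; the real normalization assumes N has already been filled with bigram counts): summing over dim 1 with keepdim=True gives a (27, 1) column of row sums, which broadcasts against the (27, 27) matrix so every row ends up summing to 1.

# broadcasting sketch: (27, 27) / (27, 1) divides each row by its own sum
M = torch.rand((27, 27))
row_sums = M.sum(1, keepdim=True)  # shape (27, 1)
M = M / row_sums
print(M.sum(1))                    # every entry is ~1.0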
In [4]:
# Creating the training set of bigrams (x, y)
xs, ys = [], []
for word in words[:1]:
    chs = ['.'] + list(word) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        print(ch1, ch2)
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
. e
e m
m m
m a
a .
In [5]:
#Feeding these examples into a neural network
import torch.nn.functional as F
In [ ]:
#<=========OPTIMIZATION============>
In [6]:
xs
Out[6]:
tensor([ 0, 5, 13, 13, 1])
In [7]:
ys
Out[7]:
tensor([ 5, 13, 13, 1, 0])
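Before the forward pass, it may help to see what the one-hot encoding of these 5 input indices looks like (a sketch, not part of the original notebook):

# sketch: one-hot encoding turns the 5 integer indices into a (5, 27) float matrix,
# one row per example with a single 1.0 in the column given by the index
xenc = F.one_hot(xs, num_classes=27).float()
print(xenc.shape)  # torch.Size([5, 27])
print(xenc[0])     # row for index 0 ('.'): 1.0 in position 0, zeros elsewhere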
In [12]:
# randomly initialize 27 neurons' weights. each neuron receives 27 inputs
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True) # requires_grad=True so PyTorch tracks W for the backward pass (same idea as in micrograd)
In [13]:
#FORWARD PASS
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
logits = xenc @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
# negative log likelihood: for each of the 5 examples (torch.arange(5) indexes the rows),
# pick the probability assigned to the correct next character ys, take the log, average, and negate
loss = -probs[torch.arange(5), ys].log().mean()
In [ ]:
loss #This will be similar to the one we also calculated in the SUMMARY part of B-Main
Out[ ]:
tensor(3.7693)
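As an aside (a sketch, not part of the original notebook): the exp, normalize, index, log, mean chain above is exactly what F.cross_entropy computes from the raw logits, just more efficiently and in a numerically stable way, so the same value can be obtained in one call.

# sketch: cross_entropy(logits, targets) == softmax followed by negative log likelihood
loss_check = F.cross_entropy(logits, ys)
print(loss_check.item())  # matches the manually computed loss above (up to floating point)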
In [14]:
#BACKWARD PASS
W.grad = None # reset the gradient before the backward pass (None acts as zero here)
loss.backward()
In [15]:
W.grad.shape
Out[15]:
torch.Size([27, 27])
In [ ]:
W.grad
In [ ]:
#UPDATE
W.data += -0.1 * W.grad
In [ ]:
#JUST PUTTING THEM TOGETHER TO PERFORM GRADIENT DESCENT
In [ ]:
#ONLY RUN THIS THE FIRST TIME
# randomly initialize 27 neurons' weights. each neuron receives 27 inputs
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True) # requires_grad=True so PyTorch tracks W for the backward pass (same idea as in micrograd)
In [34]:
#FORWARD PASS
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
logits = xenc @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
# negative log likelihood: probability of the correct next character for each of the 5 examples, logged, averaged, negated
loss = -probs[torch.arange(5), ys].log().mean()
In [35]:
print(loss.item()) #CHECKING THE LOSS VALUE
3.6891887187957764
In [32]:
#BACKWARD PASS
W.grad = None # reset the gradient before the backward pass (None acts as zero here)
loss.backward()
In [33]:
#UPDATE
W.data += -0.1 * W.grad
Yay, that worked. Noice
PUTTING THEM ALL TOGETHER
In [36]:
# create the dataset
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initialize the 'network'
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)
number of examples: 228146
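A quick check on that number (a sketch, not part of the original notebook): each word of length n contributes n + 1 bigrams, one per adjacent pair after padding with the start/end '.' token, so the total should equal the sum of len(w) + 1 over all words.

# sketch: every word contributes len(w) + 1 bigrams once '.' is added at both ends
print(sum(len(w) + 1 for w in words))  # should match the number of examples printed above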
In [37]:
# gradient descent
for k in range(20):

    # forward pass
    xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
    logits = xenc @ W # predict log-counts
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean() # NLL plus a small regularization term that pulls W towards zero (analogous to count smoothing)
    print(loss.item())

    # backward pass
    W.grad = None # set to zero the gradient
    loss.backward()

    # update
    W.data += -50 * W.grad
3.7686190605163574
3.378804922103882
3.1610896587371826
3.0271859169006348
2.9344847202301025
2.867231607437134
2.816654920578003
2.777147054672241
2.7452545166015625
2.7188305854797363
2.6965057849884033
2.6773722171783447
2.6608052253723145
2.6463513374328613
2.633665084838867
2.622471332550049
2.6125471591949463
2.6037065982818604
2.595794439315796
2.5886802673339844
SO WE ALMOST ACHIEVED A VERY LOW LOSS VALUE, SIMILAR TO THE LOSS VALUE WE CALCULATED IN A-MAIN WHEN WE TYPED OUR OWN NAME AND SAW HOW THE MODEL PERFORMS.
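To compare with the count-based model on equal footing, here is a small check (a sketch, not part of the original notebook): evaluate the plain data NLL after training, without the 0.01*(W**2).mean() regularization term that is baked into the training loss above.

# sketch: data NLL alone, without the regularization term, after training
with torch.no_grad():
    logits = F.one_hot(xs, num_classes=27).float() @ W
    print(F.cross_entropy(logits, ys).item())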
Finally, drumroll: we are going to see how sampling from this model produces outputs (spoiler alert: they will look the same as what we got from the model we built manually, because... it is the same model, just built with a neural net).
In [38]:
# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

for i in range(5):

    out = []
    ix = 0
    while True:

        # ----------
        # BEFORE:
        # p = P[ix]
        # ----------
        # NOW:
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = xenc @ W # predict log-counts
        counts = logits.exp() # counts, equivalent to N
        p = counts / counts.sum(1, keepdims=True) # probabilities for next character
        # ----------

        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            break

    print(''.join(out))
juwjde.
janaqah.
pxzfby.
a.
nn.
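One last observation on the sampling loop above (a sketch, not part of the original notebook): because xenc is a one-hot row, xenc @ W simply selects row ix of W, and the exp-then-normalize step is a softmax, so the per-step distribution can equivalently be written as a softmax over a single row of W.

# sketch: the distribution used at each step of the loop is just softmax(W[ix])
ix = 0
p_loop = (F.one_hot(torch.tensor([ix]), num_classes=27).float() @ W).exp()
p_loop = p_loop / p_loop.sum(1, keepdims=True)
p_row = F.softmax(W[ix], dim=0)
print(torch.allclose(p_loop[0], p_row))  # True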