SET C - NOTEBOOK
In [1]:
words = open('names.txt', 'r').read().splitlines()
In [2]:
import torch
N = torch.zeros((27, 27), dtype=torch.int32)
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
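As a quick sanity check (a sketch, not part of the original notebook): stoi maps each character to an integer index, with '.' reserved for 0, and itos inverts that mapping.

# sketch: round-tripping characters through the two lookup tables
print(stoi['a'])        # 1 ('a' is the first letter; indices are offset by 1 to leave 0 for '.')
print(stoi['.'])        # 0
print(itos[stoi['e']])  # 'e'
print(len(stoi))        # 27 (26 letters + the '.' start/end token)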
In [3]:
P = N.float()
P /= P.sum(1, keepdim=True)
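A small aside on the keepdim=True above (a sketch on a random matrix; the real normalization assumes N has already been filled with bigram counts): summing over dim 1 with keepdim=True gives a (27, 1) column of row sums, which broadcasts against the (27, 27) matrix so every row ends up summing to 1.

# broadcasting sketch: (27, 27) / (27, 1) divides each row by its own sum
M = torch.rand((27, 27))
row_sums = M.sum(1, keepdim=True)  # shape (27, 1)
M = M / row_sums
print(M.sum(1))                    # every entry is ~1.0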
In [4]:
# Creating the training set of bigrams (x, y)
xs, ys = [], []
for word in words[:1]:
    chs = ['.'] + list(word) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        print(ch1, ch2)
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
. e
e m
m m
m a
a .
In [5]:
#Feeding these examples into a neural network
import torch.nn.functional as F
In [ ]:
#<=========OPTIMIZATION============>
In [6]:
xs
Out[6]:
tensor([ 0, 5, 13, 13, 1])
In [7]:
ys
Out[7]:
tensor([ 5, 13, 13, 1, 0])
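Before the forward pass, it may help to see what the one-hot encoding of these 5 input indices looks like (a sketch, not part of the original notebook):

# sketch: one-hot encoding turns the 5 integer indices into a (5, 27) float matrix,
# one row per example with a single 1.0 in the column given by the index
xenc = F.one_hot(xs, num_classes=27).float()
print(xenc.shape)  # torch.Size([5, 27])
print(xenc[0])     # row for index 0 ('.'): 1.0 in position 0, zeros elsewhere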
In [12]:
# randomly initialize 27 neurons' weights. each neuron receives 27 inputs
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True) # requires_grad=True so PyTorch tracks W for the backward pass (same idea as in micrograd)
In [13]:
#FORWARD PASS
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
logits = xenc @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
# negative log likelihood: for each of the 5 examples (torch.arange(5) indexes the rows),
# pick the probability assigned to the correct next character ys, take the log, average, and negate
loss = -probs[torch.arange(5), ys].log().mean()
In [ ]:
loss #This will be similar to the one we also calculated in the SUMMARY part of B-Main
Out[ ]:
tensor(3.7693)
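As an aside (a sketch, not part of the original notebook): the exp, normalize, index, log, mean chain above is exactly what F.cross_entropy computes from the raw logits, just more efficiently and in a numerically stable way, so the same value can be obtained in one call.

# sketch: cross_entropy(logits, targets) == softmax followed by negative log likelihood
loss_check = F.cross_entropy(logits, ys)
print(loss_check.item())  # matches the manually computed loss above (up to floating point)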
In [14]:
#BACKWARD PASS
W.grad = None # reset the gradient before the backward pass (None acts as zero here)
loss.backward()
In [15]:
W.grad.shape
Out[15]:
torch.Size([27, 27])
In [ ]:
W.grad
In [ ]:
#UPDATE
W.data += -0.1 * W.grad
In [ ]:
#JUST PUTTING THEM TOGETHER TO PERFORM GRADIENT DESCENT
In [ ]:
#ONLY RUN THIS THE FIRST TIME
# randomly initialize 27 neurons' weights. each neuron receives 27 inputs
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True) # requires_grad=True so PyTorch tracks W for the backward pass (same idea as in micrograd)
In [34]:
#FORWARD PASS
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
logits = xenc @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
# negative log likelihood: probability of the correct next character for each of the 5 examples, logged, averaged, negated
loss = -probs[torch.arange(5), ys].log().mean()
In [35]:
print(loss.item()) #CHECKING THE LOSS VALUE
3.6891887187957764
In [32]:
#BACKWARD PASS
W.grad = None # reset the gradient before the backward pass (None acts as zero here)
loss.backward()
In [33]:
#UPDATE
W.data += -0.1 * W.grad
Yay, that worked. Noice
PUTTING THEM ALL TOGETHER
In [36]:
# create the dataset
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initialize the 'network'
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)
number of examples: 228146
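A quick check on that number (a sketch, not part of the original notebook): each word of length n contributes n + 1 bigrams, one per adjacent pair after padding with the start/end '.' token, so the total should equal the sum of len(w) + 1 over all words.

# sketch: every word contributes len(w) + 1 bigrams once '.' is added at both ends
print(sum(len(w) + 1 for w in words))  # should match the number of examples printed above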
In [37]:
# gradient descent
for k in range(20):

    # forward pass
    xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
    logits = xenc @ W # predict log-counts
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean() # NLL plus a small regularization term that pulls W towards zero (analogous to count smoothing)
    print(loss.item())

    # backward pass
    W.grad = None # set to zero the gradient
    loss.backward()

    # update
    W.data += -50 * W.grad
3.7686190605163574
3.378804922103882
3.1610896587371826
3.0271859169006348
2.9344847202301025
2.867231607437134
2.816654920578003
2.777147054672241
2.7452545166015625
2.7188305854797363
2.6965057849884033
2.6773722171783447
2.6608052253723145
2.6463513374328613
2.633665084838867
2.622471332550049
2.6125471591949463
2.6037065982818604
2.595794439315796
2.5886802673339844
SO WE ALMOST ACHIEVED A VERY LOW LOSS VALUE, SIMILAR TO THE LOSS VALUE WE CALCULATED IN A-MAIN WHEN WE TYPED OUR OWN NAME AND SAW HOW THE MODEL PERFORMS.
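To compare with the count-based model on equal footing, here is a small check (a sketch, not part of the original notebook): evaluate the plain data NLL after training, without the 0.01*(W**2).mean() regularization term that is baked into the training loss above.

# sketch: data NLL alone, without the regularization term, after training
with torch.no_grad():
    logits = F.one_hot(xs, num_classes=27).float() @ W
    print(F.cross_entropy(logits, ys).item())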
Finally, drumroll: we are going to see how sampling from this model produces outputs (spoiler alert: they will look the same as what we got from the model we built manually, because... it is the same model, just built with a neural net).
In [38]:
# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

for i in range(5):

    out = []
    ix = 0
    while True:

        # ----------
        # BEFORE:
        # p = P[ix]
        # ----------
        # NOW:
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = xenc @ W # predict log-counts
        counts = logits.exp() # counts, equivalent to N
        p = counts / counts.sum(1, keepdims=True) # probabilities for next character
        # ----------

        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            break

    print(''.join(out))
juwjde.
janaqah.
pxzfby.
a.
nn.
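One last observation on the sampling loop above (a sketch, not part of the original notebook): because xenc is a one-hot row, xenc @ W simply selects row ix of W, and the exp-then-normalize step is a softmax, so the per-step distribution can equivalently be written as a softmax over a single row of W.

# sketch: the distribution used at each step of the loop is just softmax(W[ix])
ix = 0
p_loop = (F.one_hot(torch.tensor([ix]), num_classes=27).float() @ W).exp()
p_loop = p_loop / p_loop.sum(1, keepdims=True)
p_row = F.softmax(W[ix], dim=0)
print(torch.allclose(p_loop[0], p_row))  # True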