Manual Gradient Descent Optimization

In [1]:

Copied!





from graphviz import Digraph

def trace(root):
    #Builds a set of all nodes and edges in a graph
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges

def draw_dot(root):
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) #LR == Left to Right

    nodes, edges = trace(root)
    for n in nodes:
        uid = str(id(n))
        #For any value in the graph, create a rectangular ('record') node for it
        dot.node(name = uid, label = "{ %s | data %.4f | grad %.4f }" % ( n.label, n.data, n.grad), shape='record')
        if n._op:
            #If this value is a result of some operation, then create an op node for it
            dot.node(name = uid + n._op, label=n._op)
            #and connect this node to it
            dot.edge(uid + n._op, uid)

    for n1, n2 in edges:
        #Connect n1 to the node of n2
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)

    return dot
from graphviz import Digraph

def trace(root):
    #Builds a set of all nodes and edges in a graph
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges

def draw_dot(root):
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) #LR == Left to Right

    nodes, edges = trace(root)
    for n in nodes:
        uid = str(id(n))
        #For any value in the graph, create a rectangular ('record') node for it
        dot.node(name = uid, label = "{ %s | data %.4f | grad %.4f }" % ( n.label, n.data, n.grad), shape='record')
        if n._op:
            #If this value is a result of some operation, then create an op node for it
            dot.node(name = uid + n._op, label=n._op)
            #and connect this node to it
            dot.edge(uid + n._op, uid)

    for n1, n2 in edges:
        #Connect n1 to the node of n2
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)

    return dot

In [2]:

Copied!

import math
import math

In [3]:

Copied!





class Value:

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None #Its an empty function by default. This is what will do that gradient calculation at each of the operations.
        self._prev = set(_children)
        self._op = _op
        self.label = label


    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def backward():
          self.grad += 1.0 * out.grad
          other.grad += 1.0 * out.grad

        out._backward = backward
        return out

    def __radd__(self, other): #here
        return self + other

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def backward():
          self.grad += other.data * out.grad
          other.grad += self.data * out.grad
        out._backward = backward
        return out

    def __rmul__(self, other):   #other * self
        return self * other

    def __truediv__(self, other):  #self/other
        return self * other**-1

    def __neg__(self):
        return self * -1

    def __sub__(self, other):  #self - other
        return self + (-other)

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data ** other, (self, ), f"**{other}")

        def backward():
          self.grad += (other * (self.data ** (other - 1))) * out.grad

        out._backward = backward
        return out

    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self, ), 'tanh')

        def backward():
          self.grad += 1 - (t**2) * out.grad

        out._backward = backward
        return out

    def exp(self):
        x = self.data
        out = Value(math.exp(x), (self, ), 'exp')   #We merged t and out, into just out

        def backward():
            self.grad += out.data * out.grad

        out._backward = backward
        return out

    def backward(self):

      topo = []
      visited = set()
      def build_topo(v):
        if v not in visited:
          visited.add(v)
          for child in v._prev:
            build_topo(child)
          topo.append(v)

      build_topo(self)

      self.grad = 1.0
      for node in reversed(topo):
        node._backward()
class Value:

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None #Its an empty function by default. This is what will do that gradient calculation at each of the operations.
        self._prev = set(_children)
        self._op = _op
        self.label = label


    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def backward():
          self.grad += 1.0 * out.grad
          other.grad += 1.0 * out.grad

        out._backward = backward
        return out

    def __radd__(self, other): #here
        return self + other

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def backward():
          self.grad += other.data * out.grad
          other.grad += self.data * out.grad
        out._backward = backward
        return out

    def __rmul__(self, other):   #other * self
        return self * other

    def __truediv__(self, other):  #self/other
        return self * other**-1

    def __neg__(self):
        return self * -1

    def __sub__(self, other):  #self - other
        return self + (-other)

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data ** other, (self, ), f"**{other}")

        def backward():
          self.grad += (other * (self.data ** (other - 1))) * out.grad

        out._backward = backward
        return out

    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self, ), 'tanh')

        def backward():
          self.grad += 1 - (t**2) * out.grad

        out._backward = backward
        return out

    def exp(self):
        x = self.data
        out = Value(math.exp(x), (self, ), 'exp')   #We merged t and out, into just out

        def backward():
            self.grad += out.data * out.grad

        out._backward = backward
        return out

    def backward(self):

      topo = []
      visited = set()
      def build_topo(v):
        if v not in visited:
          visited.add(v)
          for child in v._prev:
            build_topo(child)
          topo.append(v)

      build_topo(self)

      self.grad = 1.0
      for node in reversed(topo):
        node._backward()

In [4]:

Copied!

import random
import random

In [5]:

Copied!





class Neuron:
    def __init__(self, nin):
        self.w = [Value(random.uniform(-1, 1)) for _ in range(nin)]
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        act = sum((wi * xi for wi, xi in zip(self.w, x)), self.b)
        out = act.tanh()
        return out

    def parameters(self):
        return self.w + [self.b]

class Layer:
    def __init__(self, nin, nout):
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        outs = [n(x) for n in self.neurons]
        return outs[0] if len(outs) == 1 else outs

    def parameters(self):
        return [p for n in self.neurons for p in n.parameters()]

        # Alternative way of writing the above return function:
        # parameters = []
        # for n in self.neurons:
        #   p = n.parameters()
        #   parameters.extend(p)

class MLP:
    def __init__(self, nin, nouts):
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i + 1]) for i in range(len(nouts))]

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        return [p for l in self.layers for p in l.parameters()]
class Neuron:
    def __init__(self, nin):
        self.w = [Value(random.uniform(-1, 1)) for _ in range(nin)]
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        act = sum((wi * xi for wi, xi in zip(self.w, x)), self.b)
        out = act.tanh()
        return out

    def parameters(self):
        return self.w + [self.b]

class Layer:
    def __init__(self, nin, nout):
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        outs = [n(x) for n in self.neurons]
        return outs[0] if len(outs) == 1 else outs

    def parameters(self):
        return [p for n in self.neurons for p in n.parameters()]

        # Alternative way of writing the above return function:
        # parameters = []
        # for n in self.neurons:
        #   p = n.parameters()
        #   parameters.extend(p)

class MLP:
    def __init__(self, nin, nouts):
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i + 1]) for i in range(len(nouts))]

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        return [p for l in self.layers for p in l.parameters()]

Now we are trying to slighly nudge the value in order to reduce the loss

So this essentially adds as an update function

In [12]:

Copied!

for p in n.parameters():
  p.data += -0.01 * p.grad #The negative sign is to convert any negative value to positive. Therefore increasing the value of the data, therefore decresing the loss
for p in n.parameters():
  p.data += -0.01 * p.grad #The negative sign is to convert any negative value to positive. Therefore increasing the value of the data, therefore decresing the loss

Now we follow three steps: Forward pass -> Backward pass -> Update

In [35]:

Copied!

x = [2.0, 3.0, -1.0]
n = MLP(3, [4, 4, 1])
n(x)
x = [2.0, 3.0, -1.0]
n = MLP(3, [4, 4, 1])
n(x)

Out[35]:

Value(data=0.33215137965743546)

In [36]:

Copied!





xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0]
]

ys = [1.0, -1.0, -1.0, 1.0] #output we want
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0]
]

ys = [1.0, -1.0, -1.0, 1.0] #output we want

In [54]:

Copied!





#forward pass
ypred = [n(x) for x in xs]
loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
loss
#forward pass
ypred = [n(x) for x in xs]
loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
loss

Out[54]:

Value(data=5.767047506521353)

In [55]:

Copied!

#backward pass
loss.backward()
#backward pass
loss.backward()

In [56]:

Copied!

#update
for p in n.parameters():
  p.data += -0.01 * p.grad
#update
for p in n.parameters():
  p.data += -0.01 * p.grad

In [57]:

Copied!

#check the prediction
ypred
#check the prediction
ypred

Out[57]:

[Value(data=-0.25151630590655727),
 Value(data=0.42164884655021817),
 Value(data=-0.09631033350969018),
 Value(data=-0.16748189979649136)]

Putting the entire process together in a single function

In [58]:

Copied!





#Initialize the neural net
x = [2.0, 3.0, -1.0]
n = MLP(3, [4, 4, 1])
n(x)
#Initialize the neural net
x = [2.0, 3.0, -1.0]
n = MLP(3, [4, 4, 1])
n(x)

Out[58]:

Value(data=0.9135198339971514)

In [59]:

Copied!





#Data definition
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0]
]

ys = [1.0, -1.0, -1.0, 1.0] #output we want
#Data definition
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0]
]

ys = [1.0, -1.0, -1.0, 1.0] #output we want

In [92]:

Copied!





for k in range(10):

  #forward pass
  ypred = [n(x) for x in xs]
  loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))

  #backward pass
  for p in n.parameters():
    p.grad = 0.0 #This is because after one round of update, we need to reset the value of the grads so that it can calculate and store the grad value of the updated loss function (i.e. The loss value that was improved after gradient descent). If we don't do this, the previous value of grad gets increamented with the new value during each back propagation (each time backward is called)
  loss.backward()

  #update
  #THIS HERE, WHAT WE ARE DOING IS 'GRADIENT DESCENT'. WE ARE NUDGING THE INPUT VALUES BY A LITTLE BIT
  for p in n.parameters():
    p.data += -0.04 * p.grad

  print(k, loss.data) #Printing the current number/iteration number plus how much loss

for k in range(10):

  #forward pass
  ypred = [n(x) for x in xs]
  loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))

  #backward pass
  for p in n.parameters():
    p.grad = 0.0 #This is because after one round of update, we need to reset the value of the grads so that it can calculate and store the grad value of the updated loss function (i.e. The loss value that was improved after gradient descent). If we don't do this, the previous value of grad gets increamented with the new value during each back propagation (each time backward is called)
  loss.backward()

  #update
  #THIS HERE, WHAT WE ARE DOING IS 'GRADIENT DESCENT'. WE ARE NUDGING THE INPUT VALUES BY A LITTLE BIT
  for p in n.parameters():
    p.data += -0.04 * p.grad

  print(k, loss.data) #Printing the current number/iteration number plus how much loss

0 7.6021312440956095
1 8.0
2 6.398187062451399
3 7.999999999997639
4 8.0
5 7.999964084143684
6 8.0
7 8.0
8 7.999999961266539
9 8.0

In [93]:

Copied!

ypred
ypred

Out[93]:

[Value(data=-1.0), Value(data=-1.0), Value(data=-1.0), Value(data=-1.0)]

In [94]:

Copied!

loss
loss

Out[94]:

Value(data=8.0)

If the loss was reduced, then you can n.parameters to see what were the values into the NN that caused to get the desired target outputs

Okay so the predicted output didn't exactly come as expected 🥲 (The first and last value weren't supposed to be negative lol)

But that was the idea of how we train a neural net!