Neural Network From Scratch#

Dataset#

[ ]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_classification
[30]:
n_features, n_samples = 10, 1000
x, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_classes=2,
    n_clusters_per_class=1,
    random_state=0,
)
X = x.T
Y = y.reshape(-1, 1).T
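
A quick sanity check of the layout (a minimal sketch, not part of the original cells): after the transposes, X holds one sample per column and Y is a row vector of labels.

[ ]:
# Assumed convention: features along rows, samples along columns.
print(X.shape, Y.shape)  # expected: (10, 1000) (1, 1000)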

Metrics#

Cross-Entropy Loss#

\begin{align*}
\text{Loss Function } L(y, \hat{y}) &= - [\, y \times \log{\hat{y}} + (1 - y) \times \log{(1 - \hat{y})} \,] \\\\
\text{Cost Function } J(y, \hat{y}) &= - \frac{1}{m} \sum_{i=1}^{m} [\, y_i \times \log{\hat{y_i}} + (1 - y_i) \times \log{(1 - \hat{y_i})} \,]
\end{align*}

[31]:
def loss(y, y_hat):
    return -((y * np.log(y_hat)) + ((1 - y) * np.log(1 - y_hat))).mean()
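
A quick illustration with toy values (not from the notebook): the cost is small when the predicted probabilities agree with the labels and grows as they diverge.

[ ]:
# Illustrative values only
y_true = np.array([[1, 0, 1]])
print(loss(y_true, np.array([[0.9, 0.1, 0.8]])))  # roughly 0.14
print(loss(y_true, np.array([[0.1, 0.9, 0.2]])))  # roughly 2.07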

Accuracy#

\begin{align*}
\text{Accuracy} = \frac{1}{m} \sum_{i=1}^{m} I(y_i = \hat{y_i})
\end{align*}

[32]:
def accuracy(y, y_hat, threshold=0.5):
    return np.int32(np.int32(y_hat > threshold) == y).sum() / y.shape[1]
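
A small check with illustrative values: the probabilities are thresholded at 0.5 before being compared with the labels.

[ ]:
# Illustrative values only
y_true = np.array([[1, 0, 1, 0]])
y_prob = np.array([[0.7, 0.2, 0.4, 0.9]])
print(accuracy(y_true, y_prob))  # 2 of 4 correct -> 0.5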

Activation Function#

\begin{align*}
\text{Sigmoid: } \sigma{(x)} &= \frac{1}{1 + e^{-x}} \\\\
\text{Derivative of Sigmoid: } \frac{\delta \sigma{(x)}}{\delta x} &= \frac{-1}{(1 + e^{-x})^2} \cdot (- e^{-x}) \\
&= \frac{1}{1 + e^{-x}} \cdot \frac{e^{-x}}{1 + e^{-x}} \\
&= \frac{1}{1 + e^{-x}} \cdot \frac{1 + e^{-x} - 1}{1 + e^{-x}} \\
\frac{\delta \sigma{(x)}}{\delta x} &= \sigma(x) \cdot (1 - \sigma(x))
\end{align*}

[33]:

def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def derivative_sigmoid(x):
    return sigmoid(x) * (1 - sigmoid(x))
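
The closed-form derivative can be sanity-checked numerically; this is a small illustrative check (not part of the original notebook) using a central finite difference.

[ ]:
# Central finite difference of sigmoid compared with derivative_sigmoid
xs = np.linspace(-5, 5, 11)
eps = 1e-6
numeric = (sigmoid(xs + eps) - sigmoid(xs - eps)) / (2 * eps)
print(np.allclose(numeric, derivative_sigmoid(xs)))  # expected: True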

Architecture#

                                                                        w2      b2
        X ---------+                                                A   |       |  A
                |             FORWARD PASS   >>>>              dw2  |   V       V  | db2
                V                                                   ^              ^
        w1 ------->>>   z1 = w1.X + b1 ----> a1 = sigmoid(z1) ---->  z2 = w2.a1 + b2  ----> a2 = sigmoid(z2) ---> L (a2, y)
        <+--dw1--   A
                |     <+--dz1--               <+--da1--               <+--dz2--               <+--da2--
        b1 ---------+
        <+--db1--                                                    <<<< BACKWARD PASS

                        |                                          |
LAYER 0 (INPUT)         |              LAYER 1                     |                    LAYER 2 (OUTPUT)
                        |                                          |

\begin{align*}
X &= (n \times m) \\
Y &= (1 \times m) \\
w1 &= (n \times l1) = dw1 \\
b1 &= (l1 \times 1) = db1 \\
w2 &= (l1 \times l2) = dw2 \\
b2 &= (l2 \times 1) = db2
\end{align*}

Each gradient (\(dw1\), \(db1\), \(dw2\), \(db2\)) has the same shape as the parameter it updates.

Forward Pass#

                                                                        w2      b2
        X ---------+                                                A   |       |  A
                |             FORWARD PASS   >>>>              dw2  |   V       V  | db2
                V                                                   ^              ^
        w1 ------->>>   z1 = w1.X + b1 ----> a1 = sigmoid(z1) ---->  z2 = w2.a1 + b2  ----> a2 = sigmoid(z2) ---> L (a2, y)
        <+--dw1--   A
                |     <+--dz1--               <+--da1--               <+--dz2--               <+--da2--
        b1 ---------+
        <+--db1--                                                    <<<< BACKWARD PASS

                        |                                          |
LAYER 0 (INPUT)         |              LAYER 1                     |                    LAYER 2 (OUTPUT)
                        |                                          |

\begin{align*}
z1 &= w1^T \cdot X + b1 \\
(l1 \times m) &= (l1 \times n)(n \times m) + (l1 \times 1) \\\\
a1 &= \sigma{(z1)} \qquad (l1 \times m) \\\\
z2 &= w2^T \cdot a1 + b2 \\
(l2 \times m) &= (l2 \times l1)(l1 \times m) + (l2 \times 1) \\\\
a2 &= \sigma{(z2)} \qquad (l2 \times m)
\end{align*}

(The bias terms are broadcast across the \(m\) columns.)

[34]:

def forward(X, w1, b1, w2, b2):
    z1 = (w1.T @ X) + b1
    a1 = sigmoid(z1)
    z2 = (w2.T @ a1) + b2
    a2 = sigmoid(z2)
    return a2, a1, z2, z1
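
A quick shape check of the forward pass (an illustrative sketch; the hidden layer size of 10 matches the training cell below, and the _demo names are only for this check):

[ ]:
# Randomly initialized parameters with the shapes from the architecture above
l1_demo = 10
w1_demo = np.random.rand(n_features, l1_demo) * 0.01
b1_demo = np.random.rand(l1_demo, 1) * 0.01
w2_demo = np.random.rand(l1_demo, 1) * 0.01
b2_demo = np.random.rand(1, 1) * 0.01
a2_demo, a1_demo, _, _ = forward(X, w1_demo, b1_demo, w2_demo, b2_demo)
print(a1_demo.shape, a2_demo.shape)  # expected: (10, 1000) (1, 1000)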

Backward Pass#

                                                                        w2      b2
        X ---------+                                                A   |       |  A
                |             FORWARD PASS   >>>>              dw2  |   V       V  | db2
                V                                                   ^              ^
        w1 ------->>>   z1 = w1.X + b1 ----> a1 = sigmoid(z1) ---->  z2 = w2.a1 + b2  ----> a2 = sigmoid(z2) ---> L (a2, y)
        <+--dw1--   A
                |     <+--dz1--               <+--da1--               <+--dz2--               <+--da2--
        b1 ---------+
        <+--db1--                                                    <<<< BACKWARD PASS

                        |                                          |
LAYER 0 (INPUT)         |              LAYER 1                     |                    LAYER 2 (OUTPUT)
                        |                                          |

\begin{align*}
\text{Loss with the final layer output: } L(y, a2) &= - [\, y \cdot \log{a2} + (1 - y) \cdot \log{(1 - a2)} \,] \\\\
da2 = \frac{\delta L}{\delta a2} &= - \frac{y}{a2} + \frac{1 - y}{1 - a2} \\\\
dz2 = \frac{\delta L}{\delta z2} &= \frac{\delta L}{\delta a2} \cdot \frac{\delta a2}{\delta z2} = da2 \cdot \frac{\delta a2}{\delta z2} = -\left( \frac{y}{a2} - \frac{1 - y}{1 - a2} \right) \cdot a2 (1 - a2) = a2 - y \\\\
dw2 = \frac{\delta L}{\delta w2} &= \frac{\delta L}{\delta z2} \cdot \frac{\delta z2}{\delta w2} = \frac{1}{m} \, a1 \cdot dz2^T \\\\
db2 = \frac{\delta L}{\delta b2} &= \frac{\delta L}{\delta z2} \cdot \frac{\delta z2}{\delta b2} = \frac{1}{m} \sum_{i=1}^{m} dz2_i \\\\
da1 = \frac{\delta L}{\delta a1} &= \frac{\delta L}{\delta z2} \cdot \frac{\delta z2}{\delta a1} = w2 \cdot (a2 - y) = w2 \cdot dz2 \\\\
dz1 = \frac{\delta L}{\delta z1} &= \frac{\delta L}{\delta a1} \cdot \frac{\delta a1}{\delta z1} = da1 \cdot a1 (1 - a1) \\\\
dw1 = \frac{\delta L}{\delta w1} &= \frac{\delta L}{\delta z1} \cdot \frac{\delta z1}{\delta w1} = \frac{1}{m} \, X \cdot dz1^T = \frac{1}{m} \sum_{i=1}^{m} x_i \cdot dz1_i \\\\
db1 = \frac{\delta L}{\delta b1} &= \frac{\delta L}{\delta z1} \cdot \frac{\delta z1}{\delta b1} = \frac{1}{m} \sum_{i=1}^{m} dz1_i
\end{align*}

[35]:

def backward(X, Y, a1, a2, w1, b1, w2, b2):
    m = X.shape[1]  # number of samples
    da2 = -(Y / a2) + ((1 - Y) / (1 - a2))
    dz2 = da2 * a2 * (1 - a2)   # sigmoid'(z2) = a2 * (1 - a2)
    dw2 = (a1 @ dz2.T) / m
    db2 = np.mean(dz2, axis=1, keepdims=True)
    da1 = w2 @ dz2
    dz1 = da1 * a1 * (1 - a1)   # sigmoid'(z1) = a1 * (1 - a1)
    dw1 = (X @ dz1.T) / m
    db1 = np.mean(dz1, axis=1, keepdims=True)
    return da2, da1, dz2, dz1, dw2, dw1, db2, db1
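
The derivation can be verified with a numerical gradient check (an illustrative sketch, not part of the original notebook): the analytic dw2 returned by backward should agree with a central finite difference of the cost with respect to a single entry of w2.

[ ]:
# Gradient check for one entry of w2 (the _c names are only for this check)
l1_c = 10
rng = np.random.default_rng(0)
w1_c = rng.random((n_features, l1_c)) * 0.01
b1_c = rng.random((l1_c, 1)) * 0.01
w2_c = rng.random((l1_c, 1)) * 0.01
b2_c = rng.random((1, 1)) * 0.01
a2_c, a1_c, _, _ = forward(X, w1_c, b1_c, w2_c, b2_c)
_, _, _, _, dw2_c, _, _, _ = backward(X, Y, a1_c, a2_c, w1_c, b1_c, w2_c, b2_c)
# Central finite difference of the cost with respect to w2[0, 0]
eps = 1e-5
w2_plus, w2_minus = w2_c.copy(), w2_c.copy()
w2_plus[0, 0] += eps
w2_minus[0, 0] -= eps
numeric = (loss(Y, forward(X, w1_c, b1_c, w2_plus, b2_c)[0])
           - loss(Y, forward(X, w1_c, b1_c, w2_minus, b2_c)[0])) / (2 * eps)
print(np.isclose(dw2_c[0, 0], numeric))  # expected: True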

Gradient Descent Algorithm#

  1. Randomly initialize the weights and biases.

  2. For n epochs:

     1. Complete a forward pass and calculate \(\hat{y}\).

     2. Complete a backward pass and calculate the derivatives \(dw2\), \(db2\), \(dw1\) and \(db1\).

     3. Update the weights and biases using the derivatives and the learning rate \(\alpha\):

        \(w2 := w2 - \alpha \cdot dw2\)

        \(b2 := b2 - \alpha \cdot db2\)

        \(w1 := w1 - \alpha \cdot dw1\)

        \(b1 := b1 - \alpha \cdot db1\)
[36]:

n_layer_1_nodes = 10
n_final_output = 1
# Small random initialization of the weights and biases
w1 = np.random.rand(n_features, n_layer_1_nodes) * 0.01
b1 = np.random.rand(n_layer_1_nodes, 1) * 0.01
w2 = np.random.rand(n_layer_1_nodes, n_final_output) * 0.01
b2 = np.random.rand(n_final_output, 1) * 0.01
loss_history = []
accuracy_history = []
lr = 1.0  # learning rate (the gradients are averaged over the m samples)
for epoch in range(1000):
    a2, a1, z2, z1 = forward(X, w1, b1, w2, b2)
    da2, da1, dz2, dz1, dw2, dw1, db2, db1 = backward(X, Y, a1, a2, w1, b1, w2, b2)
    # Gradient descent update
    w2 = w2 - (lr * dw2)
    b2 = b2 - (lr * db2)
    w1 = w1 - (lr * dw1)
    b1 = b1 - (lr * db1)
    # Record the metrics every 50 epochs
    if epoch % 50 == 0:
        prediction, _, _, _ = forward(X, w1, b1, w2, b2)
        loss_val = loss(Y, prediction)
        accuracy_val = accuracy(Y, prediction)
        loss_history.append(loss_val)
        accuracy_history.append(accuracy_val)
fig, ax = plt.subplots(1, 2, figsize=(8, 4))
ax[0].plot(accuracy_history, "o-")
ax[0].set_title("Accuracy")
ax[1].plot(loss_history, "o-")
ax[1].set_title("Loss")
fig.tight_layout()
fig.show(warn=False)

[Figure: accuracy (left) and loss (right) recorded every 50 epochs of training]
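
As a final usage sketch (not in the original notebook), the trained parameters can be reused through forward to score samples, thresholding the probabilities at 0.5 as in the accuracy metric above.

[ ]:
# Probabilities and class labels for the training set from the trained parameters
probabilities, _, _, _ = forward(X, w1, b1, w2, b2)
predicted_labels = np.int32(probabilities > 0.5)
print(predicted_labels.shape)      # (1, 1000)
print(accuracy(Y, probabilities))  # final training accuracy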