Logistic Regression From Scratch#

Dataset#

[ ]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_classification

[30]:
n_features, n_samples = 10, 1000

x, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_classes=2,
    n_clusters_per_class=1,
    random_state=0,
)
X = x.T                 # (n_features, n_samples): each column is one sample
Y = y.reshape(-1, 1).T  # (1, n_samples)
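
The features are transposed so that each column of X is one sample, matching the column-vector convention used in the derivations below. As an optional sanity check, the class balance of the generated target can be inspected:

[ ]:
# Optional check: make_classification should give a roughly balanced binary target here.
print(np.bincount(y))  # counts of class 0 and class 1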

Metrics#

Cross-Entropy Loss#

\begin{align*} \text{Loss Function }L(y, \hat{y}) = - [ y \times \log{\hat{y}} + (1 - y) \times \log{(1 - \hat{y})}] \\\\ \text{Cost Function }J(y, \hat{y}) = - \frac{1}{m} \sum_{i=0}^{m}[ y_i \times \log{\hat{y_i}} + (1 - y_i) \times \log{(1 - \hat{y_i})}] \end{align*}

[31]:
def loss(y, y_hat):
    return -((y * np.log(y_hat)) + ((1 - y) * np.log(1 - y_hat))).mean()
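
As an optional sanity check, the implementation can be compared against scikit-learn's log_loss on a few hand-picked probabilities (y_true and y_prob below are illustrative values, not part of the dataset); the two results should agree:

[ ]:
from sklearn.metrics import log_loss

y_true = np.array([[1, 0, 1, 0]])
y_prob = np.array([[0.9, 0.2, 0.6, 0.4]])

print(loss(y_true, y_prob))                      # this notebook's implementation
print(log_loss(y_true.ravel(), y_prob.ravel()))  # scikit-learn reference, same value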

Accuracy#

\begin{align*} \text{Accuracy } = \frac{1}{m} \sum_{i=0}^{m} I(y_i == \hat{y_i}) \end{align*}

[32]:

def accuracy(y, y_hat, threshold=0.5):
    return np.int32(np.int32(y_hat > threshold) == y).sum() / y.shape[1]
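
A tiny illustrative usage with the default threshold of 0.5 (the arrays below are made-up values): three of the four thresholded predictions match the labels, so the accuracy is 0.75.

[ ]:
y_true = np.array([[1, 0, 1, 0]])
y_prob = np.array([[0.9, 0.2, 0.4, 0.1]])

print(accuracy(y_true, y_prob))  # 0.75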

Activation Function#

\begin{align*} \text{Sigmoid } = \sigma{(x)} &= \frac{1}{1 + e^{-x}}\\ \\ \text{Derivative of Sigmoid } = \frac{\delta \sigma{(x)}}{\delta x} &= \frac{-1}{(1 + e^{-x})^2} . {- e^{-x}}\\ &= \frac{1}{1 + e^{-x}} . \frac{e^{-x}}{1 + e^{-x}}\\ &= \frac{1}{1 + e^{-x}} . \frac{1+e^{-x} - 1}{1 + e^{-x}}\\ \frac{\delta \sigma{(x)}}{\delta x} &= \sigma(x) . (1 - \sigma(x)) \end{align*}

[33]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def derivative_sigmoid(x):
    return sigmoid(x) * (1 - sigmoid(x))
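
The closed-form derivative can be verified numerically with a central finite difference (an optional check; x_test and eps are illustrative values):

[ ]:
# Compare derivative_sigmoid against a central finite-difference approximation.
x_test = np.linspace(-5, 5, 11)
eps = 1e-6

numerical = (sigmoid(x_test + eps) - sigmoid(x_test - eps)) / (2 * eps)
print(np.allclose(numerical, derivative_sigmoid(x_test), atol=1e-6))  # True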

Architecture#

X ----------+
            |               FORWARD PASS >>>>
            V
w --------+>    z = w.X + b ----> a = sigmoid(z) ----> L(a, y)
<--dw--     A
            |     <+--dz--           <+--da--
b ----------+               <<<<< BACKWARD PASS
<--db--

|        LAYER 0 (INPUT)        |        LAYER 1 (OUTPUT)        |

\begin{align*} X &: (n \times m ) \\ Y, z &: (1 \times m ) \\ w, dw &: (n \times 1 ) \\ b, db &: (1 \times 1 ) \end{align*}

[34]:
X.shape, Y.shape

[34]:
((10, 1000), (1, 1000))

Forward Pass#

X ----------+
            |               FORWARD PASS >>>>
            V
w --------+>    z = w.X + b ----> a = sigmoid(z) ----> L(a, y)
<--dw--     A
            |     <+--dz--           <+--da--
b ----------+               <<<<< BACKWARD PASS
<--db--

\begin{align*} z &= w^T . X + b \\ (1 \times m) &= (1 \times n) . (n \times m) + (1 \times 1) \\\\ a &= \sigma{(z)} \\ (1 \times m) &= (1 \times m) \end{align*}

[35]:
def forward(X, w, b):

    z = (w.T @ X) + b
    a = sigmoid(z)

    return a
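
A quick shape check with throwaway parameters (w_tmp and b_tmp below are placeholders for illustration only; the real parameters are initialized in the training cell further down):

[ ]:
# (1 x n) @ (n x m) + (1 x 1) -> (1 x m)
w_tmp = np.random.rand(n_features, 1) * 0.01
b_tmp = np.zeros((1, 1))

print(forward(X, w_tmp, b_tmp).shape)  # (1, 1000)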

Backward Pass#

X ----------+
            |               FORWARD PASS >>>>
            V
w --------+>    z = w.X + b ----> a = sigmoid(z) ----> L(a, y)
<--dw--     A
            |     <+--dz--           <+--da--
b ----------+               <<<<< BACKWARD PASS
<--db--

\begin{align*} L(y, a) &= - [ y \times \log{a} + (1 - y) \times \log{(1 - a)}] \\\\ da = \frac{\delta L}{\delta a} &= - \frac{y}{a} + \frac{1 - y}{1 - a} \\ (1 \times m) \\\\ dz = \frac{\delta L}{\delta z} &= \frac{\delta L}{\delta a} . \frac{\delta a}{\delta z} = da . \frac{\delta a}{\delta z} = -( \frac{y}{a} - \frac{1 - y}{1 - a} ) . (a(1-a)) = a - y \\ (1 \times m) \\\\ dw = \frac{\delta L}{\delta w} &= \frac{\delta L}{\delta z} . \frac{\delta z}{\delta w} = \frac{1}{m} X . dz^T = \frac{1}{m} \sum_{i=0}^{m} x_i . dz_i \\ (n \times 1) \\\\ db = \frac{\delta L}{\delta b} &= \frac{\delta L}{\delta z} . \frac{\delta z}{\delta b} = \frac{1}{m} \sum_{i=0}^{m} dz_i \\ (1 \times 1) \end{align*}

[36]:
def backward(X, Y, a):
    m = Y.shape[1]

    da = -(Y / a) + ((1 - Y) / (1 - a))
    dz = da * (a * (1 - a))  # da/dz = a(1 - a) since a = sigmoid(z); simplifies to dz = a - Y

    dw = (X @ dz.T) / m  # average the gradient over all m samples
    db = np.mean(dz, axis=1, keepdims=True)

    return dw, db
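
The analytical gradient can be verified against a finite difference of the cost (an optional check; w_chk, b_chk, and eps are throwaway illustrative values, and only the first weight is perturbed):

[ ]:
# Gradient check: perturb one weight and compare with the analytical dw.
w_chk = np.random.rand(n_features, 1) * 0.01
b_chk = np.zeros((1, 1))
eps = 1e-6

a_chk = forward(X, w_chk, b_chk)
dw_chk, db_chk = backward(X, Y, a_chk)

w_plus, w_minus = w_chk.copy(), w_chk.copy()
w_plus[0, 0] += eps
w_minus[0, 0] -= eps

numerical_dw0 = (loss(Y, forward(X, w_plus, b_chk)) - loss(Y, forward(X, w_minus, b_chk))) / (2 * eps)
print(np.isclose(dw_chk[0, 0], numerical_dw0, atol=1e-6))  # True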

Gradient Descent Algorithm#

  1. Initialize the weights randomly with shape \(( n_{features} \times 1 )\) and the bias with shape \((1 \times 1)\)

  2. For n epochs:

     1. Complete a forward pass and calculate \(\hat{y}\)

     2. Complete a backward pass and calculate the derivatives \(dw\) and \(db\)

     3. Update the weights and bias using the derivatives and the learning rate \(\alpha\):

        \(w := w - \alpha \cdot dw\)

        \(b := b - \alpha \cdot db\)

[37]:

w = np.random.rand(n_features, 1) * 0.01
b = np.random.rand(1, 1) * 0.01

loss_history = []
accuracy_history = []

lr = 0.001

for epoch in range(100):
    y_hat = a = forward(X, w, b)
    dw, db = backward(X, Y, a)

    w = w - (lr * dw)
    b = b - (lr * db)

    if epoch % 10 == 0:
        prediction = forward(X, w, b)
        loss_val = loss(Y, prediction)
        accuracy_val = accuracy(Y, prediction)
        loss_history.append(loss_val)
        accuracy_history.append(accuracy_val)

fig, ax = plt.subplots(1, 2, figsize=(8, 4))
ax[0].plot(accuracy_history, "o-")
ax[0].set_title("Accuracy")
ax[1].plot(loss_history, "o-")
ax[1].set_title("Loss")
fig.tight_layout()
fig.show(warn=False)
[Output figure: "Accuracy" (left panel) and "Loss" (right panel) recorded every 10 epochs]
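
As a rough baseline, scikit-learn's LogisticRegression can be fit on the same data (an optional comparison; there is no train/test split, so this only indicates whether the from-scratch model lands in a sensible accuracy range):

[ ]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression().fit(x, y)

print(clf.score(x, y))                # scikit-learn training accuracy
print(accuracy(Y, forward(X, w, b)))  # from-scratch training accuracy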