From 5a00ca63fc0f3ae507af7611d3bfcbf65e26e6e4 Mon Sep 17 00:00:00 2001
From: Shashank
Date: Tue, 15 Oct 2024 11:37:28 +0530
Subject: [PATCH] Refactor LSTM network implementation and improve code
 readability

---
 neural_network/lstm.py | 198 ++++++++++++++++++++++++-----------------
 1 file changed, 115 insertions(+), 83 deletions(-)

diff --git a/neural_network/lstm.py b/neural_network/lstm.py
index 5c7a1387c..ae834cdbe 100644
--- a/neural_network/lstm.py
+++ b/neural_network/lstm.py
@@ -7,42 +7,46 @@ Detail: Total 3 layers neural network
 * Output layer
 Author: Shashank Tyagi
 Github: LEVII007
-link : https://www.kaggle.com/code/navjindervirdee/lstm-neural-network-from-scratch
+Date: 2024-10-15
 """

-##### Explanation #####
-# This script implements a Long Short-Term Memory (LSTM) network to learn
-# and predict sequences of characters.
+##### Explanation #####
+# This script implements a Long Short-Term Memory (LSTM) network
+# to learn and predict sequences of characters.
 # It uses numpy for numerical operations and tqdm for progress visualization.

-# The data is a paragraph about LSTM, converted to lowercase and split into
-# characters. Each character is one-hot encoded for training.
+# The data is a paragraph about LSTM, converted to lowercase
+# and split into characters.
+# Each character is one-hot encoded for training.

-# The LSTM class initializes weights and biases for the forget, input, candidate,
-# and output gates. It also initializes weights and biases for the final output layer.
+# The LSTM class initializes weights and biases for the forget,
+# input, candidate, and output gates.
+# It also initializes weights and biases for the final output layer.

-# The forward method performs forward propagation through the LSTM network,
-# computing hidden and cell states. It uses sigmoid and tanh activation
-# functions for the gates and cell states.
+# The forward method performs forward propagation through the LSTM
+# network, computing hidden and cell states.
+# It uses sigmoid and tanh activation functions for the gates and cell states.

-# The backward method performs backpropagation through time, computing gradients
-# for the weights and biases. It updates the weights and biases using
-# the computed gradients and the learning rate.
+# The backward method performs backpropagation through time,
+# computing gradients for the weights and biases.
+# It updates the weights and biases using the computed gradients
+# and the learning rate.

-# The train method trains the LSTM network on the input data for a specified
-# number of epochs. It uses one-hot encoded inputs and computes errors
-# using the softmax function.
+# The train method trains the LSTM network on the input data
+# for a specified number of epochs.
+# It uses one-hot encoded inputs and computes errors
+# using the softmax function.

-# The test method evaluates the trained LSTM network on the input data,
-# computing accuracy based on predictions.
+# The test method evaluates the trained LSTM network on the input data,
+# computing accuracy based on predictions.

-# The script initializes the LSTM network with specified hyperparameters
-# and trains it on the input data. Finally, it tests the trained network
-# and prints the accuracy of the predictions.
+# The script initializes the LSTM network with specified hyperparameters
+# and trains it on the input data.
+# Finally, it tests the trained network and prints the accuracy of the predictions.

 ##### Imports #####
-from tqdm import tqdm
 import numpy as np
+from tqdm import tqdm


 class LSTM:
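
##### Aside: one-hot encoding, illustrated #####
# The explanation block above says each character is one-hot encoded before
# training. A minimal standalone sketch of that encoding, assuming the
# (char_size, 1) column-vector convention the module uses; the toy string
# "hello" and the names below are illustrative, not part of the file:

import numpy as np

chars = sorted(set("hello"))  # ['e', 'h', 'l', 'o']
char_to_idx = {c: i for i, c in enumerate(chars)}


def one_hot_encode(char: str) -> np.ndarray:
    """Return a (char_size, 1) column vector with a 1 at the char's index."""
    vec = np.zeros((len(chars), 1))
    vec[char_to_idx[char]] = 1
    return vec


print(one_hot_encode("l").ravel())  # [0. 0. 1. 0.]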
@@ -68,7 +72,7 @@ class LSTM:
         print(f"Data size: {self.data_size}, Char Size: {self.char_size}")

         self.char_to_idx = {c: i for i, c in enumerate(self.chars)}
-        self.idx_to_char = {i: c for i, c in enumerate(self.chars)}
+        self.idx_to_char = dict(enumerate(self.chars))

         self.train_X, self.train_y = self.data[:-1], self.data[1:]

@@ -90,30 +94,42 @@ class LSTM:
         """
         Initialize the weights and biases for the LSTM network.
         """
-        self.wf = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim)
+        rng = np.random.default_rng()
+        self.wf = self.init_weights(
+            self.char_size + self.hidden_dim, self.hidden_dim, rng
+        )
         self.bf = np.zeros((self.hidden_dim, 1))

-        self.wi = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim)
+        self.wi = self.init_weights(
+            self.char_size + self.hidden_dim, self.hidden_dim, rng
+        )
         self.bi = np.zeros((self.hidden_dim, 1))

-        self.wc = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim)
+        self.wc = self.init_weights(
+            self.char_size + self.hidden_dim, self.hidden_dim, rng
+        )
         self.bc = np.zeros((self.hidden_dim, 1))

-        self.wo = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim)
+        self.wo = self.init_weights(
+            self.char_size + self.hidden_dim, self.hidden_dim, rng
+        )
         self.bo = np.zeros((self.hidden_dim, 1))

-        self.wy = self.init_weights(self.hidden_dim, self.char_size)
+        self.wy = self.init_weights(self.hidden_dim, self.char_size, rng)
         self.by = np.zeros((self.char_size, 1))

-    def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray:
+    def init_weights(
+        self, input_dim: int, output_dim: int, rng: np.random.Generator
+    ) -> np.ndarray:
         """
         Initialize weights with random values.

         :param input_dim: The input dimension.
         :param output_dim: The output dimension.
+        :param rng: The random number generator.
         :return: A matrix of initialized weights.
         """
-        return np.random.uniform(-1, 1, (output_dim, input_dim)) * np.sqrt(
+        return rng.uniform(-1, 1, (output_dim, input_dim)) * np.sqrt(
             6 / (input_dim + output_dim)
         )
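
##### Aside: the weight initialization, illustrated #####
# init_weights above scales uniform(-1, 1) samples by sqrt(6 / (input_dim +
# output_dim)), which is Xavier/Glorot-style uniform initialization. A
# standalone sketch of the same recipe; the dimensions and seed below are
# made up for illustration:

import numpy as np

rng = np.random.default_rng(seed=0)  # fixed seed only for reproducibility
fan_in, fan_out = 100, 25  # e.g. char_size + hidden_dim -> hidden_dim
bound = np.sqrt(6 / (fan_in + fan_out))
w = rng.uniform(-1, 1, (fan_out, fan_in)) * bound

print(w.shape)  # (25, 100)
print(bool(np.all(np.abs(w) <= bound)))  # True: every entry lies in [-bound, bound]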
""" - for epoch in tqdm(range(self.epochs)): - inputs = [self.one_hot_encode(char) for char in self.train_X] - targets = [self.one_hot_encode(char) for char in self.train_y] + inputs = [self.one_hot_encode(char) for char in self.train_X] - # Forward pass - outputs = self.forward(inputs) + for _ in tqdm(range(self.epochs)): + predictions = self.forward(inputs) - # Compute error at each time step - errors = [output - target for output, target in zip(outputs, targets)] + errors = [] + for t in range(len(predictions)): + errors.append(-self.softmax(predictions[t])) + errors[-1][self.char_to_idx[self.train_y[t]]] += 1 - # Backward pass and weight updates - self.backward(errors, inputs) - - def predict(self, inputs: list) -> str: - """ - Predict the next character in the sequence. - - :param inputs: The input data as a list of one-hot encoded vectors. - :return: The predicted character. - """ - output = self.forward(inputs)[-1] - return self.idx_to_char[np.argmax(self.softmax(output))] + self.backward(errors, self.concat_inputs) def test(self) -> None: """ - Test the LSTM network on the input data and compute accuracy. + Test the trained LSTM network on the input data and print the accuracy. """ - inputs = [self.one_hot_encode(char) for char in self.train_X] - correct_predictions = sum( - self.idx_to_char[np.argmax(self.softmax(output))] == target - for output, target in zip(self.forward(inputs), self.train_y) + accuracy = 0 + probabilities = self.forward( + [self.one_hot_encode(char) for char in self.train_X] ) - accuracy = (correct_predictions / len(self.train_y)) * 100 - print(f"Accuracy: {accuracy:.2f}%") + output = "" + for t in range(len(self.train_y)): + prediction = self.idx_to_char[ + np.random.choice( + range(self.char_size), p=self.softmax(probabilities[t].reshape(-1)) + ) + ] + + output += prediction + + if prediction == self.train_y[t]: + accuracy += 1 + + print(f"Ground Truth:\n{self.train_y}\n") + print(f"Predictions:\n{output}\n") + + print(f"Accuracy: {round(accuracy * 100 / len(self.train_X), 2)}%") if __name__ == "__main__": - # Define the input data and hyperparameters - data = "LSTM Neural Networks are designed to handle sequences of data.This is just rantom test data" - # hidden_dim = 50 - # epochs = 1000 - # lr = 0.01 + data = """Long Short-Term Memory (LSTM) networks are a type + of recurrent neural network (RNN) capable of learning " + "order dependence in sequence prediction problems. + This behavior is required in complex problem domains like " + "machine translation, speech recognition, and more. + iter and Schmidhuber in 1997, and were refined and " + "popularized by many people in following work.""" - # # Initialize and train the LSTM network - # lstm = LSTM(data, hidden_dim, epochs, lr) - # lstm.train() + lstm = LSTM(data=data, hidden_dim=25, epochs=1000, lr=0.05) - # # Test the LSTM network and compute accuracy - # lstm.test() + ##### Training ##### + lstm.train() + + ##### Testing ##### + lstm.test() + +# testing can be done by uncommenting the above lines of code.