"""
Name - - LSTM - Long Short-Term Memory Network For Sequence Prediction
Goal - - Predict sequences of data
Detail: Total 3 layers neural network
* Input layer
* LSTM layer
* Output layer
Author: Shashank Tyagi
Github: LEVII007
link : https://www.kaggle.com/code/navjindervirdee/lstm-neural-network-from-scratch
"""
##### Explanation #####
# This script implements a Long Short-Term Memory (LSTM) network that learns
# to predict the next character in a sequence.
# It uses numpy for numerical operations and tqdm for progress visualization.
# The training data is a short text, converted to lowercase and split into
# characters; each character is one-hot encoded for training.
# The LSTM class initializes weights and biases for the forget, input,
# candidate, and output gates, as well as for the final output layer.
# The forward method propagates the inputs through the network, computing the
# hidden and cell states at every time step. It uses sigmoid activations for
# the gates and tanh activations for the candidate and cell states.
# The backward method performs backpropagation through time (BPTT), computing
# gradients for all weights and biases and updating them with the learning rate.
# The train method trains the network for the specified number of epochs,
# deriving the per-step error from the softmax of the output logits.
# The test method evaluates the trained network on the training data and
# prints the prediction accuracy.
# Finally, the __main__ block builds the network with the chosen
# hyperparameters, trains it, and reports the accuracy of its predictions.
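# For reference, these are the gate equations the forward method implements,
# with z_t = [h_{t-1}, x_t] the hidden state concatenated with the input:
#     f_t = sigmoid(W_f . z_t + b_f)      (forget gate)
#     i_t = sigmoid(W_i . z_t + b_i)      (input gate)
#     c~_t = tanh(W_c . z_t + b_c)        (candidate cell state)
#     o_t = sigmoid(W_o . z_t + b_o)      (output gate)
#     c_t = f_t * c_{t-1} + i_t * c~_t    (cell state)
#     h_t = o_t * tanh(c_t)               (hidden state)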
##### Imports #####
import numpy as np
from tqdm import tqdm
class LSTM:
def __init__(
self, data: str, hidden_dim: int = 25, epochs: int = 1000, lr: float = 0.05
) -> None:
"""
Initialize the LSTM network with the given data and hyperparameters.
:param data: The input data as a string.
:param hidden_dim: The number of hidden units in the LSTM layer.
:param epochs: The number of training epochs.
:param lr: The learning rate.
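
        Example (illustrative):

        >>> lstm = LSTM("abab", hidden_dim=10, epochs=5, lr=0.01)
        Data size: 4, Char Size: 2
        >>> lstm.train_X, lstm.train_y
        ('aba', 'bab')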
"""
self.data = data.lower()
self.hidden_dim = hidden_dim
self.epochs = epochs
self.lr = lr
self.chars = set(self.data)
self.data_size, self.char_size = len(self.data), len(self.chars)
print(f"Data size: {self.data_size}, Char Size: {self.char_size}")
self.char_to_idx = {c: i for i, c in enumerate(self.chars)}
self.idx_to_char = {i: c for i, c in enumerate(self.chars)}
self.train_X, self.train_y = self.data[:-1], self.data[1:]
self.initialize_weights()
##### Helper Functions #####
def one_hot_encode(self, char: str) -> np.ndarray:
"""
One-hot encode a character.
:param char: The character to encode.
:return: A one-hot encoded vector.
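
        Example (a single-character vocabulary keeps the index deterministic):

        >>> lstm = LSTM("aaa")
        Data size: 3, Char Size: 1
        >>> lstm.one_hot_encode("a")
        array([[1.]])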
"""
vector = np.zeros((self.char_size, 1))
vector[self.char_to_idx[char]] = 1
return vector
def initialize_weights(self) -> None:
"""
Initialize the weights and biases for the LSTM network.
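
        Example (checking shapes only, since the values are random):

        >>> lstm = LSTM("aaa", hidden_dim=2)
        Data size: 3, Char Size: 1
        >>> lstm.wf.shape
        (2, 3)
        >>> lstm.by.shape
        (1, 1)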
"""
self.wf = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim)
self.bf = np.zeros((self.hidden_dim, 1))
self.wi = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim)
self.bi = np.zeros((self.hidden_dim, 1))
self.wc = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim)
self.bc = np.zeros((self.hidden_dim, 1))
self.wo = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim)
self.bo = np.zeros((self.hidden_dim, 1))
self.wy = self.init_weights(self.hidden_dim, self.char_size)
self.by = np.zeros((self.char_size, 1))
def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray:
"""
Initialize weights with random values.
:param input_dim: The input dimension.
:param output_dim: The output dimension.
:return: A matrix of initialized weights.
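
        Example (the matrix is output_dim x input_dim; values are random):

        >>> lstm = LSTM("aaa")
        Data size: 3, Char Size: 1
        >>> lstm.init_weights(4, 3).shape
        (3, 4)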
"""
return np.random.uniform(-1, 1, (output_dim, input_dim)) * np.sqrt(
6 / (input_dim + output_dim)
)
##### Activation Functions #####
def sigmoid(self, x: np.ndarray, derivative: bool = False) -> np.ndarray:
"""
Sigmoid activation function.
:param x: The input array.
        :param derivative: If True, ``x`` is assumed to already be a sigmoid
            output and its derivative is returned.
:return: The sigmoid activation or its derivative.
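
        Example:

        >>> lstm = LSTM("aaa")
        Data size: 3, Char Size: 1
        >>> lstm.sigmoid(np.array([[0.0]]))
        array([[0.5]])
        >>> lstm.sigmoid(np.array([[0.5]]), derivative=True)
        array([[0.25]])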
"""
if derivative:
return x * (1 - x)
return 1 / (1 + np.exp(-x))
def tanh(self, x: np.ndarray, derivative: bool = False) -> np.ndarray:
"""
Tanh activation function.
:param x: The input array.
        :param derivative: If True, ``x`` is assumed to already be a tanh
            output and its derivative is returned.
:return: The tanh activation or its derivative.
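
        Example:

        >>> lstm = LSTM("aaa")
        Data size: 3, Char Size: 1
        >>> lstm.tanh(np.array([[0.0]]))
        array([[0.]])
        >>> lstm.tanh(np.array([[0.5]]), derivative=True)
        array([[0.75]])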
"""
if derivative:
return 1 - x**2
return np.tanh(x)
def softmax(self, x: np.ndarray) -> np.ndarray:
"""
Softmax activation function.
:param x: The input array.
:return: The softmax activation.
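
        Example (equal logits share the probability mass):

        >>> lstm = LSTM("aaa")
        Data size: 3, Char Size: 1
        >>> lstm.softmax(np.array([[0.0], [0.0]]))
        array([[0.5],
               [0.5]])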
"""
exp_x = np.exp(x - np.max(x))
return exp_x / exp_x.sum(axis=0)
##### LSTM Network Methods #####
def reset(self) -> None:
"""
Reset the LSTM network states.
"""
self.concat_inputs = {}
self.hidden_states = {-1: np.zeros((self.hidden_dim, 1))}
self.cell_states = {-1: np.zeros((self.hidden_dim, 1))}
self.activation_outputs = {}
self.candidate_gates = {}
self.output_gates = {}
self.forget_gates = {}
self.input_gates = {}
def forward(self, inputs: list) -> list:
"""
Perform forward propagation through the LSTM network.
:param inputs: The input data as a list of one-hot encoded vectors.
:return: The outputs of the network.
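
        Example (checking shapes only, since the weights are random):

        >>> lstm = LSTM("aaa")
        Data size: 3, Char Size: 1
        >>> outputs = lstm.forward([lstm.one_hot_encode("a")])
        >>> len(outputs), outputs[0].shape
        (1, (1, 1))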
"""
self.reset()
outputs = []
for t in range(len(inputs)):
self.concat_inputs[t] = np.concatenate(
(self.hidden_states[t - 1], inputs[t])
)
self.forget_gates[t] = self.sigmoid(
np.dot(self.wf, self.concat_inputs[t]) + self.bf
)
self.input_gates[t] = self.sigmoid(
np.dot(self.wi, self.concat_inputs[t]) + self.bi
)
self.candidate_gates[t] = self.tanh(
np.dot(self.wc, self.concat_inputs[t]) + self.bc
)
self.output_gates[t] = self.sigmoid(
np.dot(self.wo, self.concat_inputs[t]) + self.bo
)
self.cell_states[t] = (
self.forget_gates[t] * self.cell_states[t - 1]
+ self.input_gates[t] * self.candidate_gates[t]
)
self.hidden_states[t] = self.output_gates[t] * self.tanh(
self.cell_states[t]
)
outputs.append(np.dot(self.wy, self.hidden_states[t]) + self.by)
return outputs
def backward(self, errors: list, inputs: list) -> None:
"""
Perform backpropagation through time to compute gradients and update weights.
:param errors: The errors at each time step.
:param inputs: The input data as a list of one-hot encoded vectors.
"""
d_wf, d_bf = 0, 0
d_wi, d_bi = 0, 0
d_wc, d_bc = 0, 0
d_wo, d_bo = 0, 0
d_wy, d_by = 0, 0
dh_next, dc_next = (
np.zeros_like(self.hidden_states[0]),
np.zeros_like(self.cell_states[0]),
)
for t in reversed(range(len(inputs))):
error = errors[t]
# Final Gate Weights and Biases Errors
d_wy += np.dot(error, self.hidden_states[t].T)
d_by += error
# Hidden State Error
d_hs = np.dot(self.wy.T, error) + dh_next
# Output Gate Weights and Biases Errors
d_o = (
self.tanh(self.cell_states[t])
* d_hs
* self.sigmoid(self.output_gates[t], derivative=True)
)
            d_wo += np.dot(d_o, self.concat_inputs[t].T)
d_bo += d_o
# Cell State Error
d_cs = (
self.tanh(self.tanh(self.cell_states[t]), derivative=True)
* self.output_gates[t]
* d_hs
+ dc_next
)
# Forget Gate Weights and Biases Errors
d_f = (
d_cs
* self.cell_states[t - 1]
* self.sigmoid(self.forget_gates[t], derivative=True)
)
            d_wf += np.dot(d_f, self.concat_inputs[t].T)
d_bf += d_f
# Input Gate Weights and Biases Errors
d_i = (
d_cs
* self.candidate_gates[t]
* self.sigmoid(self.input_gates[t], derivative=True)
)
            d_wi += np.dot(d_i, self.concat_inputs[t].T)
d_bi += d_i
# Candidate Gate Weights and Biases Errors
d_c = (
d_cs
* self.input_gates[t]
* self.tanh(self.candidate_gates[t], derivative=True)
)
            d_wc += np.dot(d_c, self.concat_inputs[t].T)
d_bc += d_c
            # Propagate the error to the previous time step. The gate weights
            # act on the concatenated [hidden, input] vector, so only the
            # first hidden_dim rows of the error flow back into dh_next.
            d_z = (
                np.dot(self.wf.T, d_f)
                + np.dot(self.wi.T, d_i)
                + np.dot(self.wo.T, d_o)
                + np.dot(self.wc.T, d_c)
            )
            dh_next = d_z[: self.hidden_dim, :]
            dc_next = d_cs * self.forget_gates[t]
        # Clip gradients to mitigate exploding gradients, then apply updates
        for param, grad in zip(
            [self.wf, self.wi, self.wc, self.wo, self.wy],
            [d_wf, d_wi, d_wc, d_wo, d_wy],
        ):
            param -= self.lr * np.clip(grad, -1, 1)
        for param, grad in zip(
            [self.bf, self.bi, self.bc, self.bo, self.by],
            [d_bf, d_bi, d_bc, d_bo, d_by],
        ):
            param -= self.lr * np.clip(grad, -1, 1)
def train(self) -> None:
"""
Train the LSTM network on the input data for a specified number of epochs.
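
        Example (smoke test on a tiny corpus; tqdm writes its progress bar to
        stderr, so the doctest sees no output):

        >>> lstm = LSTM("aaa", hidden_dim=2, epochs=2, lr=0.1)
        Data size: 3, Char Size: 1
        >>> lstm.train()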
"""
for epoch in tqdm(range(self.epochs)):
inputs = [self.one_hot_encode(char) for char in self.train_X]
targets = [self.one_hot_encode(char) for char in self.train_y]
# Forward pass
outputs = self.forward(inputs)
            # Error at each time step: the gradient of the cross-entropy loss
            # with respect to the logits is softmax(logits) - target
            errors = [
                self.softmax(output) - target
                for output, target in zip(outputs, targets)
            ]
# Backward pass and weight updates
self.backward(errors, inputs)
def predict(self, inputs: list) -> str:
"""
Predict the next character in the sequence.
:param inputs: The input data as a list of one-hot encoded vectors.
:return: The predicted character.
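
        Example (a single-character vocabulary forces the prediction):

        >>> lstm = LSTM("aaa", hidden_dim=2)
        Data size: 3, Char Size: 1
        >>> lstm.predict([lstm.one_hot_encode("a")])
        'a'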
"""
output = self.forward(inputs)[-1]
return self.idx_to_char[np.argmax(self.softmax(output))]
def test(self) -> None:
"""
Test the LSTM network on the input data and compute accuracy.
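
        Example (a single-character vocabulary makes every prediction correct):

        >>> lstm = LSTM("aaa", hidden_dim=2)
        Data size: 3, Char Size: 1
        >>> lstm.test()
        Accuracy: 100.00%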
"""
inputs = [self.one_hot_encode(char) for char in self.train_X]
correct_predictions = sum(
self.idx_to_char[np.argmax(self.softmax(output))] == target
for output, target in zip(self.forward(inputs), self.train_y)
)
accuracy = (correct_predictions / len(self.train_y)) * 100
print(f"Accuracy: {accuracy:.2f}%")
if __name__ == "__main__":
# Define the input data and hyperparameters
data = "LSTM Neural Networks are designed to handle sequences of data."
hidden_dim = 50
epochs = 1000
lr = 0.01
# Initialize and train the LSTM network
lstm = LSTM(data, hidden_dim, epochs, lr)
lstm.train()
# Test the LSTM network and compute accuracy
lstm.test()