"""
Recurrent Neural Network (RNN) from scratch using NumPy.

This implementation demonstrates a vanilla RNN for sequence classification
trained on synthetic sequential data. It includes forward propagation through
time, backpropagation through time (BPTT), and gradient clipping.

Part of Cosmos by OpenGenus Foundation.
"""
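
# The network below computes, for t = 1..T (the classification output is read
# only at the final step):
#   h_t   = tanh(x_t @ Wxh + h_{t-1} @ Whh + b_h)
#   y_hat = softmax(h_T @ Why + b_y)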

import numpy as np


class RNN:
    """A vanilla Recurrent Neural Network for sequence classification."""

    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        """
        Initialize RNN parameters.

        Args:
            input_size: Dimension of input at each time step.
            hidden_size: Number of hidden units.
            output_size: Number of output classes.
            learning_rate: Step size for gradient descent.
        """
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate

        # Small-scale initialization for RNN stability
        scale = 0.01
        self.Wxh = np.random.randn(input_size, hidden_size) * scale
        self.Whh = np.random.randn(hidden_size, hidden_size) * scale
        self.Why = np.random.randn(hidden_size, output_size) * scale

        self.bh = np.zeros((1, hidden_size))
        self.by = np.zeros((1, output_size))

    def _tanh(self, x):
        return np.tanh(x)

    def _softmax(self, x):
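        # Subtract the row-wise max before exponentiating; softmax is
        # shift-invariant, and this avoids overflow in np.exp.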
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, inputs):
        """
        Forward pass through the sequence.

        Args:
            inputs: Array of shape (sequence_length, batch_size, input_size).

        Returns:
            output: Softmax probabilities of shape (batch_size, output_size).
            hidden_states: List of hidden states at each time step.
        """
        batch_size = inputs.shape[1]
        h = np.zeros((batch_size, self.hidden_size))
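        # Keep the initial zero state as well; backward() indexes h_{t-1}
        # through this list at every time step.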
        hidden_states = [h]

        for t in range(inputs.shape[0]):
            x_t = inputs[t]
            h = self._tanh(x_t @ self.Wxh + h @ self.Whh + self.bh)
            hidden_states.append(h)

        output = self._softmax(h @ self.Why + self.by)
        return output, hidden_states

    def backward(self, inputs, hidden_states, output, labels):
        """
        Backpropagation through time (BPTT).

        Args:
            inputs: Input sequence (sequence_length, batch_size, input_size).
            hidden_states: Hidden states from forward pass.
            output: Predicted probabilities (batch_size, output_size).
            labels: One-hot encoded labels (batch_size, output_size).

        Returns:
            loss: Cross-entropy loss value.
        """
        batch_size = inputs.shape[1]
        seq_len = inputs.shape[0]

        # Cross-entropy loss
        loss = -np.sum(labels * np.log(output + 1e-8)) / batch_size

        # Softmax and cross-entropy combine neatly: the gradient of the loss
        # w.r.t. the pre-softmax logits is (probabilities - labels).
        dy = (output - labels) / batch_size

        # Gradients for output layer
        dWhy = hidden_states[-1].T @ dy
        dby = np.sum(dy, axis=0, keepdims=True)

        # Backpropagate through time
        dWxh = np.zeros_like(self.Wxh)
        dWhh = np.zeros_like(self.Whh)
        dbh = np.zeros_like(self.bh)

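        # Seed the backward recursion: the loss gradient reaches the final
        # hidden state h_T through the output layer.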
        dh_next = dy @ self.Why.T

        for t in reversed(range(seq_len)):
            # Gradient through tanh: d_tanh = (1 - tanh^2) * upstream
            dtanh = (1 - hidden_states[t + 1] ** 2) * dh_next

            dWxh += inputs[t].T @ dtanh
            dWhh += hidden_states[t].T @ dtanh
            dbh += np.sum(dtanh, axis=0, keepdims=True)

            dh_next = dtanh @ self.Whh.T

        # Gradient clipping to prevent exploding gradients
        for grad in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(grad, -5, 5, out=grad)

        # Update parameters
        self.Wxh -= self.learning_rate * dWxh
        self.Whh -= self.learning_rate * dWhh
        self.Why -= self.learning_rate * dWhy
        self.bh -= self.learning_rate * dbh
        self.by -= self.learning_rate * dby

        return loss

    def train(self, X_train, y_train, epochs=100, batch_size=32, verbose=True):
        """
        Train the RNN on sequential data using mini-batches.

        Args:
            X_train: Training data (num_samples, sequence_length, input_size).
            y_train: Labels as integers (num_samples,).
            epochs: Number of training epochs.
            batch_size: Number of samples per mini-batch.
            verbose: Whether to print loss during training.
        """
        num_classes = int(np.max(y_train)) + 1
        num_samples = X_train.shape[0]
        # One-hot encode labels
        all_labels = np.eye(num_classes)[y_train.astype(int)]

        for epoch in range(epochs):
            # Shuffle data each epoch
            perm = np.random.permutation(num_samples)
            X_shuffled = X_train[perm]
            labels_shuffled = all_labels[perm]

            epoch_loss = 0.0
            num_batches = 0

            for start in range(0, num_samples, batch_size):
                end = min(start + batch_size, num_samples)
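                # forward() expects time-major input, so reorder each batch to
                # (sequence_length, batch_size, input_size)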
                X_batch = X_shuffled[start:end].transpose(1, 0, 2)
                labels_batch = labels_shuffled[start:end]

                output, hidden_states = self.forward(X_batch)
                loss = self.backward(X_batch, hidden_states, output, labels_batch)
                epoch_loss += loss
                num_batches += 1

            if verbose and (epoch + 1) % 20 == 0:
                predictions = self.predict(X_train, batch_size)
                accuracy = np.mean(predictions == y_train) * 100
                avg_loss = epoch_loss / num_batches
                print(
                    f"Epoch {epoch + 1}/{epochs} - "
                    f"Loss: {avg_loss:.4f} - Accuracy: {accuracy:.1f}%"
                )

    def predict(self, X, batch_size=32):
        """
        Predict class labels for input sequences.

        Args:
            X: Input data (num_samples, sequence_length, input_size).
            batch_size: Number of samples per forward pass.

        Returns:
            Predicted class labels (num_samples,).
        """
        all_preds = []
        for start in range(0, X.shape[0], batch_size):
            end = min(start + batch_size, X.shape[0])
            inputs = X[start:end].transpose(1, 0, 2)
            output, _ = self.forward(inputs)
            all_preds.append(np.argmax(output, axis=1))
        return np.concatenate(all_preds)

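
# A finite-difference probe is a common way to sanity-check BPTT code such as
# backward() above. The helper below is an illustrative sketch, not part of
# the original implementation: the name `numerical_gradient` and its defaults
# are our own choices. Note that backward() also updates the parameters, so a
# full analytic-vs-numerical comparison would first separate the gradient
# computation from the update step; this probe only measures the loss slope
# along a single weight.
def numerical_gradient(rnn, inputs, labels, index=(0, 0), eps=1e-5):
    """Estimate d(loss)/d(Wxh[index]) by central differences (illustrative)."""
    def loss_at(value):
        original = rnn.Wxh[index]
        rnn.Wxh[index] = value
        output, _ = rnn.forward(inputs)
        # Same cross-entropy formula as backward()
        loss = -np.sum(labels * np.log(output + 1e-8)) / inputs.shape[1]
        rnn.Wxh[index] = original  # restore the weight
        return loss

    w = rnn.Wxh[index]
    return (loss_at(w + eps) - loss_at(w - eps)) / (2 * eps)
    # Example probe (hypothetical call): numerical_gradient(rnn, X_batch, labels_batch)
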

def generate_synthetic_data(num_samples=500, seq_length=10, input_size=3):
    """
    Generate synthetic sequential data for binary classification.

    Class 0: sequences where values tend to increase over time.
    Class 1: sequences where values tend to decrease over time.
    """
    X = np.zeros((num_samples, seq_length, input_size))
    y = np.zeros(num_samples)

    for i in range(num_samples):
        if i < num_samples // 2:
            # Increasing trend
            for t in range(seq_length):
                X[i, t] = np.random.randn(input_size) * 0.5 + t * 0.3
            y[i] = 0
        else:
            # Decreasing trend
            for t in range(seq_length):
                X[i, t] = np.random.randn(input_size) * 0.5 - t * 0.3
            y[i] = 1

    # Shuffle
    indices = np.random.permutation(num_samples)
    return X[indices], y[indices]


if __name__ == "__main__":
    np.random.seed(42)
    # Silence floating-point warnings (e.g. overflow in intermediate values)
    # that can appear if training temporarily diverges
    np.seterr(over="ignore", invalid="ignore", divide="ignore")

    # Generate data
    X, y = generate_synthetic_data(num_samples=500, seq_length=10, input_size=3)

    # Split into train and test
    split = int(0.8 * len(X))
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    # Standardize with statistics from the training split only,
    # so no information leaks from the test set
    mean = X_train.mean()
    std = X_train.std()
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

    # Create and train the RNN
    rnn = RNN(input_size=3, hidden_size=16, output_size=2, learning_rate=0.005)
    print("Training RNN on synthetic sequential data...\n")
    rnn.train(X_train, y_train, epochs=200)

    # Evaluate
    predictions = rnn.predict(X_test)
    test_accuracy = np.mean(predictions == y_test) * 100
    print(f"\nTest Accuracy: {test_accuracy:.1f}%")