diff --git a/config.yaml b/config.yaml
index 513b437..8b29d93 100644
--- a/config.yaml
+++ b/config.yaml
@@ -47,15 +47,15 @@ preprocessing:
 predictor:
   type: lstm # Options: 'lstm', 'fixed_input_nn'
   input_size: 1 # Input size for the LSTM predictor.
-  hidden_size: 16 # Hidden size for the LSTM or Fixed Input NN predictor.
+  hidden_size: 8 # 16 # Hidden size for the LSTM or Fixed Input NN predictor.
   num_layers: 2 # Number of layers for the LSTM predictor.
   fixed_input_size: 10 # Input size for the Fixed Input NN predictor. Only used if type is 'fixed_input_nn'.
 
 training:
-  epochs: 10 # Number of training epochs.
+  epochs: 128 # Number of training epochs.
   batch_size: 8 # Batch size for training.
   learning_rate: 0.001 # Learning rate for the optimizer.
-  eval_freq: 2 # Frequency of evaluation during training (in epochs).
+  eval_freq: 8 # Frequency of evaluation during training (in epochs).
   save_path: models # Directory to save the best model and encoder.
   num_points: 1000 # Number of data points to visualize
 
@@ -68,4 +68,12 @@ data:
   split_ratio: 0.8 # Ratio to split the data into train and test sets.
 
 profiler:
-  enable: false
\ No newline at end of file
+  enable: false
+
+ablative:
+  training:
+    learning_rate: [0.01, 0.0001, 0.00001]
+    batch_size: [4, 16]
+  predictor:
+    hidden_size: [4, 16]
+    num_layers: [1, 3]
diff --git a/data_processing.py b/data_processing.py
index 453bd47..ba0e634 100644
--- a/data_processing.py
+++ b/data_processing.py
@@ -35,13 +35,12 @@ def delta_encode(data):
     """Apply delta encoding to the data."""
     deltas = [data[0]]
     for i in range(1, len(data)):
-        delta = np.subtract(data[i], data[i - 1], dtype=np.float32) # Using numpy subtract to handle overflow
-        deltas.append(delta)
-    return deltas
+        deltas.append(data[i] - data[i - 1])
+    return np.array(deltas)
 
 def delta_decode(deltas):
     """Decode delta encoded data."""
     data = [deltas[0]]
     for i in range(1, len(deltas)):
         data.append(data[-1] + deltas[i])
-    return data
+    return np.array(data)
\ No newline at end of file
diff --git a/model.py b/model.py
index f6b2a95..dbe5f20 100644
--- a/model.py
+++ b/model.py
@@ -2,7 +2,7 @@ import torch
 import torch.nn as nn
 from abc import ABC, abstractmethod
 
-class BaseModel(nn.Module, ABC):
+class BaseModel(nn.Module):
     def __init__(self):
         super(BaseModel, self).__init__()
 
@@ -23,10 +23,12 @@ class LSTMPredictor(BaseModel):
         super(LSTMPredictor, self).__init__()
         self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
         self.fc = nn.Linear(hidden_size, 1)
+        self.hidden_size = hidden_size
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
     def forward(self, x):
-        h0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(x.device)
-        c0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(x.device)
+        h0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(self.device)
+        c0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(self.device)
         out, _ = self.rnn(x, (h0, c0))
         out = self.fc(out)
         return out
@@ -35,12 +37,13 @@ class LSTMPredictor(BaseModel):
         self.eval()
         encoded_data = []
+        context_size = self.hidden_size  # Define an appropriate context size
 
         with torch.no_grad():
             for i in range(len(data) - 1):
-                context = torch.tensor(data[max(0, i - self.rnn.hidden_size):i], dtype=torch.float32).unsqueeze(0).unsqueeze(2).to(next(self.parameters()).device)
-                if context.shape[1] == 0:
-                    context = torch.zeros((1, 1, 1)).to(next(self.parameters()).device)
-                prediction = self.forward(context).cpu().numpy()[0][0]
+                context = torch.tensor(data[max(0, i - context_size):i]).reshape(1, -1, 1).to(self.device)
+                if context.size(1) == 0:  # Handle empty context
+                    continue
+                prediction = self.forward(context).squeeze(0).cpu().numpy()[0]
                 delta = data[i] - prediction
                 encoded_data.append(delta)
 
@@ -50,12 +53,13 @@ class LSTMPredictor(BaseModel):
         self.eval()
         decoded_data = []
+        context_size = self.hidden_size  # Define an appropriate context size
 
         with torch.no_grad():
             for i in range(len(encoded_data)):
-                context = torch.tensor(decoded_data[max(0, i - self.rnn.hidden_size):i], dtype=torch.float32).unsqueeze(0).unsqueeze(2).to(next(self.parameters()).device)
-                if context.shape[1] == 0:
-                    context = torch.zeros((1, 1, 1)).to(next(self.parameters()).device)
-                prediction = self.forward(context).cpu().numpy()[0][0]
+                context = torch.tensor(decoded_data[max(0, i - context_size):i]).reshape(1, -1, 1).to(self.device)
+                if context.size(1) == 0:  # Handle empty context
+                    continue
+                prediction = self.forward(context).squeeze(0).cpu().numpy()[0]
                 decoded_data.append(prediction + encoded_data[i])
 
         return decoded_data
@@ -66,6 +70,7 @@ class FixedInputNNPredictor(BaseModel):
         self.fc1 = nn.Linear(input_size, hidden_size)
         self.relu = nn.ReLU()
         self.fc2 = nn.Linear(hidden_size, 1)
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
     def forward(self, x):
         x = self.fc1(x)
@@ -77,11 +82,14 @@ class FixedInputNNPredictor(BaseModel):
         self.eval()
         encoded_data = []
+        context_size = self.fc1.in_features  # Define an appropriate context size
 
         with torch.no_grad():
-            for i in range(len(data) - self.fc1.in_features):
-                context = torch.tensor(data[i:i + self.fc1.in_features], dtype=torch.float32).unsqueeze(0).to(next(self.parameters()).device)
-                prediction = self.forward(context).cpu().numpy()[0][0]
-                delta = data[i + self.fc1.in_features] - prediction
+            for i in range(len(data) - context_size):
+                context = torch.tensor(data[i:i + context_size]).reshape(1, -1).to(self.device)
+                if context.size(1) == 0:  # Handle empty context
+                    continue
+                prediction = self.forward(context).squeeze(0).cpu().numpy()[0]
+                delta = data[i + context_size] - prediction
                 encoded_data.append(delta)
 
         return encoded_data
@@ -90,10 +98,13 @@ class FixedInputNNPredictor(BaseModel):
         self.eval()
         decoded_data = []
+        context_size = self.fc1.in_features  # Define an appropriate context size
 
         with torch.no_grad():
             for i in range(len(encoded_data)):
-                context = torch.tensor(decoded_data[max(0, i - self.fc1.in_features):i], dtype=torch.float32).unsqueeze(0).to(next(self.parameters()).device)
-                prediction = self.forward(context).cpu().numpy()[0][0]
+                context = torch.tensor(decoded_data[max(0, i - context_size):i]).reshape(1, -1).to(self.device)
+                if context.size(1) == 0:  # Handle empty context
+                    continue
+                prediction = self.forward(context).squeeze(0).cpu().numpy()[0]
                 decoded_data.append(prediction + encoded_data[i])
 
         return decoded_data
diff --git a/requirements.txt b/requirements.txt
index 5ffc755..664f79c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,6 @@ matplotlib
 wandb
 pyyaml
 arithmetic_compressor
+pycallgraph2
+setuptools
+wheel
diff --git a/train.py b/train.py
index 24fa28d..5eb0dc6 100644
--- a/train.py
+++ b/train.py
@@ -10,83 +10,90 @@ from data_processing import delta_encode, delta_decode, save_wav
 from utils import visualize_prediction, plot_delta_distribution
 from bitstream import ArithmeticEncoder
 
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+def pad_sequence(sequence, max_length):
+    padded_seq = np.zeros((max_length, *sequence.shape[1:]))
+    padded_seq[:sequence.shape[0], ...] = sequence
+    return padded_seq
 
 def evaluate_model(model, data, use_delta_encoding, encoder, sample_rate=19531, epoch=0):
     compression_ratios = []
     identical_count = 0
     all_deltas = []
 
-    model.eval()
-    for file_data in data:
-        file_data = torch.tensor(file_data, dtype=torch.float32).unsqueeze(1).to(device)
+    for i, file_data in enumerate(data):
+        file_data = torch.tensor(file_data, dtype=torch.float32).unsqueeze(1).to(model.device)
         encoded_data = model.encode(file_data.squeeze(1).cpu().numpy())
         encoder.build_model(encoded_data)
         compressed_data = encoder.encode(encoded_data)
         decompressed_data = encoder.decode(compressed_data, len(encoded_data))
 
-        # Check equivalence
         if use_delta_encoding:
             decompressed_data = delta_decode(decompressed_data)
+
+        # Ensure the lengths match
+        min_length = min(len(file_data), len(decompressed_data))
+        file_data = file_data[:min_length]
+        decompressed_data = decompressed_data[:min_length]
+
         identical = np.allclose(file_data.cpu().numpy(), decompressed_data, atol=1e-5)
         if identical:
             identical_count += 1
 
         compression_ratio = len(file_data) / len(compressed_data)
         compression_ratios.append(compression_ratio)
-
-        # Compute and collect deltas
-        predicted_data = model.decode(encoded_data)
+
+        predicted_data = model(torch.tensor(encoded_data, dtype=torch.float32).unsqueeze(1).to(model.device)).squeeze(1).detach().cpu().numpy()
         if use_delta_encoding:
             predicted_data = delta_decode(predicted_data)
-        delta_data = [file_data[i].item() - predicted_data[i] for i in range(len(file_data))]
+
+        # Ensure predicted_data is a flat list of floats
+        predicted_data = predicted_data[:min_length]
+
+        delta_data = [file_data[i].item() - predicted_data[i] for i in range(min_length)]
         all_deltas.extend(delta_data)
 
-        # Visualize prediction vs data vs error
-        visualize_prediction(file_data.cpu().numpy(), predicted_data, delta_data, sample_rate)
+        if i == (epoch % len(data)):
+            visualize_prediction(file_data.cpu().numpy(), predicted_data, delta_data, sample_rate, epoch=epoch)
 
     identical_percentage = (identical_count / len(data)) * 100
-
-    # Plot delta distribution
     delta_plot_path = plot_delta_distribution(all_deltas, epoch)
-    wandb.log({"delta_distribution": wandb.Image(delta_plot_path)})
+    wandb.log({"delta_distribution": wandb.Image(delta_plot_path)}, step=epoch)
 
     return compression_ratios, identical_percentage
 
 def train_model(model, train_data, test_data, epochs, batch_size, learning_rate, use_delta_encoding, encoder, eval_freq, save_path):
-    """Train the model."""
     wandb.init(project="wav-compression")
     criterion = nn.MSELoss()
     optimizer = optim.Adam(model.parameters(), lr=learning_rate)
     best_test_score = float('inf')
-    model = model.to(device)
-    
+
+    model.to(model.device)
+
+    max_length = max([len(seq) for seq in train_data])
+    print(f"Max sequence length: {max_length}")
+
     for epoch in range(epochs):
-        model.train()
         total_loss = 0
-        random.shuffle(train_data)  # Shuffle data for varied batches
+        random.shuffle(train_data)
        for i in range(0, len(train_data) - batch_size, batch_size):
-            batch = train_data[i:i+batch_size]
-            max_len = max(len(seq) for seq in batch)
-            padded_batch = np.array([np.pad(seq, (0, max_len - len(seq))) for seq in batch], dtype=np.float32)
-            inputs = torch.tensor(padded_batch[:, :-1], dtype=torch.float32).unsqueeze(2).to(device)
-            targets = torch.tensor(padded_batch[:, 1:], dtype=torch.float32).unsqueeze(2).to(device)
+            batch_data = [pad_sequence(np.array(train_data[j]), max_length) for j in range(i, i+batch_size)]
+            batch_data = np.array(batch_data)
+            inputs = torch.tensor(batch_data, dtype=torch.float32).unsqueeze(2).to(model.device)
+            targets = torch.tensor(batch_data, dtype=torch.float32).unsqueeze(2).to(model.device)
             outputs = model(inputs)
             loss = criterion(outputs, targets)
             optimizer.zero_grad()
             loss.backward()
             optimizer.step()
             total_loss += loss.item()
-        
-        wandb.log({"epoch": epoch, "loss": total_loss})
+
+        wandb.log({"epoch": epoch, "loss": total_loss}, step=epoch)
         print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss}')
 
         if (epoch + 1) % eval_freq == 0:
-            # Evaluate on train and test data
             train_compression_ratios, train_identical_percentage = evaluate_model(model, train_data, use_delta_encoding, encoder, epoch=epoch)
             test_compression_ratios, test_identical_percentage = evaluate_model(model, test_data, use_delta_encoding, encoder, epoch=epoch)
 
-            # Log statistics
             wandb.log({
                 "train_compression_ratio_mean": np.mean(train_compression_ratios),
                 "train_compression_ratio_std": np.std(train_compression_ratios),
@@ -98,12 +105,11 @@ def train_model(model, train_data, test_data, epochs, batch_size, learning_rate,
                 "test_compression_ratio_max": np.max(test_compression_ratios),
                 "train_identical_percentage": train_identical_percentage,
                 "test_identical_percentage": test_identical_percentage,
-            })
+            }, step=epoch)
 
             print(f'Epoch {epoch+1}/{epochs}, Train Compression Ratio: Mean={np.mean(train_compression_ratios)}, Std={np.std(train_compression_ratios)}, Min={np.min(train_compression_ratios)}, Max={np.max(train_compression_ratios)}, Identical={train_identical_percentage}%')
             print(f'Epoch {epoch+1}/{epochs}, Test Compression Ratio: Mean={np.mean(test_compression_ratios)}, Std={np.std(test_compression_ratios)}, Min={np.min(test_compression_ratios)}, Max={np.max(test_compression_ratios)}, Identical={test_identical_percentage}%')
 
-            # Save model and encoder if new highscore on test data
             test_score = np.mean(test_compression_ratios)
             if test_score < best_test_score:
                 best_test_score = test_score
diff --git a/utils.py b/utils.py
index 2932217..49cf63e 100644
--- a/utils.py
+++ b/utils.py
@@ -14,7 +14,7 @@ def visualize_wav_data(sample_rate, data, title="WAV Data", num_points=None):
     plt.ylabel('Amplitude')
     plt.show()
 
-def visualize_prediction(true_data, predicted_data, delta_data, sample_rate, num_points=None):
+def visualize_prediction(true_data, predicted_data, delta_data, sample_rate, num_points=None, epoch=None):
     """Visualize the true data, predicted data, and deltas."""
     if num_points:
         true_data = true_data[:num_points]
@@ -46,7 +46,7 @@ def visualize_prediction(true_data, predicted_data, delta_data, sample_rate, num
     file_path = os.path.join(tmp_dir, f'prediction_plot_{np.random.randint(1e6)}.png')
     plt.savefig(file_path)
     plt.close()
-    wandb.log({"Prediction vs True Data": wandb.Image(file_path)})
+    wandb.log({"Prediction vs True Data": wandb.Image(file_path)}, step=epoch)
 
 def plot_delta_distribution(deltas, epoch):
     """Plot the distribution of deltas."""