From e0f51b5ee045015dbdf702978e4bce66230529e3 Mon Sep 17 00:00:00 2001
From: Dominik Roth
Date: Fri, 24 May 2024 23:02:24 +0200
Subject: [PATCH] 2nd commit

---
 README.md          | 34 ---------------------------
 bitstream.py       | 23 +++++++++++++++++++
 config.yaml        | 18 +++++++++------
 data_processing.py |  5 ++--
 main.py            | 57 ++++++++++++++++++++++++++++++++--------------
 model.py           | 35 ++++++++++++++--------------
 requirements.txt   |  3 ++-
 train.py           | 32 ++++++++++++++------------
 8 files changed, 114 insertions(+), 93 deletions(-)

diff --git a/README.md b/README.md
index ce1ed85..86bfc47 100644
--- a/README.md
+++ b/README.md
@@ -18,40 +18,6 @@ pip install -r requirements.txt

 ## Usage

-### Configuration
-
-The configuration for training and evaluation is specified in a YAML file. Below is an example configuration:
-
-```yaml
-name: Test
-
-preprocessing:
-  use_delta_encoding: true # Whether to use delta encoding.
-
-predictor:
-  type: lstm # Options: 'lstm', 'fixed_input_nn'
-  input_size: 1 # Input size for the LSTM predictor.
-  hidden_size: 128 # Hidden size for the LSTM or Fixed Input NN predictor.
-  num_layers: 2 # Number of layers for the LSTM predictor.
-  fixed_input_size: 10 # Input size for the Fixed Input NN predictor. Only used if type is 'fixed_input_nn'.
-
-training:
-  epochs: 10 # Number of training epochs.
-  batch_size: 32 # Batch size for training.
-  learning_rate: 0.001 # Learning rate for the optimizer.
-  eval_freq: 2 # Frequency of evaluation during training (in epochs).
-  save_path: models # Directory to save the best model and encoder.
-  num_points: 1000 # Number of data points to visualize.
-
-bitstream_encoding:
-  type: arithmetic # Use arithmetic encoding.
-
-data:
-  url: https://content.neuralink.com/compression-challenge/data.zip # URL to download the dataset.
-  directory: data # Directory to extract and store the dataset.
-  split_ratio: 0.8 # Ratio to split the data into train and test sets.
-```
-
 ### Running the Code

 To train the model and compress/decompress WAV files, use the CLI provided:
diff --git a/bitstream.py b/bitstream.py
index 8ce2fa0..bad7ce6 100644
--- a/bitstream.py
+++ b/bitstream.py
@@ -1,3 +1,4 @@
+import bz2
 from abc import ABC, abstractmethod
 from arithmetic_compressor import AECompressor
 from arithmetic_compressor.models import StaticModel
@@ -15,6 +16,16 @@ class BaseEncoder(ABC):
     def build_model(self, data):
         pass

+class IdentityEncoder(BaseEncoder):
+    def encode(self, data):
+        return data
+
+    def decode(self, encoded_data, num_symbols):
+        return encoded_data
+
+    def build_model(self, data):
+        pass
+
 class ArithmeticEncoder(BaseEncoder):
     def encode(self, data):
         if not hasattr(self, 'model'):
@@ -29,7 +40,19 @@ class ArithmeticEncoder(BaseEncoder):
         return decoded_data

     def build_model(self, data):
+        # Convert data to list of tuples
+        data = [tuple(d) for d in data]
         symbol_counts = {symbol: data.count(symbol) for symbol in set(data)}
         total_symbols = sum(symbol_counts.values())
         probabilities = {symbol: count / total_symbols for symbol, count in symbol_counts.items()}
         self.model = StaticModel(probabilities)
+
+class Bzip2Encoder(BaseEncoder):
+    def encode(self, data):
+        return bz2.compress(bytearray(data))
+
+    def decode(self, encoded_data, num_symbols):
+        return list(bz2.decompress(encoded_data))
+
+    def build_model(self, data):
+        pass
diff --git a/config.yaml b/config.yaml
index 91a9015..513b437 100644
--- a/config.yaml
+++ b/config.yaml
@@ -30,38 +30,42 @@ wandb:
   group: '{config[name]}'
   job_type: '{delta_desc}'
   name: '{job_id}_{task_id}:{run_id}:{rand}={config[name]}_{delta_desc}'
-  tags:
-    - '{config[env][name]}'
-    - '{config[algo][name]}'
+  #tags:
+  #  - '{config[env][name]}'
+  #  - '{config[algo][name]}'
   sync_tensorboard: False
   monitor_gym: False
   save_code: False

 ---
 name: Test
+import: $

 preprocessing:
-  use_delta_encoding: true # Whether to use delta encoding.
+  use_delta_encoding: false # Whether to use delta encoding.

 predictor:
   type: lstm # Options: 'lstm', 'fixed_input_nn'
   input_size: 1 # Input size for the LSTM predictor.
-  hidden_size: 128 # Hidden size for the LSTM or Fixed Input NN predictor.
+  hidden_size: 16 # Hidden size for the LSTM or Fixed Input NN predictor.
   num_layers: 2 # Number of layers for the LSTM predictor.
   fixed_input_size: 10 # Input size for the Fixed Input NN predictor. Only used if type is 'fixed_input_nn'.

 training:
   epochs: 10 # Number of training epochs.
-  batch_size: 32 # Batch size for training.
+  batch_size: 8 # Batch size for training.
   learning_rate: 0.001 # Learning rate for the optimizer.
   eval_freq: 2 # Frequency of evaluation during training (in epochs).
   save_path: models # Directory to save the best model and encoder.
   num_points: 1000 # Number of data points to visualize

 bitstream_encoding:
-  type: arithmetic # Use arithmetic encoding.
+  type: identity # Options: 'arithmetic', 'no_compression', 'bzip2'

 data:
   url: https://content.neuralink.com/compression-challenge/data.zip # URL to download the dataset.
   directory: data # Directory to extract and store the dataset.
   split_ratio: 0.8 # Ratio to split the data into train and test sets.
+
+profiler:
+  enable: false
\ No newline at end of file
diff --git a/data_processing.py b/data_processing.py
index 5311105..453bd47 100644
--- a/data_processing.py
+++ b/data_processing.py
@@ -1,8 +1,8 @@
-import os
 import numpy as np
 from scipy.io import wavfile
 import urllib.request
 import zipfile
+import os

 def download_and_extract_data(url, data_dir):
     if not os.path.exists(data_dir):
@@ -35,7 +35,8 @@ def delta_encode(data):
     """Apply delta encoding to the data."""
     deltas = [data[0]]
     for i in range(1, len(data)):
-        deltas.append(data[i] - data[i - 1])
+        delta = np.subtract(data[i], data[i - 1], dtype=np.float32)  # Using numpy subtract to handle overflow
+        deltas.append(delta)
     return deltas

 def delta_decode(deltas):
diff --git a/main.py b/main.py
index 5bee185..f6232b1 100644
--- a/main.py
+++ b/main.py
@@ -1,9 +1,12 @@
-import yaml
 from slate import Slate, Slate_Runner
+
+from pycallgraph2 import PyCallGraph
+from pycallgraph2.output import GraphvizOutput
+
 from data_processing import download_and_extract_data, load_all_wavs, delta_encode
 from model import LSTMPredictor, FixedInputNNPredictor
 from train import train_model
-from bitstream import ArithmeticEncoder
+from bitstream import ArithmeticEncoder, IdentityEncoder, Bzip2Encoder

 class SpikeRunner(Slate_Runner):
     def setup(self, name):
@@ -23,7 +26,14 @@ class SpikeRunner(Slate_Runner):
         download_and_extract_data(data_url, data_dir)
         all_data = load_all_wavs(data_dir)

-        if slate.consume(preprocessing_config, 'use_delta_encoding'):
+        self.epochs = slate.consume(training_config, 'epochs')
+        self.batch_size = slate.consume(training_config, 'batch_size')
+        self.learning_rate = slate.consume(training_config, 'learning_rate')
+        self.use_delta_encoding = slate.consume(preprocessing_config, 'use_delta_encoding')
+        self.eval_freq = slate.consume(training_config, 'eval_freq')
+        self.save_path = slate.consume(training_config, 'save_path', 'models')
+
+        if self.use_delta_encoding:
             all_data = [delta_encode(d) for d in all_data]

         # Split data into train and test sets
@@ -35,34 +45,47 @@ class SpikeRunner(Slate_Runner):
         # Model setup
         self.model = self.get_model(predictor_config)
         self.encoder = self.get_encoder(bitstream_config)
-        self.epochs = slate.consume(training_config, 'epochs')
-        self.batch_size = slate.consume(training_config, 'batch_size')
-        self.learning_rate = slate.consume(training_config, 'learning_rate')
-        self.use_delta_encoding = slate.consume(preprocessing_config, 'use_delta_encoding')
-        self.eval_freq = slate.consume(training_config, 'eval_freq')
-        self.save_path = slate.consume(training_config, 'save_path', 'models')

     def get_model(self, config):
-        model_type = self.slate.consume(config, 'type')
+        model_type = slate.consume(config, 'type')
         if model_type == 'lstm':
             return LSTMPredictor(
-                input_size=self.slate.consume(config, 'input_size'),
-                hidden_size=self.slate.consume(config, 'hidden_size'),
-                num_layers=self.slate.consume(config, 'num_layers')
+                input_size=slate.consume(config, 'input_size'),
+                hidden_size=slate.consume(config, 'hidden_size'),
+                num_layers=slate.consume(config, 'num_layers')
             )
         elif model_type == 'fixed_input_nn':
             return FixedInputNNPredictor(
-                input_size=self.slate.consume(config, 'fixed_input_size'),
-                hidden_size=self.slate.consume(config, 'hidden_size')
+                input_size=slate.consume(config, 'fixed_input_size'),
+                hidden_size=slate.consume(config, 'hidden_size')
             )
         else:
             raise ValueError(f"Unknown model type: {model_type}")

     def get_encoder(self, config):
-        return ArithmeticEncoder()
+        encoder_type = slate.consume(config, 'type')
+        if encoder_type == 'arithmetic':
+            return ArithmeticEncoder()
+        elif encoder_type == 'identity':
+            return IdentityEncoder()
+        elif encoder_type == 'bzip2':
+            return Bzip2Encoder()
+        else:
+            raise ValueError(f"Unknown encoder type: {encoder_type}")

     def run(self, run, forceNoProfile=False):
-        train_model(self.model, self.train_data, self.test_data, self.epochs, self.batch_size, self.learning_rate, self.use_delta_encoding, self.encoder, self.eval_freq, self.save_path)
+        if self.slate.consume(self.config, 'profiler.enable', False) and not forceNoProfile:
+            print('{PROFILER RUNNING}')
+            with PyCallGraph(output=GraphvizOutput(output_file=f'./profiler/{self.name}.png')):
+                self.run(run, forceNoProfile=True)
+            print('{PROFILER DONE}')
+            return
+
+        train_model(
+            self.model, self.train_data, self.test_data,
+            self.epochs, self.batch_size, self.learning_rate,
+            self.use_delta_encoding, self.encoder, self.eval_freq, self.save_path
+        )

 if __name__ == '__main__':
     slate = Slate({'spikey': SpikeRunner})
diff --git a/model.py b/model.py
index a0e81c5..f6b2a95 100644
--- a/model.py
+++ b/model.py
@@ -2,9 +2,9 @@ import torch
 import torch.nn as nn
 from abc import ABC, abstractmethod

-class BaseModel(ABC, nn.Module):
+class BaseModel(nn.Module, ABC):
     def __init__(self):
-        super().__init__()
+        super(BaseModel, self).__init__()

     @abstractmethod
     def forward(self, x):
@@ -23,12 +23,10 @@ class LSTMPredictor(BaseModel):
         super(LSTMPredictor, self).__init__()
         self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
         self.fc = nn.Linear(hidden_size, 1)
-        self.hidden_size = hidden_size
-        self.num_layers = num_layers

     def forward(self, x):
-        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
-        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
+        h0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(x.device)
+        c0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(x.device)
         out, _ = self.rnn(x, (h0, c0))
         out = self.fc(out)
         return out
@@ -39,8 +37,10 @@ class LSTMPredictor(BaseModel):

         with torch.no_grad():
             for i in range(len(data) - 1):
-                context = torch.tensor(data[max(0, i - self.hidden_size):i]).view(1, -1, 1).float()
-                prediction = self.forward(context).item()
+                context = torch.tensor(data[max(0, i - self.rnn.hidden_size):i], dtype=torch.float32).unsqueeze(0).unsqueeze(2).to(next(self.parameters()).device)
+                if context.shape[1] == 0:
+                    context = torch.zeros((1, 1, 1)).to(next(self.parameters()).device)
+                prediction = self.forward(context).cpu().numpy()[0][0]
                 delta = data[i] - prediction
                 encoded_data.append(delta)

@@ -52,8 +52,10 @@ class LSTMPredictor(BaseModel):

         with torch.no_grad():
             for i in range(len(encoded_data)):
-                context = torch.tensor(decoded_data[max(0, i - self.hidden_size):i]).view(1, -1, 1).float()
-                prediction = self.forward(context).item()
+                context = torch.tensor(decoded_data[max(0, i - self.rnn.hidden_size):i], dtype=torch.float32).unsqueeze(0).unsqueeze(2).to(next(self.parameters()).device)
+                if context.shape[1] == 0:
+                    context = torch.zeros((1, 1, 1)).to(next(self.parameters()).device)
+                prediction = self.forward(context).cpu().numpy()[0][0]
                 decoded_data.append(prediction + encoded_data[i])

         return decoded_data
@@ -64,7 +66,6 @@ class FixedInputNNPredictor(BaseModel):
         self.fc1 = nn.Linear(input_size, hidden_size)
         self.relu = nn.ReLU()
         self.fc2 = nn.Linear(hidden_size, 1)
-        self.input_size = input_size

     def forward(self, x):
         x = self.fc1(x)
@@ -77,10 +78,10 @@ class FixedInputNNPredictor(BaseModel):
         encoded_data = []

         with torch.no_grad():
-            for i in range(len(data) - self.input_size):
-                context = torch.tensor(data[i:i + self.input_size]).view(1, -1).float()
-                prediction = self.forward(context).item()
-                delta = data[i + self.input_size] - prediction
+            for i in range(len(data) - self.fc1.in_features):
+                context = torch.tensor(data[i:i + self.fc1.in_features], dtype=torch.float32).unsqueeze(0).to(next(self.parameters()).device)
+                prediction = self.forward(context).cpu().numpy()[0][0]
+                delta = data[i + self.fc1.in_features] - prediction
                 encoded_data.append(delta)

         return encoded_data
@@ -91,8 +92,8 @@ class FixedInputNNPredictor(BaseModel):

         with torch.no_grad():
             for i in range(len(encoded_data)):
-                context = torch.tensor(decoded_data[max(0, i - self.input_size):i]).view(1, -1).float()
-                prediction = self.forward(context).item()
+                context = torch.tensor(decoded_data[max(0, i - self.fc1.in_features):i], dtype=torch.float32).unsqueeze(0).to(next(self.parameters()).device)
+                prediction = self.forward(context).cpu().numpy()[0][0]
                 decoded_data.append(prediction + encoded_data[i])

         return decoded_data
diff --git a/requirements.txt b/requirements.txt
index ba358e8..5ffc755 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ numpy
 scipy
 matplotlib
 wandb
-pyyaml
\ No newline at end of file
+pyyaml
+arithmetic_compressor
diff --git a/train.py b/train.py
index 3cf4cf9..24fa28d 100644
--- a/train.py
+++ b/train.py
@@ -10,17 +10,17 @@ from data_processing import delta_encode, delta_decode, save_wav
 from utils import visualize_prediction, plot_delta_distribution
 from bitstream import ArithmeticEncoder

-def evaluate_model(model, data, use_delta_encoding, encoder, sample_rate=19531, epoch=0, num_points=None):
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+def evaluate_model(model, data, use_delta_encoding, encoder, sample_rate=19531, epoch=0):
     compression_ratios = []
     identical_count = 0
     all_deltas = []

-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    model.to(device)
-
+    model.eval()
     for file_data in data:
         file_data = torch.tensor(file_data, dtype=torch.float32).unsqueeze(1).to(device)
-        encoded_data = model(file_data).squeeze(1).cpu().detach().numpy().tolist()
+        encoded_data = model.encode(file_data.squeeze(1).cpu().numpy())
         encoder.build_model(encoded_data)
         compressed_data = encoder.encode(encoded_data)
         decompressed_data = encoder.decode(compressed_data, len(encoded_data))
@@ -36,14 +36,14 @@ def evaluate_model(model, data, use_delta_encoding, encoder, sample_rate=19531,
         compression_ratios.append(compression_ratio)

         # Compute and collect deltas
-        predicted_data = model(torch.tensor(encoded_data, dtype=torch.float32).unsqueeze(1).to(device)).squeeze(1).cpu().detach().numpy().tolist()
+        predicted_data = model.decode(encoded_data)
         if use_delta_encoding:
             predicted_data = delta_decode(predicted_data)
         delta_data = [file_data[i].item() - predicted_data[i] for i in range(len(file_data))]
         all_deltas.extend(delta_data)

         # Visualize prediction vs data vs error
-        visualize_prediction(file_data.cpu().numpy(), predicted_data, delta_data, sample_rate, num_points)
+        visualize_prediction(file_data.cpu().numpy(), predicted_data, delta_data, sample_rate)

     identical_percentage = (identical_count / len(data)) * 100

@@ -53,22 +53,24 @@ def evaluate_model(model, data, use_delta_encoding, encoder, sample_rate=19531,

     return compression_ratios, identical_percentage

-def train_model(model, train_data, test_data, epochs, batch_size, learning_rate, use_delta_encoding, encoder, eval_freq, save_path, num_points=None):
+def train_model(model, train_data, test_data, epochs, batch_size, learning_rate, use_delta_encoding, encoder, eval_freq, save_path):
     """Train the model."""
     wandb.init(project="wav-compression")
     criterion = nn.MSELoss()
     optimizer = optim.Adam(model.parameters(), lr=learning_rate)
     best_test_score = float('inf')
-
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    model.to(device)
+    model = model.to(device)

     for epoch in range(epochs):
+        model.train()
         total_loss = 0
         random.shuffle(train_data)  # Shuffle data for varied batches
         for i in range(0, len(train_data) - batch_size, batch_size):
-            inputs = torch.tensor(train_data[i:i+batch_size], dtype=torch.float32).unsqueeze(2).to(device)
-            targets = torch.tensor(train_data[i+1:i+batch_size+1], dtype=torch.float32).unsqueeze(2).to(device)
+            batch = train_data[i:i+batch_size]
+            max_len = max(len(seq) for seq in batch)
+            padded_batch = np.array([np.pad(seq, (0, max_len - len(seq))) for seq in batch], dtype=np.float32)
+            inputs = torch.tensor(padded_batch[:, :-1], dtype=torch.float32).unsqueeze(2).to(device)
+            targets = torch.tensor(padded_batch[:, 1:], dtype=torch.float32).unsqueeze(2).to(device)
             outputs = model(inputs)
             loss = criterion(outputs, targets)
             optimizer.zero_grad()
@@ -81,8 +83,8 @@ def train_model(model, train_data, test_data, epochs, batch_size, learning_rate,

         if (epoch + 1) % eval_freq == 0:
             # Evaluate on train and test data
-            train_compression_ratios, train_identical_percentage = evaluate_model(model, train_data, use_delta_encoding, encoder, epoch=epoch, num_points=num_points)
-            test_compression_ratios, test_identical_percentage = evaluate_model(model, test_data, use_delta_encoding, encoder, epoch=epoch, num_points=num_points)
+            train_compression_ratios, train_identical_percentage = evaluate_model(model, train_data, use_delta_encoding, encoder, epoch=epoch)
+            test_compression_ratios, test_identical_percentage = evaluate_model(model, test_data, use_delta_encoding, encoder, epoch=epoch)

             # Log statistics
             wandb.log({