Chapter 3: Introduction to Deep Learning for Audio

Introduction

Now that we understand audio fundamentals and signal processing, it's time to apply deep learning. This chapter introduces neural network architectures for audio and builds your first audio classification model.

Why Deep Learning for Audio?

Traditional audio processing relied on hand-crafted features and classical machine learning. Deep learning changed the game by:

  • Automatic feature learning: Networks learn optimal representations
  • End-to-end training: From raw audio to predictions
  • Transfer learning: Reuse models trained on large datasets
  • Scalability: Performance improves with more data

Neural Network Basics for Audio

Audio Data Shapes

import torch
import torch.nn as nn
import torchaudio
import numpy as np
import matplotlib.pyplot as plt

# Common audio input shapes for neural networks
def demonstrate_audio_shapes():
    """Show common tensor shapes for audio in deep learning"""
    
    # Parameters
    batch_size = 32
    n_samples = 16000  # 1 second at 16kHz
    n_mels = 128
    n_frames = 100
    n_mfcc = 13
    
    # Different representations
    shapes = {
        'Waveform': (batch_size, 1, n_samples),
        'Stereo Waveform': (batch_size, 2, n_samples),
        'Spectrogram': (batch_size, 1, 513, n_frames),  # 513 = n_fft//2 + 1
        'Mel Spectrogram': (batch_size, 1, n_mels, n_frames),
        'MFCCs': (batch_size, n_mfcc, n_frames),
        'Augmented Mel': (batch_size, 3, n_mels, n_frames)  # 3-channel like RGB
    }
    
    print("Common Audio Tensor Shapes:")
    print("-" * 50)
    for name, shape in shapes.items():
        print(f"{name:20} {str(shape):30} Size: {np.prod(shape):,}")
    
    return shapes

shapes = demonstrate_audio_shapes()
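
To make these shapes concrete, here is a minimal sketch that produces the mel-spectrogram layout from the table above (the parameter values are illustrative, not prescriptive):

# Turn a batch of raw waveforms into (batch, 1, n_mels, n_frames)
mel = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000, n_fft=1024, hop_length=256, n_mels=128
)
waveforms = torch.randn(32, 1, 16000)  # (batch, channels, samples)
mel_specs = mel(waveforms)             # (batch, 1, n_mels, n_frames)
print(mel_specs.shape)                 # torch.Size([32, 1, 128, 63])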

Your First Audio Neural Network

class SimpleAudioClassifier(nn.Module):
    """A simple CNN for audio classification"""
    
    def __init__(self, n_classes=10):
        super(SimpleAudioClassifier, self).__init__()
        
        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        
        # Pooling
        self.pool = nn.MaxPool2d(2, 2)
        
        # Batch normalization
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        
        # Global average pooling
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        
        # Classification head
        self.fc = nn.Linear(128, n_classes)
        
        # Activation
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        # Input shape: (batch, 1, n_mels, n_frames)
        
        # Conv block 1
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.pool(x)
        
        # Conv block 2
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.pool(x)
        
        # Conv block 3
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu(x)
        
        # Global pooling
        x = self.global_pool(x)
        x = x.view(x.size(0), -1)
        
        # Classification
        x = self.dropout(x)
        x = self.fc(x)
        
        return x

# Create model and show architecture
model = SimpleAudioClassifier(n_classes=10)
print(model)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

Building an Audio Classification Pipeline

Data Preparation

class AudioDataset(torch.utils.data.Dataset):
    """Custom dataset for audio classification"""
    
    def __init__(self, audio_paths, labels, sr=16000, duration=1.0, 
                 n_mels=128, augment=False):
        self.audio_paths = audio_paths
        self.labels = labels
        self.sr = sr
        self.duration = duration
        self.n_mels = n_mels
        self.augment = augment
        
        # Transforms
        self.mel_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=sr,
            n_fft=1024,
            hop_length=256,
            n_mels=n_mels
        )
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
        
    def __len__(self):
        return len(self.audio_paths)
    
    def __getitem__(self, idx):
        # Load audio
        waveform, orig_sr = torchaudio.load(self.audio_paths[idx])
        
        # Resample if necessary
        if orig_sr != self.sr:
            resampler = torchaudio.transforms.Resample(orig_sr, self.sr)
            waveform = resampler(waveform)
        
        # Ensure mono
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        
        # Fix duration
        target_samples = int(self.sr * self.duration)
        if waveform.shape[1] > target_samples:
            waveform = waveform[:, :target_samples]
        elif waveform.shape[1] < target_samples:
            padding = target_samples - waveform.shape[1]
            waveform = torch.nn.functional.pad(waveform, (0, padding))
        
        # Augmentation
        if self.augment:
            waveform = self.apply_augmentation(waveform)
        
        # Convert to mel spectrogram
        mel_spec = self.mel_transform(waveform)
        mel_spec = self.amplitude_to_db(mel_spec)
        
        # Normalize (epsilon guards against division by zero on silent clips)
        mel_spec = (mel_spec - mel_spec.mean()) / (mel_spec.std() + 1e-8)
        
        return mel_spec, self.labels[idx]
    
    def apply_augmentation(self, waveform):
        """Apply random augmentations"""
        # Time shift
        if torch.rand(1) < 0.5:
            shift = int(torch.rand(1) * waveform.shape[1] * 0.1)
            waveform = torch.roll(waveform, shift, dims=1)
        
        # Add noise
        if torch.rand(1) < 0.5:
            noise = torch.randn_like(waveform) * 0.005
            waveform = waveform + noise
        
        # Change volume
        if torch.rand(1) < 0.5:
            volume_factor = 0.5 + torch.rand(1) * 1.0
            waveform = waveform * volume_factor
        
        return waveform
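
Wiring the dataset into a DataLoader is then straightforward; the file paths and labels below are hypothetical placeholders for your own data:

from torch.utils.data import DataLoader

# Hypothetical file list and integer labels -- replace with real data
audio_paths = ['data/dog_bark_001.wav', 'data/siren_004.wav']
labels = [0, 1]

train_dataset = AudioDataset(audio_paths, labels, augment=True)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)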

Training Loop

def train_audio_model(model, train_loader, val_loader, n_epochs=10):
    """Train audio classification model"""
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, patience=3, factor=0.5
    )
    
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
    
    for epoch in range(n_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            train_total += targets.size(0)
            train_correct += predicted.eq(targets).sum().item()
            
            if batch_idx % 10 == 0:
                print(f'Epoch: {epoch+1}/{n_epochs} [{batch_idx}/{len(train_loader)}] '
                      f'Loss: {loss.item():.4f}')
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                
                val_loss += loss.item()
                _, predicted = outputs.max(1)
                val_total += targets.size(0)
                val_correct += predicted.eq(targets).sum().item()
        
        # Calculate metrics
        train_loss = train_loss / len(train_loader)
        train_acc = 100. * train_correct / train_total
        val_loss = val_loss / len(val_loader)
        val_acc = 100. * val_correct / val_total
        
        # Update history
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        
        # Update learning rate
        scheduler.step(val_loss)
        
        print(f'Epoch {epoch+1}: '
              f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, '
              f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%')
    
    return history

# Visualize training history
def plot_training_history(history):
    """Plot training and validation metrics"""
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    
    # Loss
    axes[0].plot(history['train_loss'], label='Train Loss')
    axes[0].plot(history['val_loss'], label='Val Loss')
    axes[0].set_xlabel('Epoch')
    axes[0].set_ylabel('Loss')
    axes[0].set_title('Training and Validation Loss')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)
    
    # Accuracy
    axes[1].plot(history['train_acc'], label='Train Acc')
    axes[1].plot(history['val_acc'], label='Val Acc')
    axes[1].set_xlabel('Epoch')
    axes[1].set_ylabel('Accuracy (%)')
    axes[1].set_title('Training and Validation Accuracy')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

Advanced Architectures for Audio

1. CNN with Attention

class AttentionCNN(nn.Module):
    """CNN with attention mechanism for audio"""
    
    def __init__(self, n_classes=10):
        super(AttentionCNN, self).__init__()
        
        # Feature extraction
        self.features = nn.Sequential(
            nn.Conv2d(1, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),
            
            nn.Conv2d(64, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2),
            
            nn.Conv2d(128, 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
        )
        
        # Attention mechanism
        self.attention = nn.Sequential(
            nn.Conv2d(256, 128, 1),
            nn.ReLU(),
            nn.Conv2d(128, 1, 1),
            nn.Sigmoid()
        )
        
        # Classifier
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, n_classes)
        )
    
    def forward(self, x):
        # Extract features
        features = self.features(x)
        
        # Compute attention weights
        attention_weights = self.attention(features)
        
        # Apply attention
        attended_features = features * attention_weights
        
        # Classify
        output = self.classifier(attended_features)
        
        return output, attention_weights
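
Because this forward pass returns both logits and the attention map, the generic train_audio_model loop above needs a one-line change to unpack the tuple. A quick shape check with a dummy input (illustrative sizes):

model = AttentionCNN(n_classes=10)
x = torch.randn(4, 1, 128, 63)
logits, attention = model(x)
print(logits.shape)     # torch.Size([4, 10])
print(attention.shape)  # torch.Size([4, 1, 32, 15]) -- per-location weights

# In the training loop, unpack before computing the loss:
# outputs, _ = model(inputs)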

2. Recurrent Networks for Sequential Audio

class AudioRNN(nn.Module):
    """RNN/LSTM for sequential audio processing"""
    
    def __init__(self, input_size=128, hidden_size=256, n_classes=10):
        super(AudioRNN, self).__init__()
        
        # Bidirectional LSTM
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=0.3
        )
        
        # Attention over time
        self.attention_weights = nn.Linear(hidden_size * 2, 1)
        
        # Classifier
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size * 2, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, n_classes)
        )
    
    def forward(self, x):
        # x shape: (batch, 1, n_mels, time)
        x = x.squeeze(1).transpose(1, 2)  # (batch, time, n_mels)
        
        # LSTM
        lstm_out, _ = self.lstm(x)  # (batch, time, hidden*2)
        
        # Attention
        attention_scores = self.attention_weights(lstm_out)  # (batch, time, 1)
        attention_scores = torch.softmax(attention_scores, dim=1)
        
        # Weighted sum
        context = torch.sum(lstm_out * attention_scores, dim=1)
        
        # Classify
        output = self.classifier(context)
        
        return output
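
A quick shape check for the RNN path (dummy mel-spectrogram input, illustrative sizes):

model = AudioRNN(input_size=128, hidden_size=256, n_classes=10)
mel_batch = torch.randn(4, 1, 128, 63)  # (batch, 1, n_mels, time)
print(model(mel_batch).shape)           # torch.Size([4, 10])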

3. Convolutional Recurrent Networks (CRNN)

class CRNN(nn.Module):
    """Convolutional Recurrent Neural Network"""
    
    def __init__(self, n_classes=10):
        super(CRNN, self).__init__()
        
        # CNN feature extractor
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(3, 3), padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),  # Pool only frequency dimension
            
            nn.Conv2d(32, 64, kernel_size=(3, 3), padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),
            
            nn.Conv2d(64, 128, kernel_size=(3, 3), padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),
        )
        
        # RNN for temporal modeling
        # After CNN, we'll have features of shape (batch, channels, freq_bins, time)
        self.rnn = nn.GRU(
            input_size=128 * 16,  # 128 channels * 16 remaining freq bins (for 128-mel input)
            hidden_size=256,
            num_layers=2,
            batch_first=True,
            bidirectional=True
        )
        
        # Classifier
        self.classifier = nn.Linear(512, n_classes)  # 256 * 2 (bidirectional)
    
    def forward(self, x):
        # CNN features
        cnn_out = self.cnn(x)
        
        # Reshape for RNN: (batch, time, features)
        batch, channels, freq, time = cnn_out.shape
        cnn_out = cnn_out.permute(0, 3, 1, 2)  # (batch, time, channels, freq)
        cnn_out = cnn_out.reshape(batch, time, -1)  # (batch, time, channels*freq)
        
        # RNN
        rnn_out, _ = self.rnn(cnn_out)
        
        # Use last timestep
        output = self.classifier(rnn_out[:, -1, :])
        
        return output
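
The hard-coded input_size=128 * 16 assumes 128-mel inputs: three (2, 1) poolings halve the 128 frequency bins down to 16 while leaving time untouched. A quick check with a dummy batch:

model = CRNN(n_classes=10)
mel_batch = torch.randn(4, 1, 128, 63)  # 128 mel bins -> 16 after 3 pools
print(model(mel_batch).shape)           # torch.Size([4, 10])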

Pre-trained Models for Audio

Using Pre-trained Models

def load_pretrained_audio_model():
    """Load and use pre-trained audio models"""
    
    # Example: using a pre-trained Wav2Vec2 model via torchaudio's pipelines
    
    # Load pre-trained Wav2Vec2
    bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
    model = bundle.get_model()
    
    print(f"Sample Rate: {bundle.sample_rate}")
    print(f"Model Parameters: {sum(p.numel() for p in model.parameters()):,}")
    
    # For classification, we can use the encoder
    class Wav2Vec2Classifier(nn.Module):
        def __init__(self, n_classes=10):
            super().__init__()
            self.wav2vec2 = model
            # Freeze wav2vec2 parameters
            for param in self.wav2vec2.parameters():
                param.requires_grad = False
            
            # Classification head
            self.classifier = nn.Sequential(
                nn.Linear(768, 256),  # wav2vec2 output dim is 768
                nn.ReLU(),
                nn.Dropout(0.5),
                nn.Linear(256, n_classes)
            )
        
        def forward(self, x):
            # Extract 768-dim transformer features (extract_features returns
            # per-layer features; take the last layer, not the ASR logits)
            with torch.no_grad():
                features = self.wav2vec2.extract_features(x)[0][-1]
            
            # Pool over time
            pooled = features.mean(dim=1)
            
            # Classify
            return self.classifier(pooled)
    
    return Wav2Vec2Classifier()
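
Wav2Vec2 consumes raw waveforms at the bundle's sample rate (16 kHz here), so usage looks like this sketch (random input for illustration; the first call downloads the pre-trained weights):

classifier = load_pretrained_audio_model()
waveforms = torch.randn(2, 16000)  # (batch, samples) at 16 kHz
logits = classifier(waveforms)
print(logits.shape)                # torch.Size([2, 10])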

Best Practices for Audio Deep Learning

1. Data Preprocessing Consistency

class AudioPreprocessor:
    """Consistent preprocessing for all audio"""
    
    def __init__(self, target_sr=16000, target_duration=1.0):
        self.target_sr = target_sr
        self.target_duration = target_duration
        self.target_samples = int(target_sr * target_duration)
    
    def __call__(self, audio_path):
        # Load
        waveform, sr = torchaudio.load(audio_path)
        
        # Resample
        if sr != self.target_sr:
            resampler = torchaudio.transforms.Resample(sr, self.target_sr)
            waveform = resampler(waveform)
        
        # Mono
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        
        # Trim or pad
        if waveform.shape[1] > self.target_samples:
            waveform = waveform[:, :self.target_samples]
        else:
            pad_amount = self.target_samples - waveform.shape[1]
            waveform = torch.nn.functional.pad(waveform, (0, pad_amount))
        
        # Normalize
        waveform = waveform - waveform.mean()
        waveform = waveform / (waveform.std() + 1e-8)
        
        return waveform
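
Usage is a single call per file; the path below is a hypothetical placeholder:

preprocess = AudioPreprocessor(target_sr=16000, target_duration=1.0)
waveform = preprocess('data/example.wav')  # hypothetical path
print(waveform.shape)                      # torch.Size([1, 16000])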

2. Model Evaluation

def evaluate_audio_model(model, test_loader, device='cpu'):
    """Comprehensive model evaluation"""
    
    model.eval()
    model.to(device)
    
    all_predictions = []
    all_targets = []
    all_probabilities = []
    
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            
            # Get predictions
            probabilities = torch.softmax(outputs, dim=1)
            _, predictions = outputs.max(1)
            
            all_predictions.extend(predictions.cpu().numpy())
            all_targets.extend(targets.numpy())
            all_probabilities.extend(probabilities.cpu().numpy())
    
    # Calculate metrics
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    from sklearn.metrics import confusion_matrix
    
    accuracy = accuracy_score(all_targets, all_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_targets, all_predictions, average='weighted'
    )
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    # Confusion matrix
    cm = confusion_matrix(all_targets, all_predictions)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm,
        'predictions': all_predictions,
        'probabilities': all_probabilities
    }

Common Pitfalls and Solutions

1. Overfitting

def prevent_overfitting():
    """Techniques to prevent overfitting"""
    
    strategies = {
        'Data Augmentation': {
            'techniques': ['Time shifting', 'Adding noise', 'Time stretching', 'Pitch shifting'],
            'implementation': 'AudioDataset with augment=True (covers shifting, noise, volume)'
        },
        'Regularization': {
            'techniques': ['Dropout', 'L2 regularization', 'Early stopping'],
            'implementation': 'nn.Dropout(), weight_decay in optimizer'
        },
        'Architecture': {
            'techniques': ['Reduce model size', 'Use batch normalization'],
            'implementation': 'Fewer layers/parameters, nn.BatchNorm2d()'
        },
        'Training': {
            'techniques': ['Learning rate scheduling', 'Gradient clipping'],
            'implementation': 'lr_scheduler, torch.nn.utils.clip_grad_norm_()'
        }
    }
    
    return strategies
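
As a concrete sketch of the optimizer-side strategies (hyperparameter values are illustrative): weight_decay adds L2 regularization, and gradients are clipped between backward() and step():

# L2 regularization via weight decay
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

# Inside the training step:
# loss.backward()
# torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
# optimizer.step()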

2. Class Imbalance

def handle_class_imbalance(train_labels):
    """Handle imbalanced audio datasets"""
    
    from collections import Counter
    from torch.utils.data import WeightedRandomSampler
    
    # Count class frequencies
    class_counts = Counter(train_labels)
    num_classes = len(class_counts)
    
    # Calculate weights
    total_count = sum(class_counts.values())
    class_weights = {
        cls: total_count / (num_classes * count) 
        for cls, count in class_counts.items()
    }
    
    # Create sample weights
    sample_weights = [class_weights[label] for label in train_labels]
    
    # Create weighted sampler
    sampler = WeightedRandomSampler(
        weights=sample_weights,
        num_samples=len(sample_weights),
        replacement=True
    )
    
    return sampler
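
The sampler replaces shuffle=True in the DataLoader (the two are mutually exclusive). A minimal sketch with an illustrative, heavily imbalanced label list:

from torch.utils.data import DataLoader

train_labels = [0, 0, 0, 0, 1]  # illustrative imbalance
sampler = handle_class_imbalance(train_labels)

# train_loader = DataLoader(train_dataset, batch_size=32, sampler=sampler)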

Practical Example: ESC-50 Classification

def complete_audio_classification_example():
    """Complete example with ESC-50 dataset"""
    
    # Conceptual example: the ESC-50 dataset must be downloaded separately
    
    # 1. Setup
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # 2. Create model (AttentionCNN returns (logits, attention_weights),
    #    so the training loop must unpack the tuple before the loss)
    model = AttentionCNN(n_classes=50)  # ESC-50 has 50 classes
    model.to(device)
    
    # 3. Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    # 4. Training configuration
    config = {
        'n_epochs': 50,
        'batch_size': 32,
        'learning_rate': 0.001,
        'sr': 22050,
        'duration': 5.0,
        'n_mels': 128,
        'augment': True
    }
    
    print("Training Configuration:")
    for key, value in config.items():
        print(f"  {key}: {value}")
    
    # 5. Data would be loaded here
    # train_dataset = AudioDataset(...)
    # train_loader = DataLoader(train_dataset, ...)
    
    # 6. Training loop would run here
    # history = train_audio_model(model, train_loader, val_loader)
    
    return model, config

Exercises

  1. Build a Genre Classifier:

    • Download GTZAN dataset
    • Implement data loading
    • Train different architectures
    • Compare performance
  2. Implement Data Augmentation:

    • Add SpecAugment (a starter sketch follows the exercise list)
    • Time and frequency masking
    • Measure impact on accuracy
  3. Transfer Learning:

    • Use pre-trained model
    • Fine-tune on your dataset
    • Compare with training from scratch
  4. Attention Visualization:

    • Train attention-based model
    • Visualize attention weights
    • Interpret what model focuses on
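
For exercise 2, torchaudio ships SpecAugment-style masking transforms; a minimal starter sketch (mask sizes are illustrative):

# Frequency and time masking on a mel spectrogram (training-time only)
freq_mask = torchaudio.transforms.FrequencyMasking(freq_mask_param=15)
time_mask = torchaudio.transforms.TimeMasking(time_mask_param=20)

mel_spec = torch.randn(1, 128, 63)  # (channel, n_mels, n_frames)
augmented = time_mask(freq_mask(mel_spec))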

Key Takeaways

  1. Start simple: Basic CNNs often work well for audio
  2. Spectrograms are images: Can use computer vision techniques
  3. Augmentation is crucial: Especially with limited data
  4. Pre-training helps: Transfer learning from large datasets
  5. Monitor overfitting: Audio datasets are often small

What's Next?

We now have the foundation for audio deep learning. In the next chapter, we'll dive deep into CNNs for audio classification, exploring advanced architectures and techniques for state-of-the-art performance.


← Previous: Chapter 2 - Signal Processing
Next: Chapter 4 - Audio Classification with CNNs (Coming Soon)