From b5d6eda6bf1ff4936434037683b0329245951937 Mon Sep 17 00:00:00 2001
From: Femke Gelderblom
Date: Tue, 2 Apr 2024 12:18:29 +0200
Subject: [PATCH] CHORE: ran black

---
 BEATs/BEATs.py                                |  11 +-
 BEATs_on_ESC50/BEATs/BEATs.py                 |  89 ++--
 BEATs_on_ESC50/BEATs/Tokenizers.py            |  93 +++--
 BEATs_on_ESC50/BEATs/backbone.py              | 228 +++++-----
 BEATs_on_ESC50/BEATs/modules.py               |  15 +-
 BEATs_on_ESC50/BEATs/quantizer.py             |  82 ++--
 BEATs_on_ESC50/evaluation/plot_2d_features.py |  10 +-
 BEATs_on_ESC50/fine_tune/trainer.py           |   4 +-
 BEATs_on_ESC50/fine_tune/transferLearning.py  |  14 +-
 Models/baseline.py                            |  32 +-
 Models/pann.py                                | 391 ++++++++++--------
 callbacks/callbacks.py                        |  13 +-
 data_utils/audiolist.py                       |  37 +-
 dcase_fine_tune/FTBeats.py                    |  16 +-
 dcase_fine_tune/FTDataModule.py               | 106 ++---
 dcase_fine_tune/FTevaluate.py                 | 105 +++--
 dcase_fine_tune/FTtrain.py                    |  58 +--
 dcase_fine_tune/_utils.py                     |  16 +-
 evaluate/_utils_compute.py                    |  47 ++-
 evaluate/_utils_writing.py                    |  51 ++-
 evaluate/evaluation_metrics/evaluation.py     | 363 ++++++++++------
 .../evaluation_confidence_intervals.py        | 375 +++++++++++------
 evaluate/evaluation_metrics/metrics.py        |   2 +-
 .../data_utils/miniESC50.py                   |  14 +-
 .../BEATs_on_miniECS50/evaluate.py            |  84 ++--
 .../BEATs_on_miniECS50/miniECS50DataModule.py |  84 ++--
 prototypicalbeats/prototraining.py            | 112 +++--
 27 files changed, 1530 insertions(+), 922 deletions(-)

diff --git a/BEATs/BEATs.py b/BEATs/BEATs.py
index 208c87a..a59483f 100644
--- a/BEATs/BEATs.py
+++ b/BEATs/BEATs.py
@@ -154,12 +154,12 @@ def preprocess(
         fbank = torch.stack(fbanks, dim=0)
         fbank = (fbank - fbank_mean) / (2 * fbank_std)
         return fbank
-    
+
     def specaugment(self, fbank, specaugment_params):
         # FBG: Add spectral masking
         if torch.rand(1) < specaugment_params["application_ratio"]:
             masking = ta_transforms.TimeMasking(
-                time_mask_param=specaugment_params["time_mask"], 
+                time_mask_param=specaugment_params["time_mask"],
             )
             fbank = masking(fbank)
             masking = ta_transforms.FrequencyMasking(
@@ -183,8 +183,11 @@ def extract_features(
 
         fbank = source.unsqueeze(1)
 
-        # FBG: add spectral masking 
-        if hasattr(self.cfg, "specaugment_params") and not self.cfg.specaugment_params is None:
+        # FBG: add spectral masking
+        if (
+            hasattr(self.cfg, "specaugment_params")
+            and not self.cfg.specaugment_params is None
+        ):
             fbank = self.specaugment(fbank, self.cfg.specaugment_params)
         # end NOTE FBG
         features = self.patch_embedding(fbank)
diff --git a/BEATs_on_ESC50/BEATs/BEATs.py b/BEATs_on_ESC50/BEATs/BEATs.py
index 4f0afa3..dbcab2e 100644
--- a/BEATs_on_ESC50/BEATs/BEATs.py
+++ b/BEATs_on_ESC50/BEATs/BEATs.py
@@ -35,25 +35,41 @@ def __init__(self, cfg=None):
         self.encoder_attention_heads: int = 12  # num encoder attention heads
         self.activation_fn: str = "gelu"  # activation function to use
 
-        self.layer_wise_gradient_decay_ratio: float = 1.0  # ratio for layer-wise gradient decay
+        self.layer_wise_gradient_decay_ratio: float = (
+            1.0  # ratio for layer-wise gradient decay
+        )
         self.layer_norm_first: bool = False  # apply layernorm first in the transformer
         self.deep_norm: bool = False  # apply deep_norm first in the transformer
 
         # dropouts
         self.dropout: float = 0.1  # dropout probability for the transformer
         self.attention_dropout: float = 0.1  # dropout probability for attention weights
-        self.activation_dropout: float = 0.0  # dropout probability after activation in FFN
-        self.encoder_layerdrop: float = 0.0  # probability of dropping a tarnsformer layer
-        self.dropout_input: float = 0.0  # dropout to apply to the input (after feat extr)
+        self.activation_dropout: float = (
+            0.0 
# dropout probability after activation in FFN + ) + self.encoder_layerdrop: float = ( + 0.0 # probability of dropping a tarnsformer layer + ) + self.dropout_input: float = ( + 0.0 # dropout to apply to the input (after feat extr) + ) # positional embeddings - self.conv_pos: int = 128 # number of filters for convolutional positional embeddings - self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding + self.conv_pos: int = ( + 128 # number of filters for convolutional positional embeddings + ) + self.conv_pos_groups: int = ( + 16 # number of groups for convolutional positional embedding + ) # relative position embedding - self.relative_position_embedding: bool = False # apply relative position embedding + self.relative_position_embedding: bool = ( + False # apply relative position embedding + ) self.num_buckets: int = 320 # number of buckets for relative position embedding - self.max_distance: int = 1280 # maximum distance for relative position embedding + self.max_distance: int = ( + 1280 # maximum distance for relative position embedding + ) self.gru_rel_pos: bool = False # apply gated relative position embedding # label predictor @@ -70,8 +86,8 @@ def update(self, cfg: dict): class BEATs(nn.Module): def __init__( - self, - cfg: BEATsConfig, + self, + cfg: BEATsConfig, ) -> None: super().__init__() logger.info(f"BEATs Config: {cfg.__dict__}") @@ -86,8 +102,13 @@ def __init__( ) self.input_patch_size = cfg.input_patch_size - self.patch_embedding = nn.Conv2d(1, self.embed, kernel_size=self.input_patch_size, stride=self.input_patch_size, - bias=cfg.conv_bias) + self.patch_embedding = nn.Conv2d( + 1, + self.embed, + kernel_size=self.input_patch_size, + stride=self.input_patch_size, + bias=cfg.conv_bias, + ) self.dropout_input = nn.Dropout(cfg.dropout_input) @@ -102,40 +123,44 @@ def __init__( self.predictor = None def forward_padding_mask( - self, - features: torch.Tensor, - padding_mask: torch.Tensor, + self, + features: torch.Tensor, + padding_mask: torch.Tensor, ) -> torch.Tensor: extra = padding_mask.size(1) % features.size(1) if extra > 0: padding_mask = padding_mask[:, :-extra] - padding_mask = padding_mask.view( - padding_mask.size(0), features.size(1), -1 - ) + padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1) padding_mask = padding_mask.all(-1) return padding_mask def preprocess( - self, - source: torch.Tensor, - fbank_mean: float = 15.41663, - fbank_std: float = 6.55582, + self, + source: torch.Tensor, + fbank_mean: float = 15.41663, + fbank_std: float = 6.55582, ) -> torch.Tensor: fbanks = [] for waveform in source: - waveform = waveform.unsqueeze(0) * 2 ** 15 - fbank = ta_kaldi.fbank(waveform, num_mel_bins=128, sample_frequency=16000, frame_length=25, frame_shift=10) + waveform = waveform.unsqueeze(0) * 2**15 + fbank = ta_kaldi.fbank( + waveform, + num_mel_bins=128, + sample_frequency=16000, + frame_length=25, + frame_shift=10, + ) fbanks.append(fbank) fbank = torch.stack(fbanks, dim=0) fbank = (fbank - fbank_mean) / (2 * fbank_std) return fbank def extract_features( - self, - source: torch.Tensor, - padding_mask: Optional[torch.Tensor] = None, - fbank_mean: float = 15.41663, - fbank_std: float = 6.55582, + self, + source: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + fbank_mean: float = 15.41663, + fbank_std: float = 6.55582, ): fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std) @@ -168,7 +193,9 @@ def extract_features( if padding_mask is not None and padding_mask.any(): 
logits[padding_mask] = 0 logits = logits.sum(dim=1) - logits = logits / (~padding_mask).sum(dim=1).unsqueeze(-1).expand_as(logits) + logits = logits / (~padding_mask).sum(dim=1).unsqueeze(-1).expand_as( + logits + ) else: logits = logits.mean(dim=1) @@ -176,4 +203,4 @@ def extract_features( return lprobs, padding_mask else: - return x, padding_mask \ No newline at end of file + return x, padding_mask diff --git a/BEATs_on_ESC50/BEATs/Tokenizers.py b/BEATs_on_ESC50/BEATs/Tokenizers.py index ed3eb2f..37431e9 100644 --- a/BEATs_on_ESC50/BEATs/Tokenizers.py +++ b/BEATs_on_ESC50/BEATs/Tokenizers.py @@ -44,23 +44,37 @@ def __init__(self, cfg=None): # dropouts self.dropout: float = 0.1 # dropout probability for the transformer self.attention_dropout: float = 0.1 # dropout probability for attention weights - self.activation_dropout: float = 0.0 # dropout probability after activation in FFN - self.encoder_layerdrop: float = 0.0 # probability of dropping a tarnsformer layer - self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr) + self.activation_dropout: float = ( + 0.0 # dropout probability after activation in FFN + ) + self.encoder_layerdrop: float = ( + 0.0 # probability of dropping a tarnsformer layer + ) + self.dropout_input: float = ( + 0.0 # dropout to apply to the input (after feat extr) + ) # positional embeddings - self.conv_pos: int = 128 # number of filters for convolutional positional embeddings - self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding + self.conv_pos: int = ( + 128 # number of filters for convolutional positional embeddings + ) + self.conv_pos_groups: int = ( + 16 # number of groups for convolutional positional embedding + ) # relative position embedding - self.relative_position_embedding: bool = False # apply relative position embedding + self.relative_position_embedding: bool = ( + False # apply relative position embedding + ) self.num_buckets: int = 320 # number of buckets for relative position embedding - self.max_distance: int = 1280 # maximum distance for relative position embedding + self.max_distance: int = ( + 1280 # maximum distance for relative position embedding + ) self.gru_rel_pos: bool = False # apply gated relative position embedding # quantizer - self.quant_n: int = 1024 # codebook number in quantizer - self.quant_dim: int = 256 # codebook dimension in quantizer + self.quant_n: int = 1024 # codebook number in quantizer + self.quant_dim: int = 256 # codebook dimension in quantizer if cfg is not None: self.update(cfg) @@ -71,8 +85,8 @@ def update(self, cfg: dict): class Tokenizers(nn.Module): def __init__( - self, - cfg: TokenizersConfig, + self, + cfg: TokenizersConfig, ) -> None: super().__init__() logger.info(f"Tokenizers Config: {cfg.__dict__}") @@ -87,8 +101,13 @@ def __init__( ) self.input_patch_size = cfg.input_patch_size - self.patch_embedding = nn.Conv2d(1, self.embed, kernel_size=self.input_patch_size, stride=self.input_patch_size, - bias=cfg.conv_bias) + self.patch_embedding = nn.Conv2d( + 1, + self.embed, + kernel_size=self.input_patch_size, + stride=self.input_patch_size, + bias=cfg.conv_bias, + ) self.dropout_input = nn.Dropout(cfg.dropout_input) @@ -97,50 +116,58 @@ def __init__( self.layer_norm = LayerNorm(self.embed) self.quantize = NormEMAVectorQuantizer( - n_embed=cfg.quant_n, embedding_dim=cfg.quant_dim, beta=1.0, kmeans_init=True, decay=0.99, + n_embed=cfg.quant_n, + embedding_dim=cfg.quant_dim, + beta=1.0, + kmeans_init=True, + decay=0.99, ) self.quant_n = cfg.quant_n 
self.quantize_layer = nn.Sequential( nn.Linear(cfg.encoder_embed_dim, cfg.encoder_embed_dim), nn.Tanh(), - nn.Linear(cfg.encoder_embed_dim, cfg.quant_dim) # for quantize + nn.Linear(cfg.encoder_embed_dim, cfg.quant_dim), # for quantize ) def forward_padding_mask( - self, - features: torch.Tensor, - padding_mask: torch.Tensor, + self, + features: torch.Tensor, + padding_mask: torch.Tensor, ) -> torch.Tensor: extra = padding_mask.size(1) % features.size(1) if extra > 0: padding_mask = padding_mask[:, :-extra] - padding_mask = padding_mask.view( - padding_mask.size(0), features.size(1), -1 - ) + padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1) padding_mask = padding_mask.all(-1) return padding_mask def preprocess( - self, - source: torch.Tensor, - fbank_mean: float = 15.41663, - fbank_std: float = 6.55582, + self, + source: torch.Tensor, + fbank_mean: float = 15.41663, + fbank_std: float = 6.55582, ) -> torch.Tensor: fbanks = [] for waveform in source: - waveform = waveform.unsqueeze(0) * 2 ** 15 - fbank = ta_kaldi.fbank(waveform, num_mel_bins=128, sample_frequency=16000, frame_length=25, frame_shift=10) + waveform = waveform.unsqueeze(0) * 2**15 + fbank = ta_kaldi.fbank( + waveform, + num_mel_bins=128, + sample_frequency=16000, + frame_length=25, + frame_shift=10, + ) fbanks.append(fbank) fbank = torch.stack(fbanks, dim=0) fbank = (fbank - fbank_mean) / (2 * fbank_std) return fbank def extract_labels( - self, - source: torch.Tensor, - padding_mask: Optional[torch.Tensor] = None, - fbank_mean: float = 15.41663, - fbank_std: float = 6.55582, + self, + source: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + fbank_mean: float = 15.41663, + fbank_std: float = 6.55582, ): fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std) @@ -169,4 +196,4 @@ def extract_labels( quantize_input = self.quantize_layer(x) quantize_feature, embed_loss, embed_ind = self.quantize(quantize_input) - return embed_ind \ No newline at end of file + return embed_ind diff --git a/BEATs_on_ESC50/BEATs/backbone.py b/BEATs_on_ESC50/BEATs/backbone.py index f0eadfc..5189046 100644 --- a/BEATs_on_ESC50/BEATs/backbone.py +++ b/BEATs_on_ESC50/BEATs/backbone.py @@ -78,7 +78,9 @@ def __init__(self, args): if self.relative_position_embedding: for i in range(1, args.encoder_layers): del self.layers[i].self_attn.relative_attention_bias - self.layers[i].self_attn.relative_attention_bias = self.layers[0].self_attn.relative_attention_bias + self.layers[i].self_attn.relative_attention_bias = self.layers[ + 0 + ].self_attn.relative_attention_bias self.layer_norm_first = args.layer_norm_first self.layer_norm = LayerNorm(self.embedding_dim) @@ -90,13 +92,19 @@ def __init__(self, args): deep_norm_beta = math.pow(8 * args.encoder_layers, -1 / 4) for i in range(args.encoder_layers): nn.init.xavier_normal_(self.layers[i].self_attn.k_proj.weight, gain=1) - nn.init.xavier_normal_(self.layers[i].self_attn.v_proj.weight, gain=deep_norm_beta) + nn.init.xavier_normal_( + self.layers[i].self_attn.v_proj.weight, gain=deep_norm_beta + ) nn.init.xavier_normal_(self.layers[i].self_attn.q_proj.weight, gain=1) - nn.init.xavier_normal_(self.layers[i].self_attn.out_proj.weight, gain=deep_norm_beta) + nn.init.xavier_normal_( + self.layers[i].self_attn.out_proj.weight, gain=deep_norm_beta + ) nn.init.xavier_normal_(self.layers[i].fc1.weight, gain=deep_norm_beta) nn.init.xavier_normal_(self.layers[i].fc2.weight, gain=deep_norm_beta) - self.layer_wise_gradient_decay_ratio = getattr(args, 
"layer_wise_gradient_decay_ratio", 1) + self.layer_wise_gradient_decay_ratio = getattr( + args, "layer_wise_gradient_decay_ratio", 1 + ) def forward(self, x, padding_mask=None, layer=None): x, layer_results = self.extract_features(x, padding_mask, layer) @@ -107,7 +115,6 @@ def forward(self, x, padding_mask=None, layer=None): return x, layer_results def extract_features(self, x, padding_mask=None, tgt_layer=None): - if padding_mask is not None: x[padding_mask] = 0 @@ -136,7 +143,12 @@ def extract_features(self, x, padding_mask=None, tgt_layer=None): x = GradMultiply.apply(x, self.layer_wise_gradient_decay_ratio) dropout_probability = np.random.random() if not self.training or (dropout_probability > self.layerdrop): - x, z, pos_bias = layer(x, self_attn_padding_mask=padding_mask, need_weights=False, pos_bias=pos_bias) + x, z, pos_bias = layer( + x, + self_attn_padding_mask=padding_mask, + need_weights=False, + pos_bias=pos_bias, + ) if tgt_layer is not None: layer_results.append((x, z)) if i == tgt_layer: @@ -154,24 +166,23 @@ def extract_features(self, x, padding_mask=None, tgt_layer=None): class TransformerSentenceEncoderLayer(nn.Module): def __init__( - self, - embedding_dim: float = 768, - ffn_embedding_dim: float = 3072, - num_attention_heads: float = 8, - dropout: float = 0.1, - attention_dropout: float = 0.1, - activation_dropout: float = 0.1, - activation_fn: str = "relu", - layer_norm_first: bool = False, - deep_norm: bool = False, - has_relative_attention_bias: bool = False, - num_buckets: int = 0, - max_distance: int = 0, - rescale_init: bool = False, - gru_rel_pos: bool = False, - encoder_layers: int = 0, + self, + embedding_dim: float = 768, + ffn_embedding_dim: float = 3072, + num_attention_heads: float = 8, + dropout: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, + activation_fn: str = "relu", + layer_norm_first: bool = False, + deep_norm: bool = False, + has_relative_attention_bias: bool = False, + num_buckets: int = 0, + max_distance: int = 0, + rescale_init: bool = False, + gru_rel_pos: bool = False, + encoder_layers: int = 0, ) -> None: - super().__init__() self.embedding_dim = embedding_dim self.dropout = dropout @@ -214,12 +225,12 @@ def __init__( self.deep_norm_alpha = 1 def forward( - self, - x: torch.Tensor, - self_attn_mask: torch.Tensor = None, - self_attn_padding_mask: torch.Tensor = None, - need_weights: bool = False, - pos_bias=None + self, + x: torch.Tensor, + self_attn_mask: torch.Tensor = None, + self_attn_padding_mask: torch.Tensor = None, + need_weights: bool = False, + pos_bias=None, ): residual = x @@ -232,7 +243,7 @@ def forward( key_padding_mask=self_attn_padding_mask, need_weights=False, attn_mask=self_attn_mask, - position_bias=pos_bias + position_bias=pos_bias, ) x = self.dropout1(x) x = residual + x @@ -255,7 +266,7 @@ def forward( key_padding_mask=self_attn_padding_mask, need_weights=need_weights, attn_mask=self_attn_mask, - position_bias=pos_bias + position_bias=pos_bias, ) x = self.dropout1(x) @@ -284,24 +295,24 @@ class MultiheadAttention(nn.Module): """ def __init__( - self, - embed_dim, - num_heads, - kdim=None, - vdim=None, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - self_attention=False, - encoder_decoder_attention=False, - q_noise=0.0, - qn_block_size=8, - has_relative_attention_bias=False, - num_buckets=32, - max_distance=128, - gru_rel_pos=False, - rescale_init=False, + self, + embed_dim, + num_heads, + kdim=None, + vdim=None, + dropout=0.0, + bias=True, + 
add_bias_kv=False, + add_zero_attn=False, + self_attention=False, + encoder_decoder_attention=False, + q_noise=0.0, + qn_block_size=8, + has_relative_attention_bias=False, + num_buckets=32, + max_distance=128, + gru_rel_pos=False, + rescale_init=False, ): super().__init__() self.embed_dim = embed_dim @@ -322,9 +333,9 @@ def __init__( self.q_head_dim = self.head_dim self.k_head_dim = self.head_dim assert ( - self.head_dim * num_heads == self.embed_dim + self.head_dim * num_heads == self.embed_dim ), "embed_dim must be divisible by num_heads" - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.self_attention = self_attention self.encoder_decoder_attention = encoder_decoder_attention @@ -401,21 +412,26 @@ def _relative_positions_bucket(self, relative_positions, bidirectional=True): relative_buckets += (relative_positions > 0).to(torch.long) * num_buckets relative_positions = torch.abs(relative_positions) else: - relative_positions = -torch.min(relative_positions, torch.zeros_like(relative_positions)) + relative_positions = -torch.min( + relative_positions, torch.zeros_like(relative_positions) + ) max_exact = num_buckets // 2 is_small = relative_positions < max_exact relative_postion_if_large = max_exact + ( - torch.log(relative_positions.float() / max_exact) - / math.log(max_distance / max_exact) - * (num_buckets - max_exact) + torch.log(relative_positions.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) ).to(torch.long) relative_postion_if_large = torch.min( - relative_postion_if_large, torch.full_like(relative_postion_if_large, num_buckets - 1) + relative_postion_if_large, + torch.full_like(relative_postion_if_large, num_buckets - 1), ) - relative_buckets += torch.where(is_small, relative_positions, relative_postion_if_large) + relative_buckets += torch.where( + is_small, relative_positions, relative_postion_if_large + ) return relative_buckets def compute_bias(self, query_length, key_length): @@ -423,27 +439,28 @@ def compute_bias(self, query_length, key_length): memory_position = torch.arange(key_length, dtype=torch.long)[None, :] relative_position = memory_position - context_position relative_position_bucket = self._relative_positions_bucket( - relative_position, - bidirectional=True + relative_position, bidirectional=True + ) + relative_position_bucket = relative_position_bucket.to( + self.relative_attention_bias.weight.device ) - relative_position_bucket = relative_position_bucket.to(self.relative_attention_bias.weight.device) values = self.relative_attention_bias(relative_position_bucket) values = values.permute([2, 0, 1]) return values def forward( - self, - query, - key: Optional[Tensor], - value: Optional[Tensor], - key_padding_mask: Optional[Tensor] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - need_weights: bool = True, - static_kv: bool = False, - attn_mask: Optional[Tensor] = None, - before_softmax: bool = False, - need_head_weights: bool = False, - position_bias: Optional[Tensor] = None + self, + query, + key: Optional[Tensor], + value: Optional[Tensor], + key_padding_mask: Optional[Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + need_weights: bool = True, + static_kv: bool = False, + attn_mask: Optional[Tensor] = None, + before_softmax: bool = False, + need_head_weights: bool = False, + position_bias: Optional[Tensor] = None, ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]: """Input shape: Time x Batch x Channel 
@@ -480,7 +497,11 @@ def forward( if self.has_relative_attention_bias and position_bias is None: position_bias = self.compute_bias(tgt_len, src_len) - position_bias = position_bias.unsqueeze(0).repeat(bsz, 1, 1, 1).view(bsz * self.num_heads, tgt_len, src_len) + position_bias = ( + position_bias.unsqueeze(0) + .repeat(bsz, 1, 1, 1) + .view(bsz * self.num_heads, tgt_len, src_len) + ) if incremental_state is not None: saved_state = self._get_input_buffer(incremental_state) @@ -535,20 +556,20 @@ def forward( q = ( q.contiguous() - .view(tgt_len, bsz * self.num_heads, self.q_head_dim) - .transpose(0, 1) + .view(tgt_len, bsz * self.num_heads, self.q_head_dim) + .transpose(0, 1) ) if k is not None: k = ( k.contiguous() - .view(-1, bsz * self.num_heads, self.k_head_dim) - .transpose(0, 1) + .view(-1, bsz * self.num_heads, self.k_head_dim) + .transpose(0, 1) ) if v is not None: v = ( v.contiguous() - .view(-1, bsz * self.num_heads, self.head_dim) - .transpose(0, 1) + .view(-1, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) ) if saved_state is not None: @@ -623,7 +644,9 @@ def forward( ) attn_weights = torch.bmm(q, k.transpose(1, 2)) - attn_weights = (attn_weights - attn_weights.max(dim=-1, keepdim=True)[0]) * alpha + attn_weights = ( + attn_weights - attn_weights.max(dim=-1, keepdim=True)[0] + ) * alpha attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len] @@ -652,20 +675,27 @@ def forward( if position_bias is not None: attn_mask_rel_pos = position_bias if self.gru_rel_pos == 1: - query_layer = q.view(bsz, self.num_heads, tgt_len, self.q_head_dim) * alpha / self.scaling + query_layer = ( + q.view(bsz, self.num_heads, tgt_len, self.q_head_dim) + * alpha + / self.scaling + ) _B, _H, _L, __ = query_layer.size() - gate_a, gate_b = torch.sigmoid(self.grep_linear(query_layer).view( - _B, _H, _L, 2, 4).sum(-1, keepdim=False)).chunk(2, dim=-1) + gate_a, gate_b = torch.sigmoid( + self.grep_linear(query_layer) + .view(_B, _H, _L, 2, 4) + .sum(-1, keepdim=False) + ).chunk(2, dim=-1) gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0 - attn_mask_rel_pos = gate_a_1.view(bsz * self.num_heads, tgt_len, 1) * position_bias + attn_mask_rel_pos = ( + gate_a_1.view(bsz * self.num_heads, tgt_len, 1) * position_bias + ) attn_mask_rel_pos = attn_mask_rel_pos.view(attn_weights.size()) attn_weights = attn_weights + attn_mask_rel_pos - attn_weights_float = F.softmax( - attn_weights, dim=-1 - ) + attn_weights_float = F.softmax(attn_weights, dim=-1) attn_weights = attn_weights_float.type_as(attn_weights) attn_probs = self.dropout_module(attn_weights) @@ -687,11 +717,11 @@ def forward( @staticmethod def _append_prev_key_padding_mask( - key_padding_mask: Optional[Tensor], - prev_key_padding_mask: Optional[Tensor], - batch_size: int, - src_len: int, - static_kv: bool, + key_padding_mask: Optional[Tensor], + prev_key_padding_mask: Optional[Tensor], + batch_size: int, + src_len: int, + static_kv: bool, ) -> Optional[Tensor]: # saved key padding masks have shape (bsz, seq_len) if prev_key_padding_mask is not None and static_kv: @@ -730,7 +760,7 @@ def _append_prev_key_padding_mask( return new_key_padding_mask def _get_input_buffer( - self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] + self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] ) -> Dict[str, Optional[Tensor]]: result = self.get_incremental_state(incremental_state, "attn_state") if result is not None: @@ -740,9 +770,9 
@@ def _get_input_buffer( return empty_result def _set_input_buffer( - self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - buffer: Dict[str, Optional[Tensor]], + self, + incremental_state: Dict[str, Dict[str, Optional[Tensor]]], + buffer: Dict[str, Optional[Tensor]], ): return self.set_incremental_state(incremental_state, "attn_state", buffer) @@ -767,9 +797,7 @@ def init_bert_params(module): def normal_(data): # with FSDP, module params will be on CUDA, so we cast them back to CPU # so that the RNG is consistent with and without FSDP - data.copy_( - data.cpu().normal_(mean=0.0, std=0.02).to(data.device) - ) + data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device)) if isinstance(module, nn.Linear): normal_(module.weight.data) @@ -782,4 +810,4 @@ def normal_(data): if isinstance(module, MultiheadAttention): normal_(module.q_proj.weight.data) normal_(module.k_proj.weight.data) - normal_(module.v_proj.weight.data) \ No newline at end of file + normal_(module.v_proj.weight.data) diff --git a/BEATs_on_ESC50/BEATs/modules.py b/BEATs_on_ESC50/BEATs/modules.py index 8bbf251..2de7429 100644 --- a/BEATs_on_ESC50/BEATs/modules.py +++ b/BEATs_on_ESC50/BEATs/modules.py @@ -75,9 +75,14 @@ def forward(self, x): x = self.linear(x) if self.glu_type == "bilinear": - x = (x[:, :, 0:self.output_dim] * x[:, :, self.output_dim:self.output_dim * 2]) + x = ( + x[:, :, 0 : self.output_dim] + * x[:, :, self.output_dim : self.output_dim * 2] + ) else: - x = (x[:, :, 0:self.output_dim] * self.glu_act(x[:, :, self.output_dim:self.output_dim * 2])) + x = x[:, :, 0 : self.output_dim] * self.glu_act( + x[:, :, self.output_dim : self.output_dim * 2] + ) return x @@ -102,9 +107,7 @@ def get_activation_fn(activation: str): elif activation == "gelu": return gelu elif activation == "gelu_fast": - warnings.warn( - "--activation-fn=gelu_fast has been renamed to gelu_accurate" - ) + warnings.warn("--activation-fn=gelu_fast has been renamed to gelu_accurate") return gelu_accurate elif activation == "gelu_accurate": return gelu_accurate @@ -215,4 +218,4 @@ def _forward_pre_hook(mod, input): mod.weight.data = s * weight.masked_fill(mask, 0) module.register_forward_pre_hook(_forward_pre_hook) - return module \ No newline at end of file + return module diff --git a/BEATs_on_ESC50/BEATs/quantizer.py b/BEATs_on_ESC50/BEATs/quantizer.py index 704be4c..dd020b6 100644 --- a/BEATs_on_ESC50/BEATs/quantizer.py +++ b/BEATs_on_ESC50/BEATs/quantizer.py @@ -46,9 +46,10 @@ def kmeans(samples, num_clusters, num_iters=10, use_cosine_sim=False): if use_cosine_sim: dists = samples @ means.t() else: - diffs = rearrange(samples, 'n d -> n () d') \ - - rearrange(means, 'c d -> () c d') - dists = -(diffs ** 2).sum(dim=-1) + diffs = rearrange(samples, "n d -> n () d") - rearrange( + means, "c d -> () c d" + ) + dists = -(diffs**2).sum(dim=-1) buckets = dists.max(dim=-1).indices bins = torch.bincount(buckets, minlength=num_clusters) @@ -56,7 +57,7 @@ def kmeans(samples, num_clusters, num_iters=10, use_cosine_sim=False): bins_min_clamped = bins.masked_fill(zero_mask, 1) new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype) - new_means.scatter_add_(0, repeat(buckets, 'n -> n d', d=dim), samples) + new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples) new_means = new_means / bins_min_clamped[..., None] if use_cosine_sim: @@ -68,24 +69,32 @@ def kmeans(samples, num_clusters, num_iters=10, use_cosine_sim=False): class EmbeddingEMA(nn.Module): - def __init__(self, num_tokens, codebook_dim, decay=0.99, 
eps=1e-5, kmeans_init=True, codebook_init_path=''): + def __init__( + self, + num_tokens, + codebook_dim, + decay=0.99, + eps=1e-5, + kmeans_init=True, + codebook_init_path="", + ): super().__init__() self.num_tokens = num_tokens self.codebook_dim = codebook_dim self.decay = decay self.eps = eps - if codebook_init_path == '': + if codebook_init_path == "": if not kmeans_init: weight = torch.randn(num_tokens, codebook_dim) weight = l2norm(weight) else: weight = torch.zeros(num_tokens, codebook_dim) - self.register_buffer('initted', torch.Tensor([not kmeans_init])) + self.register_buffer("initted", torch.Tensor([not kmeans_init])) else: print(f"load init codebook weight from {codebook_init_path}") - codebook_ckpt_weight = torch.load(codebook_init_path, map_location='cpu') + codebook_ckpt_weight = torch.load(codebook_init_path, map_location="cpu") weight = codebook_ckpt_weight.clone() - self.register_buffer('initted', torch.Tensor([True])) + self.register_buffer("initted", torch.Tensor([True])) self.weight = nn.Parameter(weight, requires_grad=False) self.cluster_size = nn.Parameter(torch.zeros(num_tokens), requires_grad=False) @@ -107,7 +116,9 @@ def forward(self, embed_id): return F.embedding(embed_id, self.weight) def cluster_size_ema_update(self, new_cluster_size): - self.cluster_size.data.mul_(self.decay).add_(new_cluster_size, alpha=1 - self.decay) + self.cluster_size.data.mul_(self.decay).add_( + new_cluster_size, alpha=1 - self.decay + ) def embed_avg_ema_update(self, new_embed_avg): self.embed_avg.data.mul_(self.decay).add_(new_embed_avg, alpha=1 - self.decay) @@ -115,7 +126,7 @@ def embed_avg_ema_update(self, new_embed_avg): def weight_update(self, num_tokens): n = self.cluster_size.sum() smoothed_cluster_size = ( - (self.cluster_size + self.eps) / (n + num_tokens * self.eps) * n + (self.cluster_size + self.eps) / (n + num_tokens * self.eps) * n ) # normalize embedding average with smoothed cluster size embed_normalized = self.embed_avg / smoothed_cluster_size.unsqueeze(1) @@ -129,8 +140,17 @@ def norm_ema_inplace(moving_avg, new, decay): class NormEMAVectorQuantizer(nn.Module): - def __init__(self, n_embed, embedding_dim, beta, decay=0.99, eps=1e-5, - statistic_code_usage=True, kmeans_init=False, codebook_init_path=''): + def __init__( + self, + n_embed, + embedding_dim, + beta, + decay=0.99, + eps=1e-5, + statistic_code_usage=True, + kmeans_init=False, + codebook_init_path="", + ): super().__init__() self.codebook_dim = embedding_dim self.num_tokens = n_embed @@ -138,20 +158,29 @@ def __init__(self, n_embed, embedding_dim, beta, decay=0.99, eps=1e-5, self.decay = decay # learnable = True if orthogonal_reg_weight > 0 else False - self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay, eps, kmeans_init, codebook_init_path) + self.embedding = EmbeddingEMA( + self.num_tokens, + self.codebook_dim, + decay, + eps, + kmeans_init, + codebook_init_path, + ) self.statistic_code_usage = statistic_code_usage if statistic_code_usage: - self.register_buffer('cluster_size', torch.zeros(n_embed)) + self.register_buffer("cluster_size", torch.zeros(n_embed)) if distributed.is_available() and distributed.is_initialized(): - print("ddp is enable, so use ddp_reduce to sync the statistic_code_usage for each gpu!") + print( + "ddp is enable, so use ddp_reduce to sync the statistic_code_usage for each gpu!" 
+ ) self.all_reduce_fn = distributed.all_reduce else: self.all_reduce_fn = nn.Identity() def reset_cluster_size(self, device): if self.statistic_code_usage: - self.register_buffer('cluster_size', torch.zeros(self.num_tokens)) + self.register_buffer("cluster_size", torch.zeros(self.num_tokens)) self.cluster_size = self.cluster_size.to(device) def forward(self, z): @@ -164,9 +193,11 @@ def forward(self, z): self.embedding.init_embed_(z_flattened) - d = z_flattened.pow(2).sum(dim=1, keepdim=True) + \ - self.embedding.weight.pow(2).sum(dim=1) - 2 * \ - torch.einsum('bd,nd->bn', z_flattened, self.embedding.weight) # 'n d -> d n' + d = ( + z_flattened.pow(2).sum(dim=1, keepdim=True) + + self.embedding.weight.pow(2).sum(dim=1) + - 2 * torch.einsum("bd,nd->bn", z_flattened, self.embedding.weight) + ) # 'n d -> d n' encoding_indices = torch.argmin(d, dim=1) @@ -189,8 +220,8 @@ def forward(self, z): # self.embedding.cluster_size_ema_update(bins) ema_inplace(self.cluster_size, bins, self.decay) - zero_mask = (bins == 0) - bins = bins.masked_fill(zero_mask, 1.) + zero_mask = bins == 0 + bins = bins.masked_fill(zero_mask, 1.0) embed_sum = z_flattened.t() @ encodings self.all_reduce_fn(embed_sum) @@ -198,8 +229,9 @@ def forward(self, z): embed_normalized = (embed_sum / bins.unsqueeze(0)).t() embed_normalized = l2norm(embed_normalized) - embed_normalized = torch.where(zero_mask[..., None], self.embedding.weight, - embed_normalized) + embed_normalized = torch.where( + zero_mask[..., None], self.embedding.weight, embed_normalized + ) norm_ema_inplace(self.embedding.weight, embed_normalized, self.decay) # compute loss for embedding @@ -212,4 +244,4 @@ def forward(self, z): # z_q, 'b h w c -> b c h w' # z_q = rearrange(z_q, 'b h w c -> b c h w') # z_q = z_q.transpose(1, 2) - return z_q, loss, encoding_indices \ No newline at end of file + return z_q, loss, encoding_indices diff --git a/BEATs_on_ESC50/evaluation/plot_2d_features.py b/BEATs_on_ESC50/evaluation/plot_2d_features.py index b9b0699..48edb10 100755 --- a/BEATs_on_ESC50/evaluation/plot_2d_features.py +++ b/BEATs_on_ESC50/evaluation/plot_2d_features.py @@ -73,12 +73,17 @@ def loadBEATs(ft_model): else: print("Using the fine tuned model to get the representations") ft = torch.load(ft_model) - ft["state_dict"] = {key.replace('beats.', ''): value for key, value in ft["state_dict"].items() if not key.endswith(('fc.weight', 'fc.bias'))} + ft["state_dict"] = { + key.replace("beats.", ""): value + for key, value in ft["state_dict"].items() + if not key.endswith(("fc.weight", "fc.bias")) + } BEATs_model.load_state_dict(ft["state_dict"]) BEATs_model.eval() return BEATs_model + def extractFeatures(BEATs_model, trs): for t in trs: padding_mask = torch.zeros(t.shape[0], t.shape[1]).bool() @@ -86,6 +91,7 @@ def extractFeatures(BEATs_model, trs): representation = representation[:, -1, :] yield representation.detach().numpy() + def get_2d_features(features, perplexity): representation = np.concatenate(np.array(list(features)), axis=0) tsne = TSNE(n_components=2, perplexity=perplexity) @@ -192,4 +198,4 @@ def main(afiles, labels, model_path, fig_name, perplexity): cli_args.ft_model, cli_args.fig_name, cli_args.perplexity, - ) \ No newline at end of file + ) diff --git a/BEATs_on_ESC50/fine_tune/trainer.py b/BEATs_on_ESC50/fine_tune/trainer.py index 7ebcc11..b7ae8e3 100755 --- a/BEATs_on_ESC50/fine_tune/trainer.py +++ b/BEATs_on_ESC50/fine_tune/trainer.py @@ -21,11 +21,13 @@ def add_arguments_to_parser(self, parser): } ) + def cli_main(): MyLightningCLI( 
BEATsTransferLearningModel, ECS50DataModule, seed_everything_default=42 ) + if __name__ == "__main__": cli_lightning_logo() - cli_main() \ No newline at end of file + cli_main() diff --git a/BEATs_on_ESC50/fine_tune/transferLearning.py b/BEATs_on_ESC50/fine_tune/transferLearning.py index a0b5555..67aa824 100755 --- a/BEATs_on_ESC50/fine_tune/transferLearning.py +++ b/BEATs_on_ESC50/fine_tune/transferLearning.py @@ -23,7 +23,7 @@ def __init__( lr_scheduler_gamma: float = 1e-1, num_workers: int = 6, model_path: str = "/model/BEATs_iter3_plus_AS2M.pt", - ft_entire_network: bool = False, # Boolean on whether the classifier layer + BEATs should be fine-tuned + ft_entire_network: bool = False, # Boolean on whether the classifier layer + BEATs should be fine-tuned **kwargs, ) -> None: """TransferLearningModel. @@ -123,13 +123,13 @@ def configure_optimizers(self): if self.ft_entire_network: optimizer = optim.AdamW( [{"params": self.beats.parameters()}, {"params": self.fc.parameters()}], - lr=self.lr, betas=(0.9, 0.98), weight_decay=0.01 - ) + lr=self.lr, + betas=(0.9, 0.98), + weight_decay=0.01, + ) else: optimizer = optim.AdamW( - self.fc.parameters(), - lr=self.lr, betas=(0.9, 0.98), weight_decay=0.01 - ) - + self.fc.parameters(), lr=self.lr, betas=(0.9, 0.98), weight_decay=0.01 + ) return optimizer diff --git a/Models/baseline.py b/Models/baseline.py index 34392c4..b34bbf9 100644 --- a/Models/baseline.py +++ b/Models/baseline.py @@ -3,31 +3,33 @@ import torch from collections import OrderedDict -def conv_block(in_channels,out_channels): +def conv_block(in_channels, out_channels): return nn.Sequential( - nn.Conv2d(in_channels,out_channels,3,padding=1), + nn.Conv2d(in_channels, out_channels, 3, padding=1), nn.BatchNorm2d(out_channels), nn.ReLU(), - nn.MaxPool2d(2) + nn.MaxPool2d(2), ) + class ProtoNet(nn.Module): def __init__(self): - super(ProtoNet,self).__init__() + super(ProtoNet, self).__init__() self.encoder = nn.Sequential( - conv_block(1,64), - conv_block(64,64), - conv_block(64,64), - conv_block(64,64) + conv_block(1, 64), + conv_block(64, 64), + conv_block(64, 64), + conv_block(64, 64), ) - def forward(self,x): - (num_samples,seq_len,mel_bins) = x.shape - x = x.view(-1,1,seq_len,mel_bins) + + def forward(self, x): + (num_samples, seq_len, mel_bins) = x.shape + x = x.view(-1, 1, seq_len, mel_bins) x = self.encoder(x) x = nn.MaxPool2d(2)(x) - - return x.view(x.size(0),-1) - + + return x.view(x.size(0), -1) + def extract_features(self, x, padding_mask=None): - return self.forward(x) \ No newline at end of file + return self.forward(x) diff --git a/Models/pann.py b/Models/pann.py index 8079fda..38573ae 100644 --- a/Models/pann.py +++ b/Models/pann.py @@ -6,124 +6,144 @@ from torchlibrosa.stft import Spectrogram, LogmelFilterBank from torchlibrosa.augmentation import SpecAugmentation -#from pytorch_utils import do_mixup, interpolate, pad_framewise_output - +# from pytorch_utils import do_mixup, interpolate, pad_framewise_output + def init_layer(layer): - """Initialize a Linear or Convolutional layer. """ + """Initialize a Linear or Convolutional layer.""" nn.init.xavier_uniform_(layer.weight) - - if hasattr(layer, 'bias'): + + if hasattr(layer, "bias"): if layer.bias is not None: - layer.bias.data.fill_(0.) - - + layer.bias.data.fill_(0.0) + + def init_bn(bn): - """Initialize a Batchnorm layer. """ - bn.bias.data.fill_(0.) - bn.weight.data.fill_(1.) 
+ """Initialize a Batchnorm layer.""" + bn.bias.data.fill_(0.0) + bn.weight.data.fill_(1.0) class ConvBlock(nn.Module): def __init__(self, in_channels, out_channels): - super(ConvBlock, self).__init__() - - self.conv1 = nn.Conv2d(in_channels=in_channels, - out_channels=out_channels, - kernel_size=(3, 3), stride=(1, 1), - padding=(1, 1), bias=False) - - self.conv2 = nn.Conv2d(in_channels=out_channels, - out_channels=out_channels, - kernel_size=(3, 3), stride=(1, 1), - padding=(1, 1), bias=False) - + + self.conv1 = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ) + + self.conv2 = nn.Conv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ) + self.bn1 = nn.BatchNorm2d(out_channels) self.bn2 = nn.BatchNorm2d(out_channels) self.init_weight() - + def init_weight(self): init_layer(self.conv1) init_layer(self.conv2) init_bn(self.bn1) init_bn(self.bn2) - - def forward(self, input, pool_size=(2, 2), pool_type='avg'): - + def forward(self, input, pool_size=(2, 2), pool_type="avg"): x = input x = F.relu_(self.bn1(self.conv1(x))) x = F.relu_(self.bn2(self.conv2(x))) - if pool_type == 'max': + if pool_type == "max": x = F.max_pool2d(x, kernel_size=pool_size) - elif pool_type == 'avg': + elif pool_type == "avg": x = F.avg_pool2d(x, kernel_size=pool_size) - elif pool_type == 'avg+max': + elif pool_type == "avg+max": x1 = F.avg_pool2d(x, kernel_size=pool_size) x2 = F.max_pool2d(x, kernel_size=pool_size) x = x1 + x2 else: - raise Exception('Incorrect argument!') - + raise Exception("Incorrect argument!") + return x class ConvBlock5x5(nn.Module): def __init__(self, in_channels, out_channels): - super(ConvBlock5x5, self).__init__() - - self.conv1 = nn.Conv2d(in_channels=in_channels, - out_channels=out_channels, - kernel_size=(5, 5), stride=(1, 1), - padding=(2, 2), bias=False) - + + self.conv1 = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(5, 5), + stride=(1, 1), + padding=(2, 2), + bias=False, + ) + self.bn1 = nn.BatchNorm2d(out_channels) self.init_weight() - + def init_weight(self): init_layer(self.conv1) init_bn(self.bn1) - - def forward(self, input, pool_size=(2, 2), pool_type='avg'): - + def forward(self, input, pool_size=(2, 2), pool_type="avg"): x = input x = F.relu_(self.bn1(self.conv1(x))) - if pool_type == 'max': + if pool_type == "max": x = F.max_pool2d(x, kernel_size=pool_size) - elif pool_type == 'avg': + elif pool_type == "avg": x = F.avg_pool2d(x, kernel_size=pool_size) - elif pool_type == 'avg+max': + elif pool_type == "avg+max": x1 = F.avg_pool2d(x, kernel_size=pool_size) x2 = F.max_pool2d(x, kernel_size=pool_size) x = x1 + x2 else: - raise Exception('Incorrect argument!') - + raise Exception("Incorrect argument!") + return x class AttBlock(nn.Module): - def __init__(self, n_in, n_out, activation='linear', temperature=1.): + def __init__(self, n_in, n_out, activation="linear", temperature=1.0): super(AttBlock, self).__init__() - + self.activation = activation self.temperature = temperature - self.att = nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0, bias=True) - self.cla = nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0, bias=True) - + self.att = nn.Conv1d( + in_channels=n_in, + out_channels=n_out, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + self.cla = nn.Conv1d( + in_channels=n_in, + 
out_channels=n_out, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + self.bn_att = nn.BatchNorm1d(n_out) self.init_weights() - + def init_weights(self): init_layer(self.att) init_layer(self.cla) init_bn(self.bn_att) - + def forward(self, x): # x: (n_samples, n_in, n_time) norm_att = torch.softmax(torch.clamp(self.att(x), -10, 10), dim=-1) @@ -132,42 +152,44 @@ def forward(self, x): return x, norm_att, cla def nonlinear_transform(self, x): - if self.activation == 'linear': + if self.activation == "linear": return x - elif self.activation == 'sigmoid': + elif self.activation == "sigmoid": return torch.sigmoid(x) class Cnn14(nn.Module): - #def __init__(self, sample_rate=1, window_size=1, hop_size=1, mel_bins=1, fmin=1, + # def __init__(self, sample_rate=1, window_size=1, hop_size=1, mel_bins=1, fmin=1, # fmax=1, classes_num=1): - def __init__(self, classes_num=1): + def __init__(self, classes_num=1): super(Cnn14, self).__init__() - #window = 'hann' - #center = True - #pad_mode = 'reflect' - #ref = 1.0 - #amin = 1e-10 - #top_db = None + # window = 'hann' + # center = True + # pad_mode = 'reflect' + # ref = 1.0 + # amin = 1e-10 + # top_db = None # Spectrogram extractor - #self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, - # win_length=window_size, window=window, center=center, pad_mode=pad_mode, + # self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, + # win_length=window_size, window=window, center=center, pad_mode=pad_mode, # freeze_parameters=True) # Logmel feature extractor - #self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, - # n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, + # self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, + # n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, # freeze_parameters=True) # Spec augmenter - #self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, + # self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, # freq_drop_width=8, freq_stripes_num=2) - - # OUR INPUT IS A SPECTROGRAM 128x128 SO HERE WE MAXPOOL TO CONFORM TO - # THE INPUT OF PANN - self.pool = nn.MaxPool2d(kernel_size=2, stride=2) # --> (batch_size, n_channels, H=128, W=128) --> (batch_size, n_channels, H=64, W=64) + + # OUR INPUT IS A SPECTROGRAM 128x128 SO HERE WE MAXPOOL TO CONFORM TO + # THE INPUT OF PANN + self.pool = nn.MaxPool2d( + kernel_size=2, stride=2 + ) # --> (batch_size, n_channels, H=128, W=128) --> (batch_size, n_channels, H=64, W=64) self.bn0 = nn.BatchNorm2d(64) @@ -178,66 +200,66 @@ def __init__(self, classes_num=1): self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024) self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048) - self.fc1 = nn.Linear(2048, 2048, bias=True) # We get an embedding in a 2048 dimension space - #self.fc_audioset = nn.Linear(2048, classes_num, bias=True) - + self.fc1 = nn.Linear( + 2048, 2048, bias=True + ) # We get an embedding in a 2048 dimension space + # self.fc_audioset = nn.Linear(2048, classes_num, bias=True) + self.init_weight() def init_weight(self): init_bn(self.bn0) init_layer(self.fc1) - #init_layer(self.fc_audioset) - + # init_layer(self.fc_audioset) + def forward(self, input, mixup_lambda=None): """ Input: (batch_size, data_length)""" - #x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins) --> 1 is the number of channels - #x = self.logmel_extractor(x) # (batch_size, 1, 
time_steps, mel_bins) + # x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins) --> 1 is the number of channels + # x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) x = input.unsqueeze(1) x = self.pool(x) x = x.transpose(1, 3) x = self.bn0(x) x = x.transpose(1, 3) - - #if self.training: + + # if self.training: # x = self.spec_augmenter(x) # Mixup on spectrogram - #if self.training and mixup_lambda is not None: + # if self.training and mixup_lambda is not None: # x = do_mixup(x, mixup_lambda) - x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') + x = self.conv_block1(x, pool_size=(2, 2), pool_type="avg") x = F.dropout(x, p=0.2, training=self.training) - x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') + x = self.conv_block2(x, pool_size=(2, 2), pool_type="avg") x = F.dropout(x, p=0.2, training=self.training) - x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') + x = self.conv_block3(x, pool_size=(2, 2), pool_type="avg") x = F.dropout(x, p=0.2, training=self.training) - x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') + x = self.conv_block4(x, pool_size=(2, 2), pool_type="avg") x = F.dropout(x, p=0.2, training=self.training) - x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg') + x = self.conv_block5(x, pool_size=(2, 2), pool_type="avg") x = F.dropout(x, p=0.2, training=self.training) - x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg') + x = self.conv_block6(x, pool_size=(1, 1), pool_type="avg") x = F.dropout(x, p=0.2, training=self.training) x = torch.mean(x, dim=3) - + (x1, _) = torch.max(x, dim=2) x2 = torch.mean(x, dim=2) x = x1 + x2 x = F.dropout(x, p=0.5, training=self.training) x = F.relu_(self.fc1(x)) embedding = F.dropout(x, p=0.5, training=self.training) - #clipwise_output = torch.sigmoid(self.fc_audioset(x)) - - #output_dict = {'clipwise_output': clipwise_output, 'embedding': embedding} + # clipwise_output = torch.sigmoid(self.fc_audioset(x)) + + # output_dict = {'clipwise_output': clipwise_output, 'embedding': embedding} + + return embedding # output_dict - return embedding # output_dict - def extract_features(self, x, padding_mask=None): return self.forward(x) - - import numpy as np import time import torch @@ -245,9 +267,9 @@ def extract_features(self, x, padding_mask=None): def move_data_to_device(x, device): - if 'float' in str(x.dtype): + if "float" in str(x.dtype): x = torch.Tensor(x) - elif 'int' in str(x.dtype): + elif "int" in str(x.dtype): x = torch.LongTensor(x) else: return x @@ -256,7 +278,7 @@ def move_data_to_device(x, device): def do_mixup(x, mixup_lambda): - """Mixup x of even indexes (0, 2, 4, ...) with x of odd indexes + """Mixup x of even indexes (0, 2, 4, ...) with x of odd indexes (1, 3, 5, ...). Args: @@ -266,10 +288,12 @@ def do_mixup(x, mixup_lambda): Returns: out: (batch_size, ...) """ - out = (x[0 :: 2].transpose(0, -1) * mixup_lambda[0 :: 2] + \ - x[1 :: 2].transpose(0, -1) * mixup_lambda[1 :: 2]).transpose(0, -1) + out = ( + x[0::2].transpose(0, -1) * mixup_lambda[0::2] + + x[1::2].transpose(0, -1) * mixup_lambda[1::2] + ).transpose(0, -1) return out - + def append_to_dict(dict, key, value): if key in dict.keys(): @@ -278,11 +302,10 @@ def append_to_dict(dict, key, value): dict[key] = [value] -def forward(model, generator, return_input=False, - return_target=False): +def forward(model, generator, return_input=False, return_target=False): """Forward data to a model. 
- - Args: + + Args: model: object generator: object return_input: bool @@ -303,35 +326,47 @@ def forward(model, generator, return_input=False, # Forward data to a model in mini-batches for n, batch_data_dict in enumerate(generator): print(n) - batch_waveform = move_data_to_device(batch_data_dict['waveform'], device) - + batch_waveform = move_data_to_device(batch_data_dict["waveform"], device) + with torch.no_grad(): model.eval() batch_output = model(batch_waveform) - append_to_dict(output_dict, 'audio_name', batch_data_dict['audio_name']) + append_to_dict(output_dict, "audio_name", batch_data_dict["audio_name"]) + + append_to_dict( + output_dict, + "clipwise_output", + batch_output["clipwise_output"].data.cpu().numpy(), + ) - append_to_dict(output_dict, 'clipwise_output', - batch_output['clipwise_output'].data.cpu().numpy()) + if "segmentwise_output" in batch_output.keys(): + append_to_dict( + output_dict, + "segmentwise_output", + batch_output["segmentwise_output"].data.cpu().numpy(), + ) - if 'segmentwise_output' in batch_output.keys(): - append_to_dict(output_dict, 'segmentwise_output', - batch_output['segmentwise_output'].data.cpu().numpy()) + if "framewise_output" in batch_output.keys(): + append_to_dict( + output_dict, + "framewise_output", + batch_output["framewise_output"].data.cpu().numpy(), + ) - if 'framewise_output' in batch_output.keys(): - append_to_dict(output_dict, 'framewise_output', - batch_output['framewise_output'].data.cpu().numpy()) - if return_input: - append_to_dict(output_dict, 'waveform', batch_data_dict['waveform']) - + append_to_dict(output_dict, "waveform", batch_data_dict["waveform"]) + if return_target: - if 'target' in batch_data_dict.keys(): - append_to_dict(output_dict, 'target', batch_data_dict['target']) + if "target" in batch_data_dict.keys(): + append_to_dict(output_dict, "target", batch_data_dict["target"]) if n % 10 == 0: - print(' --- Inference time: {:.3f} s / 10 iterations ---'.format( - time.time() - time1)) + print( + " --- Inference time: {:.3f} s / 10 iterations ---".format( + time.time() - time1 + ) + ) time1 = time.time() for key in output_dict.keys(): @@ -341,9 +376,9 @@ def forward(model, generator, return_input=False, def interpolate(x, ratio): - """Interpolate data in time domain. This is used to compensate the + """Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN. - + Args: x: (batch_size, time_steps, classes_num) ratio: int, ratio to interpolate @@ -358,7 +393,7 @@ def interpolate(x, ratio): def pad_framewise_output(framewise_output, frames_num): - """Pad framewise_output to the same length as input frames. The pad value + """Pad framewise_output to the same length as input frames. The pad value is the same as the value of the last frame. Args: @@ -368,7 +403,9 @@ def pad_framewise_output(framewise_output, frames_num): Outputs: output: (batch_size, frames_num, classes_num) """ - pad = framewise_output[:, -1 :, :].repeat(1, frames_num - framewise_output.shape[1], 1) + pad = framewise_output[:, -1:, :].repeat( + 1, frames_num - framewise_output.shape[1], 1 + ) """tensor for padding""" output = torch.cat((framewise_output, pad), dim=1) @@ -382,78 +419,93 @@ def count_parameters(model): def count_flops(model, audio_length): - """Count flops. Code modified from others' implementation. - """ + """Count flops. 
Code modified from others' implementation.""" multiply_adds = True - list_conv2d=[] + list_conv2d = [] + def conv2d_hook(self, input, output): batch_size, input_channels, input_height, input_width = input[0].size() output_channels, output_height, output_width = output[0].size() - - kernel_ops = self.kernel_size[0] * self.kernel_size[1] * (self.in_channels / self.groups) * (2 if multiply_adds else 1) + + kernel_ops = ( + self.kernel_size[0] + * self.kernel_size[1] + * (self.in_channels / self.groups) + * (2 if multiply_adds else 1) + ) bias_ops = 1 if self.bias is not None else 0 - + params = output_channels * (kernel_ops + bias_ops) flops = batch_size * params * output_height * output_width - + list_conv2d.append(flops) - list_conv1d=[] + list_conv1d = [] + def conv1d_hook(self, input, output): batch_size, input_channels, input_length = input[0].size() output_channels, output_length = output[0].size() - - kernel_ops = self.kernel_size[0] * (self.in_channels / self.groups) * (2 if multiply_adds else 1) + + kernel_ops = ( + self.kernel_size[0] + * (self.in_channels / self.groups) + * (2 if multiply_adds else 1) + ) bias_ops = 1 if self.bias is not None else 0 - + params = output_channels * (kernel_ops + bias_ops) flops = batch_size * params * output_length - + list_conv1d.append(flops) - - list_linear=[] + + list_linear = [] + def linear_hook(self, input, output): batch_size = input[0].size(0) if input[0].dim() == 2 else 1 - + weight_ops = self.weight.nelement() * (2 if multiply_adds else 1) bias_ops = self.bias.nelement() - + flops = batch_size * (weight_ops + bias_ops) list_linear.append(flops) - - list_bn=[] + + list_bn = [] + def bn_hook(self, input, output): list_bn.append(input[0].nelement() * 2) - - list_relu=[] + + list_relu = [] + def relu_hook(self, input, output): list_relu.append(input[0].nelement() * 2) - - list_pooling2d=[] + + list_pooling2d = [] + def pooling2d_hook(self, input, output): batch_size, input_channels, input_height, input_width = input[0].size() output_channels, output_height, output_width = output[0].size() - + kernel_ops = self.kernel_size * self.kernel_size bias_ops = 0 params = output_channels * (kernel_ops + bias_ops) flops = batch_size * params * output_height * output_width - + list_pooling2d.append(flops) - list_pooling1d=[] + list_pooling1d = [] + def pooling1d_hook(self, input, output): batch_size, input_channels, input_length = input[0].size() output_channels, output_length = output[0].size() - + kernel_ops = self.kernel_size[0] bias_ops = 0 - + params = output_channels * (kernel_ops + bias_ops) flops = batch_size * params * output_length - + list_pooling2d.append(flops) - + def foo(net): childrens = list(net.children()) if not childrens: @@ -472,20 +524,27 @@ def foo(net): elif isinstance(net, nn.AvgPool1d) or isinstance(net, nn.MaxPool1d): net.register_forward_hook(pooling1d_hook) else: - print('Warning: flop of module {} is not counted!'.format(net)) + print("Warning: flop of module {} is not counted!".format(net)) return for c in childrens: foo(c) # Register hook foo(model) - + device = device = next(model.parameters()).device input = torch.rand(1, audio_length).to(device) out = model(input) - - total_flops = sum(list_conv2d) + sum(list_conv1d) + sum(list_linear) + \ - sum(list_bn) + sum(list_relu) + sum(list_pooling2d) + sum(list_pooling1d) - - return total_flops \ No newline at end of file + + total_flops = ( + sum(list_conv2d) + + sum(list_conv1d) + + sum(list_linear) + + sum(list_bn) + + sum(list_relu) + + sum(list_pooling2d) + + 
sum(list_pooling1d) + ) + + return total_flops diff --git a/callbacks/callbacks.py b/callbacks/callbacks.py index dc205c0..aab76c6 100644 --- a/callbacks/callbacks.py +++ b/callbacks/callbacks.py @@ -2,6 +2,7 @@ from torch.optim.optimizer import Optimizer from pytorch_lightning.callbacks.finetuning import BaseFinetuning + class MilestonesFinetuning(BaseFinetuning): def __init__(self, milestones: int = 1): super().__init__() @@ -16,7 +17,7 @@ def freeze_before_training(self, pl_module: pl.LightningModule): print("[INFO] Unfreezing the last layer of the model") last_layer = list(pl_module.model.children())[-1] # If the last layer is a container, unfreeze its last layer - if hasattr(last_layer, 'children') and list(last_layer.children()): + if hasattr(last_layer, "children") and list(last_layer.children()): last_sublayer = list(last_layer.children())[-1] for param in last_sublayer.parameters(): param.requires_grad = True @@ -25,11 +26,11 @@ def freeze_before_training(self, pl_module: pl.LightningModule): param.requires_grad = True def finetune_function( - self, - pl_module: pl.LightningModule, - epoch: int, - optimizer: Optimizer, - opt_idx: int + self, + pl_module: pl.LightningModule, + epoch: int, + optimizer: Optimizer, + opt_idx: int, ): # Unfreeze the entire model at the specified epoch if epoch == self.unfreeze_at_epoch: diff --git a/data_utils/audiolist.py b/data_utils/audiolist.py index fe4612d..4ef428c 100644 --- a/data_utils/audiolist.py +++ b/data_utils/audiolist.py @@ -3,8 +3,8 @@ RANDOM = np.random.RandomState(42) -def noise(sig, shape, amount=None): +def noise(sig, shape, amount=None): # Random noise intensity if amount == None: amount = RANDOM.uniform(0.1, 0.5) @@ -15,44 +15,55 @@ def noise(sig, shape, amount=None): except: noise = np.zeros(shape) - return noise.astype('float32') + return noise.astype("float32") -def splitSignal(sig, rate, seconds, overlap, minlen): +def splitSignal(sig, rate, seconds, overlap, minlen): # Split signal with overlap sig_splits = [] for i in range(0, len(sig), int((seconds - overlap) * rate)): - split = sig[i:i + int(seconds * rate)] + split = sig[i : i + int(seconds * rate)] # End of signal? if len(split) < int(minlen * rate): break - + # Signal chunk too short? 
if len(split) < int(rate * seconds): - split = np.hstack((split, noise(split, (int(rate * seconds) - len(split)), 0.5))) - + split = np.hstack( + (split, noise(split, (int(rate * seconds) - len(split)), 0.5)) + ) + sig_splits.append(split) return sig_splits -class AudioList(): - def __init__(self, audiofile, length_segments = 3, minlen = 3, overlap = 0, sample_rate=16000): + +class AudioList: + def __init__( + self, audiofile, length_segments=3, minlen=3, overlap=0, sample_rate=16000 + ): self.audiofile = audiofile self.sample_rate = sample_rate self.length_segments = length_segments self.minlen = minlen self.overlap = overlap - + def read_audio(self): sig, sr = librosa.load(self.audiofile, sr=self.sr, mono=True) return sig def split_segment(self, array): - splitted_array = splitSignal(array, rate=self.sample_rate, seconds=self.length_segments, overlap=self.overlap, minlen=self.minlen) + splitted_array = splitSignal( + array, + rate=self.sample_rate, + seconds=self.length_segments, + overlap=self.overlap, + minlen=self.minlen, + ) return splitted_array def get_processed_list(self): - track = self.read_audio(self.audiofile) + track = self.read_audio(self.audiofile) list_divided = self.split_segment(track) - return list_divided \ No newline at end of file + return list_divided diff --git a/dcase_fine_tune/FTBeats.py b/dcase_fine_tune/FTBeats.py index e7f70e1..e27c2df 100644 --- a/dcase_fine_tune/FTBeats.py +++ b/dcase_fine_tune/FTBeats.py @@ -12,6 +12,7 @@ from BEATs.BEATs import BEATs, BEATsConfig + class BEATsTransferLearningModel(pl.LightningModule): def __init__( self, @@ -19,7 +20,7 @@ def __init__( lr: float = 1e-3, lr_scheduler_gamma: float = 1e-1, model_path: str = None, - ft_entire_network: bool = True, # Boolean on whether the classifier layer + BEATs should be fine-tuned + ft_entire_network: bool = True, # Boolean on whether the classifier layer + BEATs should be fine-tuned **kwargs, ) -> None: """TransferLearningModel. 
@@ -120,12 +121,13 @@ def configure_optimizers(self): if self.ft_entire_network: optimizer = optim.AdamW( [{"params": self.beats.parameters()}, {"params": self.fc.parameters()}], - lr=self.lr, betas=(0.9, 0.98), weight_decay=0.01 - ) + lr=self.lr, + betas=(0.9, 0.98), + weight_decay=0.01, + ) else: optimizer = optim.AdamW( - self.fc.parameters(), - lr=self.lr, betas=(0.9, 0.98), weight_decay=0.01 - ) + self.fc.parameters(), lr=self.lr, betas=(0.9, 0.98), weight_decay=0.01 + ) - return optimizer \ No newline at end of file + return optimizer diff --git a/dcase_fine_tune/FTDataModule.py b/dcase_fine_tune/FTDataModule.py index bf186f3..9e06b6a 100644 --- a/dcase_fine_tune/FTDataModule.py +++ b/dcase_fine_tune/FTDataModule.py @@ -9,6 +9,7 @@ from torch.utils.data import WeightedRandomSampler from torchsampler import ImbalancedDatasetSampler + class TrainAudioDatasetDCASE(Dataset): def __init__( self, @@ -34,21 +35,22 @@ def __getitem__(self, idx): return input_feature, label + class DCASEDataModule(LightningDataModule): def __init__( self, - data_frame= pd.DataFrame, - batch_size = 4, - num_workers = 4, - tensor_length = 128, - test_size = 0, - min_sample_per_category = 5, + data_frame=pd.DataFrame, + batch_size=4, + num_workers=4, + tensor_length=128, + test_size=0, + min_sample_per_category=5, **kwargs ): super().__init__(**kwargs) self.data_frame = data_frame - self.batch_size=batch_size - self.num_workers=num_workers + self.batch_size = batch_size + self.num_workers = num_workers self.tensor_length = tensor_length self.test_size = test_size self.min_sample_per_category = min_sample_per_category @@ -58,18 +60,19 @@ def __init__( def setup(self, stage=None): # load data - self.data_frame["category"] = LabelEncoder().fit_transform(self.data_frame["category"]) + self.data_frame["category"] = LabelEncoder().fit_transform( + self.data_frame["category"] + ) self.complete_dataset = TrainAudioDatasetDCASE(data_frame=self.data_frame) def divide_train_val(self): - # Separate into training and validation set train_indices, validation_indices, _, _ = train_test_split( range(len(self.complete_dataset)), self.complete_dataset.get_labels(), test_size=self.test_size, random_state=1, - stratify=self.data_frame["category"] + stratify=self.data_frame["category"], ) data_frame_train = self.data_frame.loc[train_indices] @@ -77,7 +80,7 @@ def divide_train_val(self): # deal with class imbalance in the training set value_counts = data_frame_train["category"].value_counts() - weight = 1. 
/ value_counts + weight = 1.0 / value_counts samples_weight = np.array([weight[t] for t in data_frame_train["category"]]) samples_weight = torch.from_numpy(samples_weight) samples_weight = samples_weight.double() @@ -99,6 +102,7 @@ def divide_train_val(self): ) def train_dataloader(self): +<<<<<<< Updated upstream train_loader = DataLoader(self.train_set, batch_size=self.batch_size, num_workers=self.num_workers, @@ -106,30 +110,38 @@ def train_dataloader(self): collate_fn=self.collate_fn, sampler=ImbalancedDatasetSampler(self.train_set) #self.sampler ) +======= + train_loader = DataLoader( + self.train_set, + batch_size=self.batch_size, + num_workers=self.num_workers, + pin_memory=False, + collate_fn=self.collate_fn, + sampler=self.sampler, + ) +>>>>>>> Stashed changes return train_loader - + def val_dataloader(self): - val_loader = DataLoader(self.val_set, - batch_size=self.batch_size, - num_workers=self.num_workers, - pin_memory=False, - collate_fn=self.collate_fn) + val_loader = DataLoader( + self.val_set, + batch_size=self.batch_size, + num_workers=self.num_workers, + pin_memory=False, + collate_fn=self.collate_fn, + ) return val_loader def get_label_dict(self): label_dic = self.complete_dataset.get_label_dict() return label_dic - - def collate_fn( - self, input_data - ): + + def collate_fn(self, input_data): true_class_ids = list({x[1] for x in input_data}) new_input = [] for x in input_data: if x[0].shape[1] > self.tensor_length: - rand_start = torch.randint( - 0, x[0].shape[1] - self.tensor_length, (1,) - ) + rand_start = torch.randint(0, x[0].shape[1] - self.tensor_length, (1,)) new_input.append( (x[0][:, rand_start : rand_start + self.tensor_length], x[1]) ) @@ -137,11 +149,9 @@ def collate_fn( new_input.append(x) all_images = torch.cat([x[0].unsqueeze(0) for x in new_input]) - all_labels = (torch.tensor([true_class_ids.index(x[1]) for x in input_data])) + all_labels = torch.tensor([true_class_ids.index(x[1]) for x in input_data]) return (all_images, all_labels) - - class AudioDatasetDCASE(Dataset): @@ -189,17 +199,14 @@ def __getitem__(self, idx): def get_label_dict(self): return self.label_dict -class predictLoader(): + +class predictLoader: def __init__( - self, - data_frame= pd.DataFrame, - batch_size = 1, - num_workers = 4, - tensor_length = 128 + self, data_frame=pd.DataFrame, batch_size=1, num_workers=4, tensor_length=128 ): self.data_frame = data_frame - self.batch_size=batch_size - self.num_workers=num_workers + self.batch_size = batch_size + self.num_workers = num_workers self.tensor_length = tensor_length self.setup() @@ -208,28 +215,24 @@ def setup(self, stage=None): self.complete_dataset = AudioDatasetDCASE( data_frame=self.data_frame, ) - def pred_dataloader(self): - pred_loader = DataLoader(self.complete_dataset, - batch_size=self.batch_size, - num_workers=self.num_workers, - pin_memory=False, - shuffle=True, - collate_fn=self.collate_fn) + pred_loader = DataLoader( + self.complete_dataset, + batch_size=self.batch_size, + num_workers=self.num_workers, + pin_memory=False, + shuffle=True, + collate_fn=self.collate_fn, + ) return pred_loader - - def collate_fn( - self, input_data - ): + def collate_fn(self, input_data): true_class_ids = list({x[1] for x in input_data}) new_input = [] for x in input_data: if x[0].shape[1] > self.tensor_length: - rand_start = torch.randint( - 0, x[0].shape[1] - self.tensor_length, (1,) - ) + rand_start = torch.randint(0, x[0].shape[1] - self.tensor_length, (1,)) new_input.append( (x[0][:, rand_start : rand_start + self.tensor_length], 
x[1]) ) @@ -237,7 +240,6 @@ def collate_fn( new_input.append(x) all_images = torch.cat([x[0].unsqueeze(0) for x in new_input]) - all_labels = (torch.tensor([true_class_ids.index(x[1]) for x in input_data])) + all_labels = torch.tensor([true_class_ids.index(x[1]) for x in input_data]) return (all_images, all_labels) - diff --git a/dcase_fine_tune/FTevaluate.py b/dcase_fine_tune/FTevaluate.py index 9625e72..0718820 100644 --- a/dcase_fine_tune/FTevaluate.py +++ b/dcase_fine_tune/FTevaluate.py @@ -15,12 +15,24 @@ import torch from dcase_fine_tune.FTBeats import BEATsTransferLearningModel -from dcase_fine_tune.FTDataModule import AudioDatasetDCASE, DCASEDataModule, predictLoader -from dcase_fine_tune._utils import write_wav, write_results, merge_preds, to_dataframe, construct_path, compute_scores +from dcase_fine_tune.FTDataModule import ( + AudioDatasetDCASE, + DCASEDataModule, + predictLoader, +) +from dcase_fine_tune._utils import ( + write_wav, + write_results, + merge_preds, + to_dataframe, + construct_path, + compute_scores, +) import hydra from omegaconf import DictConfig, OmegaConf + def finetune_model( model_path, datamodule_class, @@ -37,10 +49,12 @@ def finetune_model( auto_select_gpus=True, callbacks=[ pl.callbacks.LearningRateMonitor(logging_interval="step"), - pl.callbacks.EarlyStopping(monitor="train_acc", mode="max", patience=max_epochs), + pl.callbacks.EarlyStopping( + monitor="train_acc", mode="max", patience=max_epochs + ), ], default_root_dir="logs/", - enable_checkpointing=False + enable_checkpointing=False, ) # create the model object @@ -51,10 +65,10 @@ def finetune_model( return model + def predict_label(cfg, model, loader, frame_shift): - model = model.to("cuda") - + # Get the embeddings, the beginning and end of the segment! pred_labels = [] labels = [] @@ -72,7 +86,13 @@ def predict_label(cfg, model, loader, frame_shift): begin = i / 1000 end = cfg["data"]["tensor_length"] * frame_shift / 1000 else: - begin = i * cfg["data"]["tensor_length"] * frame_shift * cfg["data"]["overlap"] / 1000 + begin = ( + i + * cfg["data"]["tensor_length"] + * frame_shift + * cfg["data"]["overlap"] + / 1000 + ) end = begin + cfg["data"]["tensor_length"] * frame_shift / 1000 # Get the scores: @@ -95,6 +115,7 @@ def predict_label(cfg, model, loader, frame_shift): return pred_labels, labels, begins, ends + def train_predict( cfg, meta_df, @@ -102,9 +123,8 @@ def train_predict( support_labels, query_spectrograms, query_labels, - target_path="/data" + target_path="/data", ): - # Get the filename and the frame_shift for the particular file filename = os.path.basename(support_spectrograms).split("data_")[1].split(".")[0] frame_shift = meta_df.loc[filename, "frame_shift"] @@ -117,10 +137,12 @@ def train_predict( assert filename in query_labels df_support = to_dataframe(support_spectrograms, support_labels) - supportLoader = DCASEDataModule(data_frame=df_support, - batch_size=cfg["trainer"]["batch_size"], - num_workers=cfg["trainer"]["num_workers"], - tensor_length=cfg["data"]["tensor_length"]) + supportLoader = DCASEDataModule( + data_frame=df_support, + batch_size=cfg["trainer"]["batch_size"], + num_workers=cfg["trainer"]["num_workers"], + tensor_length=cfg["data"]["tensor_length"], + ) label_dic = supportLoader.get_label_dict() @@ -130,9 +152,10 @@ def train_predict( # Train the model with the support data print("[INFO] TRAINING THE MODEL FOR {}".format(filename)) - model = finetune_model(model_path=cfg["model"]["model_path"], - datamodule_class=supportLoader, - 
max_epochs=cfg["trainer"]["max_epochs"] + model = finetune_model( + model_path=cfg["model"]["model_path"], + datamodule_class=supportLoader, + max_epochs=cfg["trainer"]["max_epochs"], ) ################################# @@ -141,15 +164,16 @@ def train_predict( ### Get the query dataset ### df_query = to_dataframe(query_spectrograms, query_labels) - queryLoader = predictLoader(data_frame=df_query, - batch_size=1, - num_workers=cfg["trainer"]["num_workers"], - tensor_length=cfg["data"]["tensor_length"]).pred_dataloader() - - predicted_labels, labels, begins, ends = predict_label(cfg=cfg, - model=model, - loader=queryLoader, - frame_shift=frame_shift) + queryLoader = predictLoader( + data_frame=df_query, + batch_size=1, + num_workers=cfg["trainer"]["num_workers"], + tensor_length=cfg["data"]["tensor_length"], + ).pred_dataloader() + + predicted_labels, labels, begins, ends = predict_label( + cfg=cfg, model=model, loader=queryLoader, frame_shift=frame_shift + ) ###################### # COMPUTE THE SCORES # @@ -157,7 +181,7 @@ def train_predict( # Compute the scores for the analysed file -- just as information acc, recall, precision, f1score = compute_scores( - predicted_labels=predicted_labels, #updated_labels, + predicted_labels=predicted_labels, # updated_labels, gt_labels=labels, ) with open( @@ -208,16 +232,27 @@ def train_predict( df_result_raw, ) -@hydra.main(version_base=None, config_path="/app/dcase_fine_tune", config_name="CONFIG.yaml") -def main(cfg: DictConfig): +@hydra.main( + version_base=None, config_path="/app/dcase_fine_tune", config_name="CONFIG.yaml" +) +def main(cfg: DictConfig): # Get training config version_path = os.path.dirname(os.path.dirname(cfg["model"]["model_path"])) version_name = os.path.basename(version_path) # Simplify the creation of my_hash_dict using dictionary comprehension - keys = ["resample", "denoise", "normalize", "frame_length", "tensor_length", - "set_type", "overlap", "num_mel_bins", "max_segment_length"] + keys = [ + "resample", + "denoise", + "normalize", + "frame_length", + "tensor_length", + "set_type", + "overlap", + "num_mel_bins", + "max_segment_length", + ] my_hash_dict = {k: cfg["data"][k] for k in keys} # Conditionally add 'target_fs' if 'resample' is True @@ -225,7 +260,9 @@ def main(cfg: DictConfig): my_hash_dict["target_fs"] = cfg["data"]["target_fs"] # Generate hash directory name - hash_dir_name = hashlib.sha1(json.dumps(my_hash_dict, sort_keys=True).encode()).hexdigest() + hash_dir_name = hashlib.sha1( + json.dumps(my_hash_dict, sort_keys=True).encode() + ).hexdigest() # Base directory for data base_data_path = "/data/DCASEfewshot" @@ -284,7 +321,7 @@ def main(cfg: DictConfig): param = deepcopy(cfg) # Convert the DictConfig object to a standard Python dictionary param = OmegaConf.to_container(param, resolve=True) - + with open(os.path.join(target_path, "param.json"), "w") as fp: json.dump(param, fp) @@ -353,7 +390,7 @@ def main(cfg: DictConfig): target_fs=cfg["data"]["target_fs"], target_path=target_path, frame_shift=meta_df.loc[filename, "frame_shift"], - support_spectrograms=support_spectrograms + support_spectrograms=support_spectrograms, ) # Return the final product @@ -376,4 +413,4 @@ def main(cfg: DictConfig): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/dcase_fine_tune/FTtrain.py b/dcase_fine_tune/FTtrain.py index 8c6c256..a705b1b 100644 --- a/dcase_fine_tune/FTtrain.py +++ b/dcase_fine_tune/FTtrain.py @@ -12,13 +12,14 @@ import hydra from omegaconf import DictConfig, OmegaConf + def 
train_model( model, datamodule_class, max_epochs, patience, num_sanity_val_steps=0, - root_dir="logs/" + root_dir="logs/", ): # create the lightning trainer object trainer = pl.Trainer( @@ -30,10 +31,12 @@ def train_model( auto_select_gpus=True, callbacks=[ pl.callbacks.LearningRateMonitor(logging_interval="step"), - pl.callbacks.EarlyStopping(monitor="train_loss", mode="min", patience=patience), + pl.callbacks.EarlyStopping( + monitor="train_loss", mode="min", patience=patience + ), ], default_root_dir=root_dir, - enable_checkpointing=True + enable_checkpointing=True, ) # train the model @@ -41,8 +44,8 @@ def train_model( return model -def load_data(cfg): +def load_data(cfg): # load right pickle my_hash_dict = { "resample": cfg["data"]["resample"], @@ -60,9 +63,7 @@ def load_data(cfg): hash_dir_name = hashlib.sha1( json.dumps(my_hash_dict, sort_keys=True).encode() ).hexdigest() - target_path = os.path.join( - "/data/DCASEfewshot", "train", hash_dir_name, "audio" - ) + target_path = os.path.join("/data/DCASEfewshot", "train", hash_dir_name, "audio") # load data input_features = np.load(os.path.join(target_path, "data.npz")) labels = np.load(os.path.join(target_path, "labels.npy")) @@ -71,32 +72,41 @@ def load_data(cfg): return data_frame -@hydra.main(version_base=None, config_path="/app/dcase_fine_tune", config_name="CONFIG.yaml") -def main(cfg: DictConfig): +@hydra.main( + version_base=None, config_path="/app/dcase_fine_tune", config_name="CONFIG.yaml" +) +def main(cfg: DictConfig): df = load_data(cfg) # Create the loader model - Loader = DCASEDataModule(data_frame=df, - batch_size=cfg["trainer"]["batch_size"], - num_workers=cfg["trainer"]["num_workers"], - tensor_length=cfg["data"]["tensor_length"], - test_size=cfg["trainer"]["test_size"], - min_sample_per_category=cfg["trainer"]["min_sample_per_category"]) + Loader = DCASEDataModule( + data_frame=df, + batch_size=cfg["trainer"]["batch_size"], + num_workers=cfg["trainer"]["num_workers"], + tensor_length=cfg["data"]["tensor_length"], + test_size=cfg["trainer"]["test_size"], + min_sample_per_category=cfg["trainer"]["min_sample_per_category"], + ) # create the model object num_target_classes = len(df["category"].unique()) - model = BEATsTransferLearningModel(model_path=cfg["model"]["model_path"], - num_target_classes=num_target_classes, - lr=cfg["model"]["lr"], - ft_entire_network=cfg["model"]["ft_entire_network"]) + model = BEATsTransferLearningModel( + model_path=cfg["model"]["model_path"], + num_target_classes=num_target_classes, + lr=cfg["model"]["lr"], + ft_entire_network=cfg["model"]["ft_entire_network"], + ) + + train_model( + model, + Loader, + cfg["trainer"]["max_epochs"], + patience=cfg["trainer"]["patience"], + root_dir=cfg["trainer"]["default_root_dir"], + ) - train_model(model, - Loader, - cfg["trainer"]["max_epochs"], - patience=cfg["trainer"]["patience"], - root_dir=cfg["trainer"]["default_root_dir"]) if __name__ == "__main__": main() diff --git a/dcase_fine_tune/_utils.py b/dcase_fine_tune/_utils.py index be92c8a..bd47b21 100644 --- a/dcase_fine_tune/_utils.py +++ b/dcase_fine_tune/_utils.py @@ -5,6 +5,7 @@ from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score + def write_results(predicted_labels, begins, ends): df_out = pd.DataFrame( { @@ -16,6 +17,7 @@ def write_results(predicted_labels, begins, ends): return df_out + def write_wav( files, cfg, @@ -26,7 +28,7 @@ def write_wav( target_fs=16000, target_path=None, frame_shift=1, - support_spectrograms=None + support_spectrograms=None, ): from 
scipy.io import wavfile @@ -92,7 +94,7 @@ def write_wav( gt_labels, (0, len(gt_labels) - len(arr)), "constant", constant_values=(0,) ) pred_labels = np.pad( - pred_labels, (0, len(pred_labels) - len(arr) ), "constant", constant_values=(0,) + pred_labels, (0, len(pred_labels) - len(arr)), "constant", constant_values=(0,) ) distances_to_pos = np.pad( distances_to_pos, @@ -102,7 +104,7 @@ def write_wav( ) z_scores_pos = np.pad( z_scores_pos, - (0, len(z_scores_pos) - len(arr)), + (0, len(z_scores_pos) - len(arr)), "constant", constant_values=(0,), ) @@ -113,6 +115,7 @@ def write_wav( ) wavfile.write(output, target_fs, result_wav.T) + def merge_preds(df, tolerence, tensor_length): df["group"] = ( df["Starttime"] > (df["Endtime"] + tolerence * tensor_length).shift().cummax() @@ -120,6 +123,7 @@ def merge_preds(df, tolerence, tensor_length): result = df.groupby("group").agg({"Starttime": "min", "Endtime": "max"}) return result + def to_dataframe(features, labels): """Load the saved array and map the features and labels into a single dataframe""" input_features = np.load(features) @@ -128,6 +132,7 @@ def to_dataframe(features, labels): df = pd.DataFrame({"feature": list_input_features, "category": labels}) return df + def compute_scores(predicted_labels, gt_labels): acc = accuracy_score(gt_labels, predicted_labels) recall = recall_score(gt_labels, predicted_labels) @@ -139,5 +144,8 @@ def compute_scores(predicted_labels, gt_labels): print(f"F1 score: {f1score}") return acc, recall, precision, f1score + def construct_path(base_dir, status, hash_dir_name, file_type, file_pattern): - return os.path.join(base_dir, status, hash_dir_name, "audio", f"{file_type}.{file_pattern}") \ No newline at end of file + return os.path.join( + base_dir, status, hash_dir_name, "audio", f"{file_type}.{file_pattern}" + ) diff --git a/evaluate/_utils_compute.py b/evaluate/_utils_compute.py index 4280efb..c7a0c14 100644 --- a/evaluate/_utils_compute.py +++ b/evaluate/_utils_compute.py @@ -19,8 +19,8 @@ def to_dataframe(features, labels): return df -def get_proto_coordinates(model, model_type, support_data, support_labels, n_way): +def get_proto_coordinates(model, model_type, support_data, support_labels, n_way): if model_type == "beats": z_supports, _ = model.get_embeddings(support_data, padding_mask=None) else: @@ -34,9 +34,11 @@ def get_proto_coordinates(model, model_type, support_data, support_labels, n_way # Return the coordinates of the prototypes and the z_supports return prototypes, z_supports + def euclidean_distance(x1, x2): return torch.sqrt(torch.sum((x1 - x2) ** 2, dim=1)) + def calculate_distance(model_type, z_query, z_proto): # Compute the euclidean distance from queries to prototypes dists = [] @@ -55,6 +57,7 @@ def calculate_distance(model_type, z_query, z_proto): return scores, dists + def compute_scores(predicted_labels, gt_labels): acc = accuracy_score(gt_labels, predicted_labels) recall = recall_score(gt_labels, predicted_labels) @@ -66,12 +69,18 @@ def compute_scores(predicted_labels, gt_labels): print(f"F1 score: {f1score}") return acc, recall, precision, f1score -def merge_preds(df, tolerence, tensor_length,frame_shift): + +def merge_preds(df, tolerence, tensor_length, frame_shift): df["group"] = ( - df["Starttime"] > (df["Endtime"] + tolerence * tensor_length * frame_shift /1000 +0.00001).shift()).cumsum() + df["Starttime"] + > ( + df["Endtime"] + tolerence * tensor_length * frame_shift / 1000 + 0.00001 + ).shift() + ).cumsum() result = df.groupby("group").agg({"Starttime": "min", "Endtime": 
"max"}) return result + def reshape_support(support_samples, tensor_length=128, n_subsample=1): new_input = [] @@ -86,10 +95,11 @@ def reshape_support(support_samples, tensor_length=128, n_subsample=1): x = torch.tensor(x) x_adjusted = x if x.shape[0] == 1 else x.unsqueeze(0) new_input.append(x_adjusted) - + all_supports = torch.cat(new_input) # Concatenate all tensors in the list return all_supports + def train_model( model_type=None, datamodule_class=DCASEDataModule, @@ -99,7 +109,7 @@ def train_model( seed=42, pretrained_model=None, state=None, - beats_path="/data/model/BEATs/BEATs_iter3_plus_AS2M.pt" + beats_path="/data/model/BEATs/BEATs_iter3_plus_AS2M.pt", ): # create the lightning trainer object trainer = pl.Trainer( @@ -116,13 +126,13 @@ def train_model( ), ], default_root_dir="logs/", - enable_checkpointing=False + enable_checkpointing=False, ) # create the model object - model = ProtoBEATsModel(model_type=model_type, - state=state, - model_path=pretrained_model) + model = ProtoBEATsModel( + model_type=model_type, state=state, model_path=pretrained_model + ) # train the model trainer.fit(model, datamodule=datamodule_class) @@ -130,8 +140,9 @@ def train_model( return model -def training(model_type, pretrained_model, state, custom_datamodule, max_epoch, beats_path): - +def training( + model_type, pretrained_model, state, custom_datamodule, max_epoch, beats_path +): model = train_model( model_type, custom_datamodule, @@ -141,11 +152,12 @@ def training(model_type, pretrained_model, state, custom_datamodule, max_epoch, seed=42, pretrained_model=pretrained_model, state=state, - beats_path=beats_path + beats_path=beats_path, ) return model + def predict_labels_query( model, model_type, @@ -193,14 +205,18 @@ def predict_labels_query( end = begin + tensor_length * frame_shift / 1000 # Get the scores: - classification_scores, dists = calculate_distance(model_type, q_embedding, prototypes) + classification_scores, dists = calculate_distance( + model_type, q_embedding, prototypes + ) if model_type != "beats": dists = dists.squeeze() classification_scores = classification_scores.squeeze() # Get the labels (either POS or NEG): - predicted_labels = torch.max(classification_scores, 0)[pos_index] # The dim where the distance to prototype is stored is 1 + predicted_labels = torch.max(classification_scores, 0)[ + pos_index + ] # The dim where the distance to prototype is stored is 1 distance_to_pos = dists[pos_index].detach().to("cpu").numpy() predicted_labels = predicted_labels.detach().to("cpu").numpy() @@ -222,6 +238,7 @@ def predict_labels_query( return pred_labels, labels, begins, ends, d_to_pos, q_embeddings + def filter_outliers_by_p_values(Y, p_values, target_class=1, upper_threshold=0.05): # Identify indices where the p-value is less than the threshold and the corresponding Y value equals the target_class outlier_indices = np.where((p_values < upper_threshold) & (Y == target_class))[0] @@ -229,4 +246,4 @@ def filter_outliers_by_p_values(Y, p_values, target_class=1, upper_threshold=0.0 # Update labels in the original Y array for identified indices Y[outlier_indices] = 0 - return Y \ No newline at end of file + return Y diff --git a/evaluate/_utils_writing.py b/evaluate/_utils_writing.py index 2925276..9a88141 100644 --- a/evaluate/_utils_writing.py +++ b/evaluate/_utils_writing.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd + def write_results(predicted_labels, begins, ends): df_out = pd.DataFrame( { @@ -14,6 +15,7 @@ def write_results(predicted_labels, begins, ends): return 
df_out + def write_wav( files, cfg, @@ -26,7 +28,7 @@ def write_wav( frame_shift=1, resample=True, support_spectrograms=None, - result_merged=None + result_merged=None, ): from scipy.io import wavfile @@ -92,9 +94,11 @@ def write_wav( / 1000 ), ) - merged_pred =np.zeros(len(arr)) - for ind, row in result_merged.iterrows(): - merged_pred[int(row["Starttime"]*target_fs):int(row["Endtime"]*target_fs)] = 1 + merged_pred = np.zeros(len(arr)) + for ind, row in result_merged.iterrows(): + merged_pred[ + int(row["Starttime"] * target_fs) : int(row["Endtime"] * target_fs) + ] = 1 # pad with zeros if len(arr) > len(gt_labels): @@ -129,7 +133,7 @@ def write_wav( # Write the results result_wav = np.vstack( - (arr, gt_labels, merged_pred, pred_labels , distances_to_pos / 10, z_scores_pos) + (arr, gt_labels, merged_pred, pred_labels, distances_to_pos / 10, z_scores_pos) ) wavfile.write(output, target_fs, result_wav.T) @@ -161,10 +165,9 @@ def plot_2_d_representation(prototypes, if model_type == "beats": feat = feat[:, -1, :] - all_labels = np.concatenate([prototypes_labels, - pos_supports_labels, - neg_supports_labels, - gt_labels]) + all_labels = np.concatenate( + [prototypes_labels, pos_supports_labels, neg_supports_labels, gt_labels] + ) # Run t-SNE tsne = TSNE(n_components=2, perplexity=perplexity) @@ -176,14 +179,18 @@ def plot_2_d_representation(prototypes, 1: "POS queries", 2: "Prototypes", 3: "POS supports", - 4: "NEG supports" + 4: "NEG supports", } # Figure plt.figure(figsize=(10, 8)) # Define marker for each type of point - markers = {2: "P", 3: "o", 4: "X"} # P for prototypes, o for supports, X for negative supports + markers = { + 2: "P", + 3: "o", + 4: "X", + } # P for prototypes, o for supports, X for negative supports for label in np.unique(all_labels): idx = np.where(all_labels == label) @@ -191,19 +198,21 @@ def plot_2_d_representation(prototypes, size = 150 if label == 2 else 100 if label in markers else 50 alpha = 1.0 if label == 2 else 0.8 if label in markers else 0.25 - plt.scatter(features_2d[idx, 0], - features_2d[idx, 1], - label=label_descriptions[label], - alpha=alpha, - marker=markers.get(label, 'o'), - s=size) + plt.scatter( + features_2d[idx, 0], + features_2d[idx, 1], + label=label_descriptions[label], + alpha=alpha, + marker=markers.get(label, "o"), + s=size, + ) plt.legend() - plt.title('t-SNE visualization of embeddings, prototypes, and supports') - plt.xlabel('Dimension 1') - plt.ylabel('Dimension 2') + plt.title("t-SNE visualization of embeddings, prototypes, and supports") + plt.xlabel("Dimension 1") + plt.ylabel("Dimension 2") plt.grid(True) # Save the figure plt.savefig(output, bbox_inches="tight") - plt.show() \ No newline at end of file + plt.show() diff --git a/evaluate/evaluation_metrics/evaluation.py b/evaluate/evaluation_metrics/evaluation.py index 9f944ee..132e7a2 100644 --- a/evaluate/evaluation_metrics/evaluation.py +++ b/evaluate/evaluation_metrics/evaluation.py @@ -13,30 +13,32 @@ MIN_EVAL_VALUE = 0.00001 N_SHOTS = 5 MIN_IOU_TH = 0.3 -PRED_FILE_HEADER = ["Audiofilename","Starttime","Endtime"] -POS_VALUE = 'POS' -UNK_VALUE = 'UNK' +PRED_FILE_HEADER = ["Audiofilename", "Starttime", "Endtime"] +POS_VALUE = "POS" +UNK_VALUE = "UNK" + def remove_shots_from_ref(ref_df, number_shots=5): - ref_pos_indexes = select_events_with_value(ref_df, value=POS_VALUE) - ref_n_shot_index = ref_pos_indexes[number_shots-1] + ref_n_shot_index = ref_pos_indexes[number_shots - 1] # remove all events (pos and UNK) that happen before this 5th event - events_to_drop = 
ref_df.index[ref_df['Endtime'] <= ref_df.iloc[ref_n_shot_index]['Endtime']].tolist() + events_to_drop = ref_df.index[ + ref_df["Endtime"] <= ref_df.iloc[ref_n_shot_index]["Endtime"] + ].tolist() return ref_df.drop(events_to_drop) -def select_events_with_value(data_frame, value=POS_VALUE): +def select_events_with_value(data_frame, value=POS_VALUE): indexes_list = data_frame.index[data_frame["Q"] == value].tolist() return indexes_list -def build_matrix_from_selected_rows(data_frame, selected_indexes_list ): - matrix_data = np.ones((2, len(selected_indexes_list)))* -1 +def build_matrix_from_selected_rows(data_frame, selected_indexes_list): + matrix_data = np.ones((2, len(selected_indexes_list))) * -1 for n, idx in enumerate(selected_indexes_list): - matrix_data[0, n] = data_frame.loc[idx].Starttime # start time for event n + matrix_data[0, n] = data_frame.loc[idx].Starttime # start time for event n matrix_data[1, n] = data_frame.loc[idx].Endtime return matrix_data @@ -56,32 +58,33 @@ def compute_tp_fp_fn(pred_events_df, ref_events_df): if "Q" not in pred_events_df.columns: pred_events_df["Q"] = POS_VALUE - #sort events by starttime - pred_events_df = pred_events_df.sort_values(by='Starttime', axis=0, ascending=True) + # sort events by starttime + pred_events_df = pred_events_df.sort_values(by="Starttime", axis=0, ascending=True) pred_pos_indexes = select_events_with_value(pred_events_df, value=POS_VALUE) ref_1st_round = build_matrix_from_selected_rows(ref_events_df, ref_pos_indexes) pred_1st_round = build_matrix_from_selected_rows(pred_events_df, pred_pos_indexes) m_pos = metrics.match_events(ref_1st_round, pred_1st_round, min_iou=MIN_IOU_TH) - matched_ref_indexes = [ri for ri, pi in m_pos] + matched_ref_indexes = [ri for ri, pi in m_pos] matched_pred_indexes = [pi for ri, pi in m_pos] - ref_unk_indexes = select_events_with_value(ref_events_df, value=UNK_VALUE) ref_2nd_round = build_matrix_from_selected_rows(ref_events_df, ref_unk_indexes) - unmatched_pred_events = list(set(range(pred_1st_round.shape[1])) - set(matched_pred_indexes)) + unmatched_pred_events = list( + set(range(pred_1st_round.shape[1])) - set(matched_pred_indexes) + ) pred_2nd_round = pred_1st_round[:, unmatched_pred_events] m_unk = metrics.match_events(ref_2nd_round, pred_2nd_round, min_iou=MIN_IOU_TH) # print("# Positive matches between Ref and Pred :", len(m_pos)) # print("# matches with Unknown events: ", len(m_unk)) - + tp = len(m_pos) fp = pred_1st_round.shape[1] - tp - len(m_unk) - + ## compute unmatched pos ref events: count_unmached_pos_ref_events = len(ref_pos_indexes) - tp @@ -90,114 +93,161 @@ def compute_tp_fp_fn(pred_events_df, ref_events_df): total_n_POS_events = len(ref_pos_indexes) return tp, fp, fn, total_n_POS_events -def compute_scores_per_class(counts_per_class): +def compute_scores_per_class(counts_per_class): scores_per_class = {} for cl in counts_per_class.keys(): tp = counts_per_class[cl]["TP"] fp = counts_per_class[cl]["FP"] fn = counts_per_class[cl]["FN"] - # to compute the harmonic mean we need to have all entries as non zero - precision = tp/(tp+fp) if tp+fp != 0 else MIN_EVAL_VALUE # case where no predictions were made + precision = ( + tp / (tp + fp) if tp + fp != 0 else MIN_EVAL_VALUE + ) # case where no predictions were made if precision < MIN_EVAL_VALUE: precision = MIN_EVAL_VALUE - recall = tp/(fn+tp) if tp != 0 else MIN_EVAL_VALUE - fmeasure = tp/(tp+0.5*(fp+fn)) if tp != 0 else MIN_EVAL_VALUE + recall = tp / (fn + tp) if tp != 0 else MIN_EVAL_VALUE + fmeasure = tp / (tp + 0.5 * (fp + fn)) 
if tp != 0 else MIN_EVAL_VALUE - scores_per_class[cl] = {"precision": precision, "recall": recall, "f-measure": fmeasure} + scores_per_class[cl] = { + "precision": precision, + "recall": recall, + "f-measure": fmeasure, + } return scores_per_class - + + def compute_scores_from_counts(counts): tp = counts["TP"] fp = counts["FP"] fn = counts["FN"] # to compute the harmonic mean we need to have all entries as non zero - precision = tp/(tp+fp) if tp+fp != 0 else MIN_EVAL_VALUE # case where no predictions were made + precision = ( + tp / (tp + fp) if tp + fp != 0 else MIN_EVAL_VALUE + ) # case where no predictions were made if precision < MIN_EVAL_VALUE: - precision = MIN_EVAL_VALUE - recall = tp/(fn+tp) if tp != 0 else MIN_EVAL_VALUE - fmeasure = tp/(tp+0.5*(fp+fn)) if tp != 0 else MIN_EVAL_VALUE + precision = MIN_EVAL_VALUE + recall = tp / (fn + tp) if tp != 0 else MIN_EVAL_VALUE + fmeasure = tp / (tp + 0.5 * (fp + fn)) if tp != 0 else MIN_EVAL_VALUE scores = {"precision": precision, "recall": recall, "f-measure": fmeasure} - - return scores + return scores -def build_report(main_set_scores, scores_per_miniset, scores_per_audiofile, save_path, main_set_name="EVAL", team_name="test_team" , **kwargs): - +def build_report( + main_set_scores, + scores_per_miniset, + scores_per_audiofile, + save_path, + main_set_name="EVAL", + team_name="test_team", + **kwargs +): # datetime object containing current date and time now = datetime.now() date_string = now.strftime("%d%m%Y_%H_%M_%S") - # print("date and time =", date_string) + # print("date and time =", date_string) - #make dict: + # make dict: report = { - 'team_name': team_name, - "set_name": main_set_name, - "report_date": date_string, - "overall_scores": main_set_scores, - "scores_per_subset": scores_per_miniset, - "scores_per_audiofile": scores_per_audiofile + "team_name": team_name, + "set_name": main_set_name, + "report_date": date_string, + "overall_scores": main_set_scores, + "scores_per_subset": scores_per_miniset, + "scores_per_audiofile": scores_per_audiofile, } if "scores_per_class" in kwargs.keys(): - report["scores_per_class"] = kwargs['scores_per_class'] - - with open(os.path.join(save_path,"Evaluation_report_" + team_name + "_" + main_set_name + '_' + date_string + '.json'), 'w') as outfile: + report["scores_per_class"] = kwargs["scores_per_class"] + + with open( + os.path.join( + save_path, + "Evaluation_report_" + + team_name + + "_" + + main_set_name + + "_" + + date_string + + ".json", + ), + "w", + ) as outfile: json.dump(report, outfile) return -def evaluate(pred_file_path, ref_file_path, team_name, dataset, savepath, metadata=[]): +def evaluate(pred_file_path, ref_file_path, team_name, dataset, savepath, metadata=[]): print("\nEvaluation for:", team_name, dataset) - #read Gt file structure: get subsets and paths for ref csvs make an inverted dictionary with audiofilenames as keys and folder as value + # read Gt file structure: get subsets and paths for ref csvs make an inverted dictionary with audiofilenames as keys and folder as value gt_file_structure = {} gt_file_structure[dataset] = {} inv_gt_file_structure = {} list_of_subsets = os.listdir(ref_file_path) for subset in list_of_subsets: - gt_file_structure[dataset][subset] = [os.path.basename(fl)[0:-4]+'.wav' for fl in glob.glob(os.path.join(ref_file_path,subset,"*.csv"))] + gt_file_structure[dataset][subset] = [ + os.path.basename(fl)[0:-4] + ".wav" + for fl in glob.glob(os.path.join(ref_file_path, subset, "*.csv")) + ] for audiofile in gt_file_structure[dataset][subset]: 
inv_gt_file_structure[audiofile] = subset - - #read prediction csv + # read prediction csv pred_csv = pd.read_csv(pred_file_path, dtype=str) - #verify headers: - if list(pred_csv.columns) != PRED_FILE_HEADER: - print('Please correct the header of the prediction file. This should be', PRED_FILE_HEADER) + # verify headers: + if list(pred_csv.columns) != PRED_FILE_HEADER: + print( + "Please correct the header of the prediction file. This should be", + PRED_FILE_HEADER, + ) exit(1) # parse prediction csv # split file into lists of events for the same audiofile. - pred_events_by_audiofile = dict(tuple(pred_csv.groupby('Audiofilename'))) + pred_events_by_audiofile = dict(tuple(pred_csv.groupby("Audiofilename"))) counts_per_audiofile = {} for audiofilename in list(pred_events_by_audiofile.keys()): - - # for each audiofile, load correcponding GT File (audiofilename.csv) - ref_events_this_audiofile_all = pd.read_csv(os.path.join(ref_file_path, inv_gt_file_structure[audiofilename], audiofilename[0:-4]+'.csv'), dtype={'Starttime':np.float64, 'Endtime': np.float64}) + ref_events_this_audiofile_all = pd.read_csv( + os.path.join( + ref_file_path, + inv_gt_file_structure[audiofilename], + audiofilename[0:-4] + ".csv", + ), + dtype={"Starttime": np.float64, "Endtime": np.float64}, + ) # sort events by starttime: - ref_events_this_audiofile_all = ref_events_this_audiofile_all.sort_values(by='Starttime', axis=0, ascending=True) - - #Remove the 5 shots from GT: - ref_events_this_audiofile = remove_shots_from_ref(ref_events_this_audiofile_all, number_shots=N_SHOTS) - - # compare and get counts: TP, FP .. - tp_count, fp_count, fn_count , total_n_events_in_audiofile = compute_tp_fp_fn(pred_events_by_audiofile[audiofilename], ref_events_this_audiofile ) - - counts_per_audiofile[audiofilename] = {"TP": tp_count, "FP": fp_count, "FN": fn_count, "total_n_pos_events": total_n_events_in_audiofile} + ref_events_this_audiofile_all = ref_events_this_audiofile_all.sort_values( + by="Starttime", axis=0, ascending=True + ) + + # Remove the 5 shots from GT: + ref_events_this_audiofile = remove_shots_from_ref( + ref_events_this_audiofile_all, number_shots=N_SHOTS + ) + + # compare and get counts: TP, FP .. 
+ tp_count, fp_count, fn_count, total_n_events_in_audiofile = compute_tp_fp_fn( + pred_events_by_audiofile[audiofilename], ref_events_this_audiofile + ) + + counts_per_audiofile[audiofilename] = { + "TP": tp_count, + "FP": fp_count, + "FN": fn_count, + "total_n_pos_events": total_n_events_in_audiofile, + } print(audiofilename, counts_per_audiofile[audiofilename]) if metadata: # using the key for classes => audiofiles, # load sets metadata: with open(metadata) as metadatafile: - dataset_metadata = json.load(metadatafile) + dataset_metadata = json.load(metadatafile) else: dataset_metadata = copy.deepcopy(gt_file_structure) @@ -206,22 +256,35 @@ def evaluate(pred_file_path, ref_file_path, team_name, dataset, savepath, metada for miniset in dataset_metadata[dataset].keys(): if metadata: for cl in dataset_metadata[dataset][miniset].keys(): - list_all_audiofiles.extend(dataset_metadata[dataset][miniset][cl] ) + list_all_audiofiles.extend(dataset_metadata[dataset][miniset][cl]) else: list_all_audiofiles.extend(dataset_metadata[dataset][miniset]) for audiofilename in list_all_audiofiles: if audiofilename not in counts_per_audiofile.keys(): - ref_events_this_audiofile = pd.read_csv(os.path.join(ref_file_path, inv_gt_file_structure[audiofilename], audiofilename[0:-4]+'.csv'), dtype=str) + ref_events_this_audiofile = pd.read_csv( + os.path.join( + ref_file_path, + inv_gt_file_structure[audiofilename], + audiofilename[0:-4] + ".csv", + ), + dtype=str, + ) # sort ref_events by starttime - ref_events_this_audiofile = ref_events_this_audiofile.sort_values(by='Starttime', axis=0, ascending=True) - total_n_pos_events_in_audiofile = len(select_events_with_value(ref_events_this_audiofile, value=POS_VALUE)) - counts_per_audiofile[audiofilename] = {"TP": 0, "FP": 0, "FN": total_n_pos_events_in_audiofile, "total_n_pos_events": total_n_pos_events_in_audiofile} - - - - - # aggregate the counts per class or subset: + ref_events_this_audiofile = ref_events_this_audiofile.sort_values( + by="Starttime", axis=0, ascending=True + ) + total_n_pos_events_in_audiofile = len( + select_events_with_value(ref_events_this_audiofile, value=POS_VALUE) + ) + counts_per_audiofile[audiofilename] = { + "TP": 0, + "FP": 0, + "FN": total_n_pos_events_in_audiofile, + "total_n_pos_events": total_n_pos_events_in_audiofile, + } + + # aggregate the counts per class or subset: list_sets_in_mainset = list(dataset_metadata[dataset].keys()) counts_per_class_per_set = {} @@ -231,7 +294,7 @@ def evaluate(pred_file_path, ref_file_path, team_name, dataset, savepath, metada scores_per_audiofile = {} for data_set in list_sets_in_mainset: # print(data_set) - + if metadata: list_classes_in_set = list(dataset_metadata[dataset][data_set].keys()) @@ -248,7 +311,9 @@ def evaluate(pred_file_path, ref_file_path, team_name, dataset, savepath, metada fp = 0 total_n_pos_events_this_class = 0 for audiofile in list_audiofiles_this_class: - scores_per_audiofile[audiofile] = compute_scores_from_counts(counts_per_audiofile[audiofile]) + scores_per_audiofile[audiofile] = compute_scores_from_counts( + counts_per_audiofile[audiofile] + ) tp = tp + counts_per_audiofile[audiofile]["TP"] tp_set = tp_set + counts_per_audiofile[audiofile]["TP"] @@ -256,70 +321,126 @@ def evaluate(pred_file_path, ref_file_path, team_name, dataset, savepath, metada fn_set = fn_set + counts_per_audiofile[audiofile]["FN"] fp = fp + counts_per_audiofile[audiofile]["FP"] fp_set = fp_set + counts_per_audiofile[audiofile]["FP"] - total_n_pos_events_this_class = total_n_pos_events_this_class + 
counts_per_audiofile[audiofile]["total_n_pos_events"] - total_n_events_set = total_n_events_set + counts_per_audiofile[audiofile]["total_n_pos_events"] - + total_n_pos_events_this_class = ( + total_n_pos_events_this_class + + counts_per_audiofile[audiofile]["total_n_pos_events"] + ) + total_n_events_set = ( + total_n_events_set + + counts_per_audiofile[audiofile]["total_n_pos_events"] + ) + # counts_per_class[cl] = {"TP":tp, "FN": fn, "FP": fp, "total_n_pos_events_this_class": total_n_pos_events_this_class} - counts_per_class_per_set[data_set][cl] = {"TP": tp, "FN": fn, "FP": fp, "total_n_pos_events_this_class": total_n_pos_events_this_class} - counts_per_set[data_set] = {"TP": tp_set, "FN": fn_set, "FP": fp_set, "total_n_pos_events_this_set": total_n_events_set} - - # compute scores per subset. - scores_per_set[data_set] = compute_scores_from_counts(counts_per_set[data_set]) + counts_per_class_per_set[data_set][cl] = { + "TP": tp, + "FN": fn, + "FP": fp, + "total_n_pos_events_this_class": total_n_pos_events_this_class, + } + counts_per_set[data_set] = { + "TP": tp_set, + "FN": fn_set, + "FP": fp_set, + "total_n_pos_events_this_set": total_n_events_set, + } + + # compute scores per subset. + scores_per_set[data_set] = compute_scores_from_counts( + counts_per_set[data_set] + ) # compute scores per class - scores_per_class_per_set[data_set] = compute_scores_per_class(counts_per_class_per_set[data_set]) - - + scores_per_class_per_set[data_set] = compute_scores_per_class( + counts_per_class_per_set[data_set] + ) + else: list_audiofiles_in_set = dataset_metadata[dataset][data_set] tp = 0 fn = 0 fp = 0 total_n_pos_events_this_set = 0 - for audiofile in list_audiofiles_in_set: - - scores_per_audiofile[audiofile] = compute_scores_from_counts(counts_per_audiofile[audiofile]) + for audiofile in list_audiofiles_in_set: + scores_per_audiofile[audiofile] = compute_scores_from_counts( + counts_per_audiofile[audiofile] + ) tp = tp + counts_per_audiofile[audiofile]["TP"] fn = fn + counts_per_audiofile[audiofile]["FN"] fp = fp + counts_per_audiofile[audiofile]["FP"] - total_n_pos_events_this_set = total_n_pos_events_this_set + counts_per_audiofile[audiofile]["total_n_pos_events"] - counts_per_set[data_set] = {"TP": tp, "FN": fn, "FP": fp, "total_n_pos_events_this_set": total_n_pos_events_this_set} - + total_n_pos_events_this_set = ( + total_n_pos_events_this_set + + counts_per_audiofile[audiofile]["total_n_pos_events"] + ) + counts_per_set[data_set] = { + "TP": tp, + "FN": fn, + "FP": fp, + "total_n_pos_events_this_set": total_n_pos_events_this_set, + } + # compute scores per subset - scores_per_set[data_set] = compute_scores_from_counts(counts_per_set[data_set]) - - overall_scores = {"precision" : stats.hmean([scores_per_set[dt]["precision"] for dt in scores_per_set.keys()]), - "recall": stats.hmean([scores_per_set[dt]["recall"] for dt in scores_per_set.keys()]) , - "fmeasure (percentage)": np.round(stats.hmean([scores_per_set[dt]["f-measure"] for dt in scores_per_set.keys()])*100, 3) - } - - print("\nOverall_scores:", overall_scores) + scores_per_set[data_set] = compute_scores_from_counts( + counts_per_set[data_set] + ) + + overall_scores = { + "precision": stats.hmean( + [scores_per_set[dt]["precision"] for dt in scores_per_set.keys()] + ), + "recall": stats.hmean( + [scores_per_set[dt]["recall"] for dt in scores_per_set.keys()] + ), + "fmeasure (percentage)": np.round( + stats.hmean( + [scores_per_set[dt]["f-measure"] for dt in scores_per_set.keys()] + ) + * 100, + 3, + ), + } + + 
print("\nOverall_scores:", overall_scores) print("\nwriting report") if metadata: - build_report(overall_scores, scores_per_set, scores_per_audiofile, - savepath, - dataset, - team_name, - scores_per_class=scores_per_class_per_set) + build_report( + overall_scores, + scores_per_set, + scores_per_audiofile, + savepath, + dataset, + team_name, + scores_per_class=scores_per_class_per_set, + ) else: - build_report(overall_scores, scores_per_set, scores_per_audiofile, - savepath, - dataset, - team_name) - + build_report( + overall_scores, + scores_per_set, + scores_per_audiofile, + savepath, + dataset, + team_name, + ) + return if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('-pred_file', type=str, help='csv predictions file') - parser.add_argument('-ref_files_path', type=str, help='path to the ground truth csvs folder') - parser.add_argument('-metadata', type=str, help="path for metadata json. Participants may ignore this option.") - parser.add_argument('-team_name', type=str, help='team identification') - parser.add_argument('-dataset', type=str, help="which set to evaluate: EVAL or VAL") - parser.add_argument('-savepath', type=str, help="path where to save the report to") + parser.add_argument("-pred_file", type=str, help="csv predictions file") + parser.add_argument( + "-ref_files_path", type=str, help="path to the ground truth csvs folder" + ) + parser.add_argument( + "-metadata", + type=str, + help="path for metadata json. Participants may ignore this option.", + ) + parser.add_argument("-team_name", type=str, help="team identification") + parser.add_argument("-dataset", type=str, help="which set to evaluate: EVAL or VAL") + parser.add_argument("-savepath", type=str, help="path where to save the report to") args = parser.parse_args() - evaluate( args.pred_file, args.ref_files_path, args.team_name, args.dataset, args.savepath) + evaluate( + args.pred_file, args.ref_files_path, args.team_name, args.dataset, args.savepath + ) - # docker run -v $PWD:/app -v /data/Prosjekter3/823001_19_metodesats_analyse_23_36_cretois/:/data --gpus all beats poetry run python evaluate/evaluation_metrics/evaluation.py -pred_file /data/eval_out.csv -ref_files_path /data/DCASE/Development_Set_annotations/Validation_Set -team_name BEATs -dataset VAL -savepath /data/. \ No newline at end of file + # docker run -v $PWD:/app -v /data/Prosjekter3/823001_19_metodesats_analyse_23_36_cretois/:/data --gpus all beats poetry run python evaluate/evaluation_metrics/evaluation.py -pred_file /data/eval_out.csv -ref_files_path /data/DCASE/Development_Set_annotations/Validation_Set -team_name BEATs -dataset VAL -savepath /data/. 
diff --git a/evaluate/evaluation_metrics/evaluation_confidence_intervals.py b/evaluate/evaluation_metrics/evaluation_confidence_intervals.py index d041a0c..ad66899 100644 --- a/evaluate/evaluation_metrics/evaluation_confidence_intervals.py +++ b/evaluate/evaluation_metrics/evaluation_confidence_intervals.py @@ -14,30 +14,32 @@ MIN_EVAL_VALUE = 0.00001 N_SHOTS = 5 MIN_IOU_TH = 0.3 -PRED_FILE_HEADER = ["Audiofilename","Starttime","Endtime"] -POS_VALUE = 'POS' -UNK_VALUE = 'UNK' +PRED_FILE_HEADER = ["Audiofilename", "Starttime", "Endtime"] +POS_VALUE = "POS" +UNK_VALUE = "UNK" + def remove_shots_from_ref(ref_df, number_shots=5): - ref_pos_indexes = select_events_with_value(ref_df, value=POS_VALUE) - ref_n_shot_index = ref_pos_indexes[number_shots-1] + ref_n_shot_index = ref_pos_indexes[number_shots - 1] # remove all events (pos and UNK) that happen before this 5th event - events_to_drop = ref_df.index[ref_df['Endtime'] <= ref_df.iloc[ref_n_shot_index]['Endtime']].tolist() + events_to_drop = ref_df.index[ + ref_df["Endtime"] <= ref_df.iloc[ref_n_shot_index]["Endtime"] + ].tolist() return ref_df.drop(events_to_drop) -def select_events_with_value(data_frame, value=POS_VALUE): +def select_events_with_value(data_frame, value=POS_VALUE): indexes_list = data_frame.index[data_frame["Q"] == value].tolist() return indexes_list -def build_matrix_from_selected_rows(data_frame, selected_indexes_list ): - matrix_data = np.ones((2, len(selected_indexes_list)))* -1 +def build_matrix_from_selected_rows(data_frame, selected_indexes_list): + matrix_data = np.ones((2, len(selected_indexes_list))) * -1 for n, idx in enumerate(selected_indexes_list): - matrix_data[0, n] = data_frame.loc[idx].Starttime # start time for event n + matrix_data[0, n] = data_frame.loc[idx].Starttime # start time for event n matrix_data[1, n] = data_frame.loc[idx].Endtime return matrix_data @@ -57,32 +59,33 @@ def compute_tp_fp_fn(pred_events_df, ref_events_df): if "Q" not in pred_events_df.columns: pred_events_df["Q"] = POS_VALUE - #sort events by starttime - pred_events_df = pred_events_df.sort_values(by='Starttime', axis=0, ascending=True) + # sort events by starttime + pred_events_df = pred_events_df.sort_values(by="Starttime", axis=0, ascending=True) pred_pos_indexes = select_events_with_value(pred_events_df, value=POS_VALUE) ref_1st_round = build_matrix_from_selected_rows(ref_events_df, ref_pos_indexes) pred_1st_round = build_matrix_from_selected_rows(pred_events_df, pred_pos_indexes) m_pos = metrics.match_events(ref_1st_round, pred_1st_round, min_iou=MIN_IOU_TH) - matched_ref_indexes = [ri for ri, pi in m_pos] + matched_ref_indexes = [ri for ri, pi in m_pos] matched_pred_indexes = [pi for ri, pi in m_pos] - ref_unk_indexes = select_events_with_value(ref_events_df, value=UNK_VALUE) ref_2nd_round = build_matrix_from_selected_rows(ref_events_df, ref_unk_indexes) - unmatched_pred_events = list(set(range(pred_1st_round.shape[1])) - set(matched_pred_indexes)) + unmatched_pred_events = list( + set(range(pred_1st_round.shape[1])) - set(matched_pred_indexes) + ) pred_2nd_round = pred_1st_round[:, unmatched_pred_events] m_unk = metrics.match_events(ref_2nd_round, pred_2nd_round, min_iou=MIN_IOU_TH) # print("# Positive matches between Ref and Pred :", len(m_pos)) # print("# matches with Unknown events: ", len(m_unk)) - + tp = len(m_pos) fp = pred_1st_round.shape[1] - tp - len(m_unk) - + ## compute unmatched pos ref events: count_unmached_pos_ref_events = len(ref_pos_indexes) - tp @@ -91,114 +94,174 @@ def compute_tp_fp_fn(pred_events_df, 
ref_events_df): total_n_POS_events = len(ref_pos_indexes) return tp, fp, fn, total_n_POS_events -def compute_scores_per_class(counts_per_class): +def compute_scores_per_class(counts_per_class): scores_per_class = {} for cl in counts_per_class.keys(): tp = counts_per_class[cl]["TP"] fp = counts_per_class[cl]["FP"] fn = counts_per_class[cl]["FN"] - # to compute the harmonic mean we need to have all entries as non zero - precision = tp/(tp+fp) if tp+fp != 0 else MIN_EVAL_VALUE # case where no predictions were made + precision = ( + tp / (tp + fp) if tp + fp != 0 else MIN_EVAL_VALUE + ) # case where no predictions were made if precision < MIN_EVAL_VALUE: precision = MIN_EVAL_VALUE - recall = tp/(fn+tp) if tp != 0 else MIN_EVAL_VALUE - fmeasure = tp/(tp+0.5*(fp+fn)) if tp != 0 else MIN_EVAL_VALUE + recall = tp / (fn + tp) if tp != 0 else MIN_EVAL_VALUE + fmeasure = tp / (tp + 0.5 * (fp + fn)) if tp != 0 else MIN_EVAL_VALUE - scores_per_class[cl] = {"precision": precision, "recall": recall, "f-measure": fmeasure} + scores_per_class[cl] = { + "precision": precision, + "recall": recall, + "f-measure": fmeasure, + } return scores_per_class - + + def compute_scores_from_counts(counts): tp = counts["TP"] fp = counts["FP"] fn = counts["FN"] # to compute the harmonic mean we need to have all entries as non zero - precision = tp/(tp+fp) if tp+fp != 0 else MIN_EVAL_VALUE # case where no predictions were made + precision = ( + tp / (tp + fp) if tp + fp != 0 else MIN_EVAL_VALUE + ) # case where no predictions were made if precision < MIN_EVAL_VALUE: - precision = MIN_EVAL_VALUE - recall = tp/(fn+tp) if tp != 0 else MIN_EVAL_VALUE - fmeasure = tp/(tp+0.5*(fp+fn)) if tp != 0 else MIN_EVAL_VALUE + precision = MIN_EVAL_VALUE + recall = tp / (fn + tp) if tp != 0 else MIN_EVAL_VALUE + fmeasure = tp / (tp + 0.5 * (fp + fn)) if tp != 0 else MIN_EVAL_VALUE scores = {"precision": precision, "recall": recall, "f-measure": fmeasure} - - return scores + return scores -def build_report(main_set_scores, scores_per_miniset, scores_per_audiofile, save_path, main_set_name="EVAL", team_name="test_team" , **kwargs): - +def build_report( + main_set_scores, + scores_per_miniset, + scores_per_audiofile, + save_path, + main_set_name="EVAL", + team_name="test_team", + **kwargs +): # datetime object containing current date and time now = datetime.now() date_string = now.strftime("%d%m%Y_%H_%M_%S") - # print("date and time =", date_string) + # print("date and time =", date_string) - #make dict: + # make dict: report = { - 'team_name': team_name, - "set_name": main_set_name, - "report_date": date_string, - "overall_scores": main_set_scores, - "scores_per_subset": scores_per_miniset, - "scores_per_audiofile": scores_per_audiofile + "team_name": team_name, + "set_name": main_set_name, + "report_date": date_string, + "overall_scores": main_set_scores, + "scores_per_subset": scores_per_miniset, + "scores_per_audiofile": scores_per_audiofile, } if "scores_per_class" in kwargs.keys(): - report["scores_per_class"] = kwargs['scores_per_class'] - - with open(os.path.join(save_path,"Evaluation_report_" + team_name + "_" + main_set_name + '_' + date_string + '.json'), 'w') as outfile: + report["scores_per_class"] = kwargs["scores_per_class"] + + with open( + os.path.join( + save_path, + "Evaluation_report_" + + team_name + + "_" + + main_set_name + + "_" + + date_string + + ".json", + ), + "w", + ) as outfile: json.dump(report, outfile) return -def build_mini_report_bootstrapped_results(low, high, meanFmeasure, low_precision, mean_precision, 
high_precision, low_recall, - mean_recall, high_recall, save_path, main_set_name="EVAL", team_name="test_team" ): - +def build_mini_report_bootstrapped_results( + low, + high, + meanFmeasure, + low_precision, + mean_precision, + high_precision, + low_recall, + mean_recall, + high_recall, + save_path, + main_set_name="EVAL", + team_name="test_team", +): # datetime object containing current date and time now = datetime.now() date_string = now.strftime("%d%m%Y_%H_%M_%S") - # print("date and time =", date_string) + # print("date and time =", date_string) - #make dict: + # make dict: report = { - 'team_name': team_name, - "set_name": main_set_name, - "report_date": date_string, - "fmeasure": {"low": low, "mean": meanFmeasure, "high":high}, - "precision": {"low": low_precision, "mean": mean_precision, "high":high_precision}, - "recall": {"low": low_recall, "mean": mean_recall, "high":high_recall} + "team_name": team_name, + "set_name": main_set_name, + "report_date": date_string, + "fmeasure": {"low": low, "mean": meanFmeasure, "high": high}, + "precision": { + "low": low_precision, + "mean": mean_precision, + "high": high_precision, + }, + "recall": {"low": low_recall, "mean": mean_recall, "high": high_recall}, } - - with open(os.path.join(save_path,"Evaluation_report_" + team_name + "_" + main_set_name + '_' + date_string + '.json'), 'w') as outfile: + + with open( + os.path.join( + save_path, + "Evaluation_report_" + + team_name + + "_" + + main_set_name + + "_" + + date_string + + ".json", + ), + "w", + ) as outfile: json.dump(report, outfile) return -def evaluate_bootstrapped(pred_file_path, ref_file_path, team_name, dataset, savepath, bootstraps = 1000): - #computes overall scores with 95% confidence intervals - #generates report. +def evaluate_bootstrapped( + pred_file_path, ref_file_path, team_name, dataset, savepath, bootstraps=1000 +): + # computes overall scores with 95% confidence intervals + # generates report. print("\nEvaluation for:", team_name, dataset) - #read Gt file structure: get subsets and paths for ref csvs make an inverted dictionary with audiofilenames as keys and folder as value + # read Gt file structure: get subsets and paths for ref csvs make an inverted dictionary with audiofilenames as keys and folder as value gt_file_structure = {} gt_file_structure[dataset] = {} inv_gt_file_structure = {} list_of_subsets = os.listdir(ref_file_path) for subset in list_of_subsets: - gt_file_structure[dataset][subset] = [os.path.basename(fl)[0:-4]+'.wav' for fl in glob.glob(os.path.join(ref_file_path,subset,"*.csv"))] + gt_file_structure[dataset][subset] = [ + os.path.basename(fl)[0:-4] + ".wav" + for fl in glob.glob(os.path.join(ref_file_path, subset, "*.csv")) + ] for audiofile in gt_file_structure[dataset][subset]: inv_gt_file_structure[audiofile] = subset - - #read prediction csv + # read prediction csv pred_csv = pd.read_csv(pred_file_path, dtype=str) - #verify headers: - if list(pred_csv.columns) != PRED_FILE_HEADER: - print('Please correct the header of the prediction file. This should be', PRED_FILE_HEADER) + # verify headers: + if list(pred_csv.columns) != PRED_FILE_HEADER: + print( + "Please correct the header of the prediction file. 
This should be", + PRED_FILE_HEADER, + ) exit(1) overall_fmeasures_bootstrapped = [] @@ -208,29 +271,52 @@ def evaluate_bootstrapped(pred_file_path, ref_file_path, team_name, dataset, sav for bi in tqdm(range(bootstraps)): # parse prediction csv - #remove predictions for bootstrapping - prediction_indexes_to_remove = np.random.choice(range(len(pred_csv)), round(0.05*len(pred_csv)), replace=False ) + # remove predictions for bootstrapping + prediction_indexes_to_remove = np.random.choice( + range(len(pred_csv)), round(0.05 * len(pred_csv)), replace=False + ) pred_csv_new = pred_csv.drop(prediction_indexes_to_remove) # split remaining predictions into lists of events for the same audiofile. - pred_events_by_audiofile = dict(tuple(pred_csv_new.groupby('Audiofilename'))) + pred_events_by_audiofile = dict(tuple(pred_csv_new.groupby("Audiofilename"))) counts_per_audiofile = {} for audiofilename in list(pred_events_by_audiofile.keys()): - - # for each audiofile, load correcponding GT File (audiofilename.csv) - ref_events_this_audiofile_all = pd.read_csv(os.path.join(ref_file_path, inv_gt_file_structure[audiofilename], audiofilename[0:-4]+'.csv'), dtype={'Starttime':np.float64, 'Endtime': np.float64}) + ref_events_this_audiofile_all = pd.read_csv( + os.path.join( + ref_file_path, + inv_gt_file_structure[audiofilename], + audiofilename[0:-4] + ".csv", + ), + dtype={"Starttime": np.float64, "Endtime": np.float64}, + ) # sort events by starttime: - ref_events_this_audiofile_all = ref_events_this_audiofile_all.sort_values(by='Starttime', axis=0, ascending=True) - - #Remove the 5 shots from GT: - ref_events_this_audiofile = remove_shots_from_ref(ref_events_this_audiofile_all, number_shots=N_SHOTS) - - # compare and get counts: TP, FP .. - tp_count, fp_count, fn_count , total_n_events_in_audiofile = compute_tp_fp_fn(pred_events_by_audiofile[audiofilename], ref_events_this_audiofile ) - - counts_per_audiofile[audiofilename] = {"TP": tp_count, "FP": fp_count, "FN": fn_count, "total_n_pos_events": total_n_events_in_audiofile} + ref_events_this_audiofile_all = ref_events_this_audiofile_all.sort_values( + by="Starttime", axis=0, ascending=True + ) + + # Remove the 5 shots from GT: + ref_events_this_audiofile = remove_shots_from_ref( + ref_events_this_audiofile_all, number_shots=N_SHOTS + ) + + # compare and get counts: TP, FP .. 
+ ( + tp_count, + fp_count, + fn_count, + total_n_events_in_audiofile, + ) = compute_tp_fp_fn( + pred_events_by_audiofile[audiofilename], ref_events_this_audiofile + ) + + counts_per_audiofile[audiofilename] = { + "TP": tp_count, + "FP": fp_count, + "FN": fn_count, + "total_n_pos_events": total_n_events_in_audiofile, + } # print(audiofilename, counts_per_audiofile[audiofilename]) dataset_metadata = copy.deepcopy(gt_file_structure) @@ -242,13 +328,29 @@ def evaluate_bootstrapped(pred_file_path, ref_file_path, team_name, dataset, sav for audiofilename in list_all_audiofiles: if audiofilename not in counts_per_audiofile.keys(): - ref_events_this_audiofile = pd.read_csv(os.path.join(ref_file_path, inv_gt_file_structure[audiofilename], audiofilename[0:-4]+'.csv'), dtype=str) + ref_events_this_audiofile = pd.read_csv( + os.path.join( + ref_file_path, + inv_gt_file_structure[audiofilename], + audiofilename[0:-4] + ".csv", + ), + dtype=str, + ) # sort ref_events by starttime - ref_events_this_audiofile = ref_events_this_audiofile.sort_values(by='Starttime', axis=0, ascending=True) - total_n_pos_events_in_audiofile = len(select_events_with_value(ref_events_this_audiofile, value=POS_VALUE)) - counts_per_audiofile[audiofilename] = {"TP": 0, "FP": 0, "FN": total_n_pos_events_in_audiofile, "total_n_pos_events": total_n_pos_events_in_audiofile} - - # aggregate the counts per class or subset: + ref_events_this_audiofile = ref_events_this_audiofile.sort_values( + by="Starttime", axis=0, ascending=True + ) + total_n_pos_events_in_audiofile = len( + select_events_with_value(ref_events_this_audiofile, value=POS_VALUE) + ) + counts_per_audiofile[audiofilename] = { + "TP": 0, + "FP": 0, + "FN": total_n_pos_events_in_audiofile, + "total_n_pos_events": total_n_pos_events_in_audiofile, + } + + # aggregate the counts per class or subset: list_sets_in_mainset = list(dataset_metadata[dataset].keys()) counts_per_class_per_set = {} @@ -258,63 +360,100 @@ def evaluate_bootstrapped(pred_file_path, ref_file_path, team_name, dataset, sav scores_per_audiofile = {} for data_set in list_sets_in_mainset: # print(data_set) - + list_audiofiles_in_set = dataset_metadata[dataset][data_set] tp = 0 fn = 0 fp = 0 total_n_pos_events_this_set = 0 - for audiofile in list_audiofiles_in_set: - - scores_per_audiofile[audiofile] = compute_scores_from_counts(counts_per_audiofile[audiofile]) + for audiofile in list_audiofiles_in_set: + scores_per_audiofile[audiofile] = compute_scores_from_counts( + counts_per_audiofile[audiofile] + ) tp = tp + counts_per_audiofile[audiofile]["TP"] fn = fn + counts_per_audiofile[audiofile]["FN"] fp = fp + counts_per_audiofile[audiofile]["FP"] - total_n_pos_events_this_set = total_n_pos_events_this_set + counts_per_audiofile[audiofile]["total_n_pos_events"] - counts_per_set[data_set] = {"TP": tp, "FN": fn, "FP": fp, "total_n_pos_events_this_set": total_n_pos_events_this_set} - + total_n_pos_events_this_set = ( + total_n_pos_events_this_set + + counts_per_audiofile[audiofile]["total_n_pos_events"] + ) + counts_per_set[data_set] = { + "TP": tp, + "FN": fn, + "FP": fp, + "total_n_pos_events_this_set": total_n_pos_events_this_set, + } + # compute scores per subset - scores_per_set[data_set] = compute_scores_from_counts(counts_per_set[data_set]) - - overall_scores = {"precision" : stats.hmean([scores_per_set[dt]["precision"] for dt in scores_per_set.keys()]), - "recall": stats.hmean([scores_per_set[dt]["recall"] for dt in scores_per_set.keys()]) , - "fmeasure (percentage)": 
np.round(stats.hmean([scores_per_set[dt]["f-measure"] for dt in scores_per_set.keys()])*100, 3)
-                    }
+            scores_per_set[data_set] = compute_scores_from_counts(
+                counts_per_set[data_set]
+            )
+
+        overall_scores = {
+            "precision": stats.hmean(
+                [scores_per_set[dt]["precision"] for dt in scores_per_set.keys()]
+            ),
+            "recall": stats.hmean(
+                [scores_per_set[dt]["recall"] for dt in scores_per_set.keys()]
+            ),
+            "fmeasure (percentage)": np.round(
+                stats.hmean(
+                    [scores_per_set[dt]["f-measure"] for dt in scores_per_set.keys()]
+                )
+                * 100,
+                3,
+            ),
+        }
 
         overall_fmeasures_bootstrapped.append(overall_scores["fmeasure (percentage)"])
         overall_precision_bootstrapped.append(overall_scores["precision"])
         overall_recall_bootstrapped.append(overall_scores["recall"])
-    
-    overall_high_fmeasure = np.percentile(overall_fmeasures_bootstrapped,97.5)
+
+    overall_high_fmeasure = np.percentile(overall_fmeasures_bootstrapped, 97.5)
     overall_low_fmeasure = np.percentile(overall_fmeasures_bootstrapped, 2.5)
     overall_mean_fmeasure = np.mean(overall_fmeasures_bootstrapped)
 
-    overall_high_precision = np.percentile(overall_precision_bootstrapped,97.5)
+    overall_high_precision = np.percentile(overall_precision_bootstrapped, 97.5)
     overall_low_precision = np.percentile(overall_precision_bootstrapped, 2.5)
     overall_mean_precision = np.mean(overall_precision_bootstrapped)
 
-    overall_high_recall = np.percentile(overall_recall_bootstrapped,97.5)
+    overall_high_recall = np.percentile(overall_recall_bootstrapped, 97.5)
     overall_low_recall = np.percentile(overall_recall_bootstrapped, 2.5)
     overall_mean_recall = np.mean(overall_recall_bootstrapped)
 
     # print(overall_low_fmeasure, '<',overall_mean_fmeasure, '<', overall_high_fmeasure )
     # print('min:', np.min(overall_fmeasures_bootstrapped), '------ max:', np.max(overall_fmeasures_bootstrapped))
 
-    build_mini_report_bootstrapped_results(overall_low_fmeasure, overall_high_fmeasure, overall_mean_fmeasure, 
-                                           overall_high_precision, overall_low_precision, overall_mean_precision, overall_high_recall, 
-                                           overall_low_recall, overall_mean_recall, savepath, dataset, team_name)
-    
+    build_mini_report_bootstrapped_results(
+        overall_low_fmeasure,
+        overall_high_fmeasure,
+        overall_mean_fmeasure,
+        overall_low_precision,
+        overall_mean_precision,
+        overall_high_precision,
+        overall_low_recall,
+        overall_mean_recall,
+        overall_high_recall,
+        savepath,
+        dataset,
+        team_name,
+    )
+
     return
 
 
 if __name__ == "__main__":
-
     parser = argparse.ArgumentParser()
-    parser.add_argument('-pred_file', type=str, help='csv predictions file')
-    parser.add_argument('-ref_files_path', type=str, help='path to the ground truth csvs folder')
+    parser.add_argument("-pred_file", type=str, help="csv predictions file")
+    parser.add_argument(
+        "-ref_files_path", type=str, help="path to the ground truth csvs folder"
+    )
     # parser.add_argument('-metadata', type=str, help="path for metadata json. 
Participants may ignore this option.")
-    parser.add_argument('-team_name', type=str, help='team identification')
-    parser.add_argument('-dataset', type=str, help="which set to evaluate: EVAL or VAL")
-    parser.add_argument('-savepath', type=str, help="path where to save the report to")
+    parser.add_argument("-team_name", type=str, help="team identification")
+    parser.add_argument("-dataset", type=str, help="which set to evaluate: EVAL or VAL")
+    parser.add_argument("-savepath", type=str, help="path where to save the report to")
 
     args = parser.parse_args()
-
-
-    evaluate_bootstrapped(args.pred_file_path,args.ref_file_path, args.team_name, args.dataset, args.savepath)
-    
\ No newline at end of file
+    evaluate_bootstrapped(
+        args.pred_file,
+        args.ref_files_path,
+        args.team_name,
+        args.dataset,
+        args.savepath,
+    )
diff --git a/evaluate/evaluation_metrics/metrics.py b/evaluate/evaluation_metrics/metrics.py
index 2514b6d..35cc545 100644
--- a/evaluate/evaluation_metrics/metrics.py
+++ b/evaluate/evaluation_metrics/metrics.py
@@ -210,4 +210,4 @@ def slow_intersect(ref, est):
             ]
         )
     )
-    return matches
\ No newline at end of file
+    return matches
diff --git a/prototypicalbeats/BEATs_on_miniECS50/data_utils/miniESC50.py b/prototypicalbeats/BEATs_on_miniECS50/data_utils/miniESC50.py
index a9ff3b2..12e95dc 100755
--- a/prototypicalbeats/BEATs_on_miniECS50/data_utils/miniESC50.py
+++ b/prototypicalbeats/BEATs_on_miniECS50/data_utils/miniESC50.py
@@ -4,21 +4,26 @@
 import os
 import shutil
 
+
 def few_shot_sample(data_frame: pd.DataFrame, classes: list, n_samples: int, seed: int):
     cat_dfs = []
     for c in classes:
-        cat_df = data_frame[data_frame["category"] == c].sample(n_samples, random_state=seed)
+        cat_df = data_frame[data_frame["category"] == c].sample(
+            n_samples, random_state=seed
+        )
         cat_dfs.append(cat_df)
     full_df = pd.concat(cat_dfs)
     return full_df
 
+
 def copy_to_folder(data_frame: pd.DataFrame, target_folder: str):
-    for i in range(0,len(data_frame)):
+    for i in range(0, len(data_frame)):
         fpath = data_frame.iloc[i]["filepath"]
         fname = data_frame.iloc[i]["filename"]
         outpath = os.path.join(target_folder, fname)
         shutil.copy(fpath, outpath)
 
+
 def split_data(data_frame, train_samples, val_samples):
     train_dfs = []
     val_dfs = []
@@ -37,8 +42,8 @@ def split_data(data_frame, train_samples, val_samples):
     test_df = pd.concat(test_dfs)
     return train_df, val_df, test_df
 
-if __name__ == "__main__":
 
+if __name__ == "__main__":
     root_dir = "/data/ESC-50-master"
     csv_file = "/data/ESC-50-master/meta/esc50.csv"
     target_path = "/data/ESC50mini"
@@ -47,7 +52,7 @@ def split_data(data_frame, train_samples, val_samples):
 
     fpath_list = []
 
-    for i in range(0,len(data_frame)):
+    for i in range(0, len(data_frame)):
         # List all the full path
         fname = data_frame.iloc[i]["filename"]
         fpath = os.path.join(root_dir, "audio", fname)
@@ -85,4 +90,3 @@ def split_data(data_frame, train_samples, val_samples):
     copy_to_folder(train_df, os.path.join(target_path, "audio/train"))
     copy_to_folder(val_df, os.path.join(target_path, "audio/val"))
     copy_to_folder(test_df, os.path.join(target_path, "audio/test"))
-
diff --git a/prototypicalbeats/BEATs_on_miniECS50/evaluate.py b/prototypicalbeats/BEATs_on_miniECS50/evaluate.py
index 75e8407..14f3abb 100644
--- a/prototypicalbeats/BEATs_on_miniECS50/evaluate.py
+++ b/prototypicalbeats/BEATs_on_miniECS50/evaluate.py
@@ -15,6 +15,7 @@
 from prototypicalbeats.prototraining import ProtoBEATsModel
 from BEATs_on_miniECS50.miniECS50DataModule import miniECS50DataModule
 
+
 def evaluate_on_one_task(
     support_images: 
torch.Tensor, support_labels: torch.Tensor, @@ -22,7 +23,7 @@ def evaluate_on_one_task( query_labels: torch.Tensor, ) -> [int, int, torch.Tensor, torch.Tensor]: """ - Returns the number of correct predictions of query labels, the total + Returns the number of correct predictions of query labels, the total number of predictions, and the coordinates of the prototypes and query images. """ prototypes = model.get_prototypes(support_images.cuda(), support_labels.cuda()) @@ -30,18 +31,25 @@ def evaluate_on_one_task( query_embeddings = query_embeddings.detach().cpu() prototypes = prototypes.detach().cpu() return ( - torch.max( - model( - support_images.cuda(), - support_labels.cuda(), - query_images.cuda(), - ) - .detach() - .data, - 1, - )[1] - == query_labels.cuda() - ).sum().item(), len(query_labels), prototypes, query_embeddings + ( + torch.max( + model( + support_images.cuda(), + support_labels.cuda(), + query_images.cuda(), + ) + .detach() + .data, + 1, + )[1] + == query_labels.cuda() + ) + .sum() + .item(), + len(query_labels), + prototypes, + query_embeddings, + ) def evaluate(data_loader: DataLoader): @@ -63,7 +71,6 @@ def evaluate(data_loader: DataLoader): query_labels, class_ids, ) in tqdm(enumerate(data_loader), total=len(data_loader)): - correct, total, prototypes, query_embeddings = evaluate_on_one_task( support_images, support_labels, query_images, query_labels ) @@ -84,26 +91,36 @@ def evaluate(data_loader: DataLoader): return all_prototypes, all_query_embeddings, all_query_labels + def get_2d_features(features, perplexity): return TSNE(n_components=2, perplexity=perplexity).fit_transform(features) -def get_figure(features_2d, labels, fig_name): +def get_figure(features_2d, labels, fig_name): query_2d = features_2d[5:] query_labels = labels[5:] proto_2d = features_2d[:5] proto_labels = labels[:5] - fig = sns.scatterplot(x=query_2d[:, 0], y=query_2d[:, 1], hue=query_labels, palette="deep") - sns.scatterplot(x=proto_2d[:, 0], y=proto_2d[:, 1], hue=proto_labels, palette="deep", marker='s', s=100) - + fig = sns.scatterplot( + x=query_2d[:, 0], y=query_2d[:, 1], hue=query_labels, palette="deep" + ) + sns.scatterplot( + x=proto_2d[:, 0], + y=proto_2d[:, 1], + hue=proto_labels, + palette="deep", + marker="s", + s=100, + ) + sns.move_legend(fig, "upper left", bbox_to_anchor=(1, 1)) fig.get_figure().savefig(fig_name, bbox_inches="tight") plt.show() -if __name__ == "__main__": +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -120,23 +137,38 @@ def get_figure(features_2d, labels, fig_name): model = ProtoBEATsModel() else: model = ProtoBEATsModel() - checkpoints = torch.load(cli_args.model_path) # "/app/prototypicalbeats/lightning_logs/version_49/checkpoints/epoch=14-step=1500.ckpt" + checkpoints = torch.load( + cli_args.model_path + ) # "/app/prototypicalbeats/lightning_logs/version_49/checkpoints/epoch=14-step=1500.ckpt" model.load_state_dict(checkpoints["state_dict"]) test_loader = miniECS50DataModule().test_dataloader() all_prototypes, all_query_embeddings, all_query_labels = evaluate(test_loader) # Reshape all_prototypes and all_query_embeddings - all_prototypes_r = all_prototypes[:,-1,:].reshape(10,5,768) - all_query_embeddings_r = all_query_embeddings[:,-1,:].reshape(10,100,768) + all_prototypes_r = all_prototypes[:, -1, :].reshape(10, 5, 768) + all_query_embeddings_r = all_query_embeddings[:, -1, :].reshape(10, 100, 768) # Select a particular embedding - IN WORK MEAN ACROSS EACH TENSOR - prototype_s = all_prototypes_r[1,:,:] - query_embeddings_s = 
all_query_embeddings_r[1,:,:] + prototype_s = all_prototypes_r[1, :, :] + query_embeddings_s = all_query_embeddings_r[1, :, :] query_labels_s = all_query_labels_r = all_query_labels[100:200] proto_query = torch.cat([prototype_s, query_embeddings_s]) - all_labels = torch.cat([torch.tensor([5,5,5,5,5,]), query_labels_s]) + all_labels = torch.cat( + [ + torch.tensor( + [ + 5, + 5, + 5, + 5, + 5, + ] + ), + query_labels_s, + ] + ) features_2d = get_2d_features(proto_query, perplexity=7) - get_figure(features_2d, all_labels, "protoembeddings.png") \ No newline at end of file + get_figure(features_2d, all_labels, "protoembeddings.png") diff --git a/prototypicalbeats/BEATs_on_miniECS50/miniECS50DataModule.py b/prototypicalbeats/BEATs_on_miniECS50/miniECS50DataModule.py index 1520c5e..8e693ad 100644 --- a/prototypicalbeats/BEATs_on_miniECS50/miniECS50DataModule.py +++ b/prototypicalbeats/BEATs_on_miniECS50/miniECS50DataModule.py @@ -9,7 +9,10 @@ from pytorch_lightning import LightningDataModule from data_utils.dataset import TaskSampler, AudioDataset -def few_shot_dataloader(root_dir, data_frame, n_way, n_shot, n_query, n_tasks, transform = None): + +def few_shot_dataloader( + root_dir, data_frame, n_way, n_shot, n_query, n_tasks, transform=None +): """ root_dir: directory where the audio data is stored data_frame: path to the label file @@ -17,25 +20,23 @@ def few_shot_dataloader(root_dir, data_frame, n_way, n_shot, n_query, n_tasks, t n_shot: number of images PER CLASS in the support set n_query: number of images PER CLASSS in the query set n_tasks: number of episodes (number of times the loader gives the data during a training step) - """ - - df = AudioDataset( - root_dir=root_dir, data_frame=data_frame, transform=transform - ) + """ + + df = AudioDataset(root_dir=root_dir, data_frame=data_frame, transform=transform) sampler = TaskSampler( - df, - n_way=n_way, # number of classes - n_shot=n_shot, # Number of images PER CLASS in the support set - n_query=n_query, # Number of images PER CLASSS in the query set - n_tasks=n_tasks # Not sure + df, + n_way=n_way, # number of classes + n_shot=n_shot, # Number of images PER CLASS in the support set + n_query=n_query, # Number of images PER CLASSS in the query set + n_tasks=n_tasks, # Not sure ) loader = DataLoader( df, batch_sampler=sampler, pin_memory=False, - collate_fn=sampler.episodic_collate_fn + collate_fn=sampler.episodic_collate_fn, ) return loader @@ -52,7 +53,7 @@ def __init__( csv_file_test: str = "/data/ESC50mini/meta/esc50mini_test.csv", n_task_train: int = 100, n_task_val: int = 100, - n_task_test: int = 10 , + n_task_test: int = 10, transform=None, **kwargs ): @@ -74,41 +75,42 @@ def prepare_data(self): pass def setup(self, stage=None): - self.train_set= pd.read_csv(self.csv_file_train) + self.train_set = pd.read_csv(self.csv_file_train) self.val_set = pd.read_csv(self.csv_file_val) self.test_set = pd.read_csv(self.csv_file_test) def train_dataloader(self): - - train_loader = few_shot_dataloader(self.root_dir_train, - self.train_set, - n_way=5, - n_shot=5, - n_query=5, - n_tasks=self.n_task_train, - transform=self.transform) + train_loader = few_shot_dataloader( + self.root_dir_train, + self.train_set, + n_way=5, + n_shot=5, + n_query=5, + n_tasks=self.n_task_train, + transform=self.transform, + ) return train_loader def val_dataloader(self): - - val_loader = few_shot_dataloader(self.root_dir_val, - self.val_set, - n_way=5, - n_shot=3, - n_query=2, - n_tasks=self.n_task_val, - transform=self.transform) + val_loader = few_shot_dataloader( 
+ self.root_dir_val, + self.val_set, + n_way=5, + n_shot=3, + n_query=2, + n_tasks=self.n_task_val, + transform=self.transform, + ) return val_loader - - def test_dataloader(self): - test_loader = few_shot_dataloader(self.root_dir_test, - self.test_set, - n_way=5, - n_shot=5, - n_query=20, - n_tasks=self.n_task_test, - transform=self.transform) + def test_dataloader(self): + test_loader = few_shot_dataloader( + self.root_dir_test, + self.test_set, + n_way=5, + n_shot=5, + n_query=20, + n_tasks=self.n_task_test, + transform=self.transform, + ) return test_loader - - diff --git a/prototypicalbeats/prototraining.py b/prototypicalbeats/prototraining.py index 4acf639..a09127e 100644 --- a/prototypicalbeats/prototraining.py +++ b/prototypicalbeats/prototraining.py @@ -16,6 +16,7 @@ from Models.baseline import ProtoNet from Models.pann import Cnn14 + class ProtoBEATsModel(pl.LightningModule): def __init__( self, @@ -24,12 +25,12 @@ def __init__( lr: float = 1e-5, lr_scheduler_gamma: float = 1e-1, num_workers: int = 6, - model_type: str = "baseline", + model_type: str = "baseline", model_path: str = None, - distance: str = "euclidean", - specaugment_params = None, + distance: str = "euclidean", + specaugment_params=None, state: str = None, - beats_path: str = "/data/models/BEATs/BEATs_iter3_plus_AS2M.pt", + beats_path: str = "/data/BEATs/BEATs_iter3_plus_AS2M.pt", **kwargs, ) -> None: """TransferLearningModel. @@ -45,19 +46,18 @@ def __init__( self.distance = distance self.model_type = model_type self.state = state - self.specaugment_params = specaugment_params + self.specaugment_params = specaugment_params self.beats_path = beats_path - if model_path != "None": + if model_path != "None": self.checkpoint = torch.load(model_path) if self.state == "validate": - self.adjusted_state_dict= OrderedDict() + self.adjusted_state_dict = OrderedDict() for k, v in self.checkpoint["state_dict"].items(): # Check if the key starts with 'module.' 
and remove it only then - name = k[6:] if k.startswith('model.') else k + name = k[6:] if k.startswith("model.") else k self.adjusted_state_dict[name] = v - - + self._build_model() self.save_hyperparameters() @@ -76,7 +76,7 @@ def _build_model(self): print("LOADING THE FINE-TUNED MODEL") self.model.load_state_dict(self.adjusted_state_dict, strict=True) - #else: + # else: # print("NOT LOADING ANY FINE-TUNED MODEL") # self.model = self.model @@ -106,22 +106,24 @@ def _build_model(self): if self.state == "train": layers_to_remove = [ - "spectrogram_extractor.stft.conv_real.weight", - "spectrogram_extractor.stft.conv_imag.weight", + "spectrogram_extractor.stft.conv_real.weight", + "spectrogram_extractor.stft.conv_imag.weight", "logmel_extractor.melW", - "fc_audioset.weight", "fc_audioset.bias"] + "fc_audioset.weight", + "fc_audioset.bias", + ] for key in layers_to_remove: del self.checkpoint["model"][key] print("LOADING AUDIOSET PRE-TRAINED MODEL") self.model.load_state_dict(self.checkpoint["model"]) - if self.state == "validate": + if self.state == "validate": print("LOADING THE FINE-TUNED MODEL") - self.model.load_state_dict(self.adjusted_state_dict, strict=True) + self.model.load_state_dict(self.adjusted_state_dict, strict=True) def euclidean_distance(self, x1, x2): return torch.sqrt(torch.sum((x1 - x2) ** 2, dim=1)) - + def mahalanobis_distance(self, query, z_support, support_labels, n_way, eps=1e-3): z_proto = self.get_prototypes(z_support, support_labels, n_way) @@ -131,44 +133,53 @@ def mahalanobis_distance(self, query, z_support, support_labels, n_way, eps=1e-3 covs = [] for label in range(n_way): z_support_class = z_support_copy[support_labels == label] - cov = torch.matmul(z_support_class[:, :, 1].transpose(0, 1), z_support_class[:, :, 1]) / (z_support_class.shape[0] - 1) + cov = torch.matmul( + z_support_class[:, :, 1].transpose(0, 1), z_support_class[:, :, 1] + ) / (z_support_class.shape[0] - 1) cov_reg = cov + torch.eye(cov.shape[1]).unsqueeze(0).to("cuda") * eps cov_inv = torch.pinverse(cov_reg) covs.append(cov_inv) - covs_inv = torch.stack(covs).to("cuda") # Shape: [n_way, embedding2, embedding2] + covs_inv = torch.stack(covs).to( + "cuda" + ) # Shape: [n_way, embedding2, embedding2] delta = query_copy[:, :, 1] - z_proto[:, :, 1] # Shape: [1, embedding2] delta_t = delta.unsqueeze(1) # Shape: [1, 1, embedding2] - d_squared = torch.matmul(torch.matmul(delta_t, covs_inv), delta_t.transpose(1, 2)) # Shape: [1, 1, 1] + d_squared = torch.matmul( + torch.matmul(delta_t, covs_inv), delta_t.transpose(1, 2) + ) # Shape: [1, 1, 1] d = torch.sqrt(d_squared.squeeze()) # Shape: [1] return d.squeeze() - + def get_prototypes(self, z_support, support_labels, n_way): - z_proto = torch.cat([ + z_proto = torch.cat( + [ z_support[torch.nonzero(support_labels == label)].mean(0) for label in range(n_way) - ]) + ] + ) return z_proto - + def get_embeddings(self, input, padding_mask): """Return the embeddings and the padding mask""" return self.model.extract_features(input, padding_mask) - def forward(self, - support_images: torch.Tensor, - support_labels: torch.Tensor, - query_images: torch.Tensor, - padding_mask=None): - + def forward( + self, + support_images: torch.Tensor, + support_labels: torch.Tensor, + query_images: torch.Tensor, + padding_mask=None, + ): # Extract the features of support and query images if self.model_type == "beats": z_support, _ = self.get_embeddings(support_images, padding_mask) z_query, _ = self.get_embeddings(query_images, padding_mask) - else: + else: z_support = 
self.get_embeddings(support_images, padding_mask) z_query = self.get_embeddings(query_images, padding_mask) @@ -187,14 +198,18 @@ def forward(self, dists.append(q_dists) elif self.distance == "mahalanobis": for q in z_query: - q_dists = self.mahalanobis_distance(q.unsqueeze(0), z_support, support_labels, n_way) + q_dists = self.mahalanobis_distance( + q.unsqueeze(0), z_support, support_labels, n_way + ) dists.append(q_dists) else: - print("The distance provided is not implemented. Distance can be either euclidean or mahalanobis") - + print( + "The distance provided is not implemented. Distance can be either euclidean or mahalanobis" + ) + dists = torch.stack(dists, dim=0) - - # We drop the last dimension without changing the gradients + + # We drop the last dimension without changing the gradients if self.model_type == "beats": dists = dists.mean(dim=2).squeeze() @@ -214,13 +229,17 @@ def training_step(self, batch, batch_idx): ) # 2. Compute loss - train_loss = self.loss(classification_scores.requires_grad_(True), query_labels) + train_loss = self.loss(classification_scores.requires_grad_(True), query_labels) self.log("train_loss", train_loss, prog_bar=True) # 3. Compute accuracy: predicted_labels = torch.max(classification_scores, 1)[1] - self.log("train_acc", self.train_acc(predicted_labels, query_labels), prog_bar=True) - self.log("train_f1", self.train_f1(predicted_labels, query_labels), prog_bar=True) + self.log( + "train_acc", self.train_acc(predicted_labels, query_labels), prog_bar=True + ) + self.log( + "train_f1", self.train_f1(predicted_labels, query_labels), prog_bar=True + ) return train_loss @@ -232,16 +251,21 @@ def validation_step(self, batch, batch_idx): ) # 2. Compute loss - self.log("val_loss", self.loss(classification_scores, query_labels), prog_bar=True) + self.log( + "val_loss", self.loss(classification_scores, query_labels), prog_bar=True + ) # 3. Compute accuracy: predicted_labels = torch.max(classification_scores, 1)[1] - self.log("val_acc", self.valid_acc(predicted_labels, query_labels), prog_bar=True) - self.log("valid_f1", self.valid_f1(predicted_labels, query_labels), prog_bar=True) + self.log( + "val_acc", self.valid_acc(predicted_labels, query_labels), prog_bar=True + ) + self.log( + "valid_f1", self.valid_f1(predicted_labels, query_labels), prog_bar=True + ) def configure_optimizers(self): optimizer = optim.AdamW( - self.model.parameters(), - lr=self.lr, betas=(0.9, 0.98), weight_decay=0.01 + self.model.parameters(), lr=self.lr, betas=(0.9, 0.98), weight_decay=0.01 ) return optimizer
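For readers who want the core idea behind ProtoBEATsModel.get_prototypes and the euclidean branch of forward() in isolation, here is a minimal, self-contained sketch (not code from this repository): prototypes are the per-class means of the support embeddings, and each query is scored by its negative euclidean distance to every prototype. The helper name prototypical_scores, the 5-way/5-shot setup and the 768-dimensional embeddings are illustrative assumptions only.

import torch


def prototypical_scores(z_support, support_labels, z_query, n_way):
    # Prototype = mean embedding of the support examples of each class.
    prototypes = torch.stack(
        [z_support[support_labels == label].mean(dim=0) for label in range(n_way)]
    )  # shape: [n_way, dim]
    # Score each query by its negative euclidean distance to every prototype;
    # the highest score corresponds to the closest prototype.
    return -torch.cdist(z_query, prototypes)  # shape: [n_query, n_way]


if __name__ == "__main__":
    torch.manual_seed(0)
    z_support = torch.randn(25, 768)  # 5-way, 5-shot support embeddings (illustrative)
    support_labels = torch.arange(5).repeat_interleave(5)
    z_query = torch.randn(10, 768)  # query embeddings (illustrative)
    scores = prototypical_scores(z_support, support_labels, z_query, n_way=5)
    print(scores.argmax(dim=1))  # predicted class index per query

The mahalanobis branch above swaps the plain distance for a per-class covariance-weighted one, but the prototype construction is the same.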
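Similarly, evaluate_bootstrapped() in the evaluation script above follows a standard percentile-bootstrap recipe: drop a random 5% of the predictions, recompute the overall score, repeat, and take the 2.5th and 97.5th percentiles of the resampled scores as a 95% confidence interval. Below is a minimal sketch of that recipe only, with a generic score_fn stand-in rather than the challenge's per-subset harmonic-mean F-measure; all names here are illustrative assumptions.

import numpy as np


def bootstrap_interval(values, score_fn, n_bootstraps=1000, drop_fraction=0.05, seed=0):
    # Repeatedly drop a random fraction of the items and rescore what is left.
    rng = np.random.default_rng(seed)
    values = np.asarray(values)
    scores = []
    for _ in range(n_bootstraps):
        drop = rng.choice(len(values), round(drop_fraction * len(values)), replace=False)
        scores.append(score_fn(np.delete(values, drop)))
    # The 2.5th and 97.5th percentiles bracket a 95% interval around the mean score.
    return np.percentile(scores, 2.5), float(np.mean(scores)), np.percentile(scores, 97.5)


if __name__ == "__main__":
    rng = np.random.default_rng(1)
    per_prediction_scores = rng.random(200)  # stand-in for per-prediction outcomes
    low, mean, high = bootstrap_interval(per_prediction_scores, np.mean)
    print(f"{low:.3f} < {mean:.3f} < {high:.3f}")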